"""Test for removing false heading markers from single-formula content. OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix. This test verifies that the heading marker is correctly removed. """ import re def _remove_false_heading_from_single_formula(markdown_content: str) -> str: """Remove false heading markers from single-formula content.""" if not markdown_content or not markdown_content.strip(): return markdown_content lines = markdown_content.split('\n') # Count formulas and heading lines formula_count = 0 heading_lines = [] has_non_formula_text = False for i, line in enumerate(lines): line_stripped = line.strip() if not line_stripped: continue # Check if line starts with heading marker heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped) if heading_match: heading_level = heading_match.group(1) content = heading_match.group(2) # Check if the heading content is a formula if re.fullmatch(r'\$\$?.+\$\$?', content): # This is a heading with a formula heading_lines.append((i, heading_level, content)) formula_count += 1 else: # This is a real heading with text has_non_formula_text = True elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped): # Standalone formula line (not in a heading) formula_count += 1 elif line_stripped and not re.match(r'^#+\s*$', line_stripped): # Non-empty, non-heading, non-formula line has_non_formula_text = True # Only remove heading markers if: # 1. There's exactly one formula # 2. That formula is in a heading line # 3. There's no other text content if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text: # Remove the heading marker from the formula line_idx, heading_level, formula_content = heading_lines[0] lines[line_idx] = formula_content return '\n'.join(lines) # Test cases test_cases = [ # Should remove heading marker (single formula with heading) ( "# $$E = mc^2$$", "$$E = mc^2$$", "Single display formula with heading" ), ( "# $x = y$", "$x = y$", "Single inline formula with heading" ), ( "## $$\\frac{a}{b}$$", "$$\\frac{a}{b}$$", "Single formula with level-2 heading" ), ( "### $$\\lambda_{1}$$", "$$\\lambda_{1}$$", "Single formula with level-3 heading" ), # Should NOT remove heading marker (has text content) ( "# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "Heading with text + formula (keep heading)" ), ( "# Title\nSome text\n$$E = mc^2$$", "# Title\nSome text\n$$E = mc^2$$", "Heading + text + formula (keep heading)" ), ( "$$E = mc^2$$\n# Summary", "$$E = mc^2$$\n# Summary", "Formula + heading with text (keep heading)" ), # Should NOT remove heading marker (multiple formulas) ( "# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "Multiple formulas (keep heading)" ), ( "$$x = y$$\n# $$a = b$$", "$$x = y$$\n# $$a = b$$", "Two formulas, one with heading (keep heading)" ), # Should NOT remove heading marker (standalone formula without heading) ( "$$E = mc^2$$", "$$E = mc^2$$", "Single formula without heading (no change)" ), ( "$x = y$", "$x = y$", "Single inline formula without heading (no change)" ), # Edge cases ( "", "", "Empty string" ), ( "# ", "# ", "Empty heading" ), ( "#", "#", "Just hash symbol" ), ( "# $$E = mc^2$$\n\n", "$$E = mc^2$$\n\n", "Formula with heading and trailing newlines" ), ( "\n\n# $$E = mc^2$$", "\n\n$$E = mc^2$$", "Formula with heading and leading newlines" ), # Complex formulas ( "# $$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$", "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$", "Complex integral formula with heading" ), ( "# $$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$", "$$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$", "Matrix formula with heading" ), ] print("=" * 80) print("Remove False Heading from Single Formula - Test") print("=" * 80) passed = 0 failed = 0 for i, (input_text, expected, description) in enumerate(test_cases, 1): result = _remove_false_heading_from_single_formula(input_text) if result == expected: status = "✅ PASS" passed += 1 else: status = "❌ FAIL" failed += 1 print(f"\n{status} Test {i}: {description}") print(f" Input: {repr(input_text)}") print(f" Expected: {repr(expected)}") print(f" Got: {repr(result)}") if result != expected: print(f" >>> MISMATCH!") print("\n" + "=" * 80) print("SUMMARY") print("=" * 80) print(f"Total tests: {len(test_cases)}") print(f"✅ Passed: {passed}") print(f"❌ Failed: {failed}") if failed == 0: print("\n✅ All tests passed!") else: print(f"\n⚠️ {failed} test(s) failed") print("\n" + "=" * 80) print("KEY SCENARIOS") print("=" * 80) key_scenarios = [ ("# $$E = mc^2$$", "$$E = mc^2$$", "✅ Remove heading"), ("# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "❌ Keep heading (has text)"), ("# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "❌ Keep heading (multiple formulas)"), ("$$E = mc^2$$", "$$E = mc^2$$", "→ No change (no heading)"), ] print("\nBehavior Summary:") for input_text, expected, explanation in key_scenarios: result = _remove_false_heading_from_single_formula(input_text) match = "✓" if result == expected else "✗" print(f" {match} {explanation}") print(f" {repr(input_text)} → {repr(result)}") print("\n" + "=" * 80) print("DECISION LOGIC") print("=" * 80) print(""" Remove heading marker ONLY when ALL conditions are met: 1. ✅ Exactly ONE formula in the entire content 2. ✅ That formula is on a line starting with '#' (heading marker) 3. ✅ No other text content exists (only formula and empty lines) Otherwise: Keep the heading marker as-is. """) print("=" * 80)