feat: add rm fake title
This commit is contained in:
233
test_remove_false_heading.py
Normal file
233
test_remove_false_heading.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""Test for removing false heading markers from single-formula content.
|
||||
|
||||
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
|
||||
This test verifies that the heading marker is correctly removed.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
|
||||
def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
|
||||
"""Remove false heading markers from single-formula content."""
|
||||
if not markdown_content or not markdown_content.strip():
|
||||
return markdown_content
|
||||
|
||||
lines = markdown_content.split('\n')
|
||||
|
||||
# Count formulas and heading lines
|
||||
formula_count = 0
|
||||
heading_lines = []
|
||||
has_non_formula_text = False
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
line_stripped = line.strip()
|
||||
|
||||
if not line_stripped:
|
||||
continue
|
||||
|
||||
# Check if line starts with heading marker
|
||||
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
|
||||
|
||||
if heading_match:
|
||||
heading_level = heading_match.group(1)
|
||||
content = heading_match.group(2)
|
||||
|
||||
# Check if the heading content is a formula
|
||||
if re.fullmatch(r'\$\$?.+\$\$?', content):
|
||||
# This is a heading with a formula
|
||||
heading_lines.append((i, heading_level, content))
|
||||
formula_count += 1
|
||||
else:
|
||||
# This is a real heading with text
|
||||
has_non_formula_text = True
|
||||
elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
|
||||
# Standalone formula line (not in a heading)
|
||||
formula_count += 1
|
||||
elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
|
||||
# Non-empty, non-heading, non-formula line
|
||||
has_non_formula_text = True
|
||||
|
||||
# Only remove heading markers if:
|
||||
# 1. There's exactly one formula
|
||||
# 2. That formula is in a heading line
|
||||
# 3. There's no other text content
|
||||
if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text:
|
||||
# Remove the heading marker from the formula
|
||||
line_idx, heading_level, formula_content = heading_lines[0]
|
||||
lines[line_idx] = formula_content
|
||||
|
||||
return '\n'.join(lines)
|
||||
|
||||
|
||||
# Table of (input, expected output, description) triples driving the run below.
test_cases = [
    # --- Marker is stripped: the content is one formula inside a heading ---
    ("# $$E = mc^2$$", "$$E = mc^2$$", "Single display formula with heading"),
    ("# $x = y$", "$x = y$", "Single inline formula with heading"),
    (
        "## $$\\frac{a}{b}$$",
        "$$\\frac{a}{b}$$",
        "Single formula with level-2 heading",
    ),
    (
        "### $$\\lambda_{1}$$",
        "$$\\lambda_{1}$$",
        "Single formula with level-3 heading",
    ),

    # --- Marker is kept: real text is present ---
    (
        "# Introduction\n$$E = mc^2$$",
        "# Introduction\n$$E = mc^2$$",
        "Heading with text + formula (keep heading)",
    ),
    (
        "# Title\nSome text\n$$E = mc^2$$",
        "# Title\nSome text\n$$E = mc^2$$",
        "Heading + text + formula (keep heading)",
    ),
    (
        "$$E = mc^2$$\n# Summary",
        "$$E = mc^2$$\n# Summary",
        "Formula + heading with text (keep heading)",
    ),

    # --- Marker is kept: more than one formula ---
    (
        "# $$x = y$$\n$$a = b$$",
        "# $$x = y$$\n$$a = b$$",
        "Multiple formulas (keep heading)",
    ),
    (
        "$$x = y$$\n# $$a = b$$",
        "$$x = y$$\n# $$a = b$$",
        "Two formulas, one with heading (keep heading)",
    ),

    # --- Nothing to strip: formula without a heading marker ---
    (
        "$$E = mc^2$$",
        "$$E = mc^2$$",
        "Single formula without heading (no change)",
    ),
    (
        "$x = y$",
        "$x = y$",
        "Single inline formula without heading (no change)",
    ),

    # --- Edge cases ---
    ("", "", "Empty string"),
    ("# ", "# ", "Empty heading"),
    ("#", "#", "Just hash symbol"),
    (
        "# $$E = mc^2$$\n\n",
        "$$E = mc^2$$\n\n",
        "Formula with heading and trailing newlines",
    ),
    (
        "\n\n# $$E = mc^2$$",
        "\n\n$$E = mc^2$$",
        "Formula with heading and leading newlines",
    ),

    # --- Complex formulas ---
    (
        "# $$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
        "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
        "Complex integral formula with heading",
    ),
    (
        "# $$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
        "$$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
        "Matrix formula with heading",
    ),
]


def _print_banner(title, *, leading_blank=True):
    """Print *title* framed by two 80-column rules of '=' characters."""
    rule = "=" * 80
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


_print_banner("Remove False Heading from Single Formula - Test", leading_blank=False)

# Run every case, printing a per-case report and tallying the outcomes.
passed = 0
failed = 0

for i, (input_text, expected, description) in enumerate(test_cases, 1):
    result = _remove_false_heading_from_single_formula(input_text)
    is_match = result == expected

    if is_match:
        passed += 1
    else:
        failed += 1
    status = "✅ PASS" if is_match else "❌ FAIL"

    print(f"\n{status} Test {i}: {description}")
    print(f" Input: {input_text!r}")
    print(f" Expected: {expected!r}")
    print(f" Got: {result!r}")
    if not is_match:
        print(" >>> MISMATCH!")

_print_banner("SUMMARY")
print(f"Total tests: {len(test_cases)}")
print(f"✅ Passed: {passed}")
print(f"❌ Failed: {failed}")

if failed:
    print(f"\n⚠️ {failed} test(s) failed")
else:
    print("\n✅ All tests passed!")

_print_banner("KEY SCENARIOS")

# One representative input per decision branch, with a short explanation.
key_scenarios = [
    ("# $$E = mc^2$$", "$$E = mc^2$$", "✅ Remove heading"),
    (
        "# Introduction\n$$E = mc^2$$",
        "# Introduction\n$$E = mc^2$$",
        "❌ Keep heading (has text)",
    ),
    (
        "# $$x = y$$\n$$a = b$$",
        "# $$x = y$$\n$$a = b$$",
        "❌ Keep heading (multiple formulas)",
    ),
    ("$$E = mc^2$$", "$$E = mc^2$$", "→ No change (no heading)"),
]

print("\nBehavior Summary:")
for input_text, expected, explanation in key_scenarios:
    result = _remove_false_heading_from_single_formula(input_text)
    match = "✓" if result == expected else "✗"
    print(f" {match} {explanation}")
    print(f" {input_text!r} → {result!r}")

_print_banner("DECISION LOGIC")
print("""
Remove heading marker ONLY when ALL conditions are met:
1. ✅ Exactly ONE formula in the entire content
2. ✅ That formula is on a line starting with '#' (heading marker)
3. ✅ No other text content exists (only formula and empty lines)

Otherwise: Keep the heading marker as-is.
""")

print("=" * 80)
|
||||
Reference in New Issue
Block a user