Files
doc_processer/test_remove_false_heading.py

234 lines
6.6 KiB
Python
Raw Normal View History

2026-02-05 17:59:54 +08:00
"""Test for removing false heading markers from single-formula content.
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
This test verifies that the heading marker is correctly removed.
"""
import re
# ATX-style Markdown heading: 1-6 '#' characters, whitespace, then content.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)$')
# Exactly ONE complete formula: ``$$...$$`` or ``$...$`` with matching
# delimiters and no unescaped '$' inside the body.  The body alternatives
# (escaped char | any char except '$' and '\') are mutually exclusive, so the
# pattern cannot backtrack catastrophically.
_SINGLE_FORMULA_RE = re.compile(r'\$\$(?:\\.|[^$\\])+\$\$|\$(?:\\.|[^$\\])+\$')
# A line made only of '#' characters (an empty heading); carries no content.
_BARE_HASHES_RE = re.compile(r'^#+\s*$')


def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
    """Remove a false heading marker from single-formula content.

    The heading marker is stripped only when ALL of the following hold:

    1. The content contains exactly one formula.
    2. That formula is the entire content of a heading line (``# $$...$$``).
    3. There is no other text (blank lines and bare ``#`` lines are ignored).

    A line counts as a formula only if it is one complete ``$...$`` or
    ``$$...$$`` span.  A heading such as ``# $a$ and $b$`` mixes text with
    inline math, so it is treated as a real heading and left untouched.

    Args:
        markdown_content: Markdown text to inspect.

    Returns:
        The content with the heading marker removed from the single formula
        line, or the input unchanged when the conditions are not met.
    """
    if not markdown_content or not markdown_content.strip():
        return markdown_content

    lines = markdown_content.split('\n')
    formula_count = 0
    heading_formula = None  # (line index, formula text) of a formula heading

    for index, line in enumerate(lines):
        stripped = line.strip()
        # Blank lines and empty headings ("#", "##", ...) carry no content.
        if not stripped or _BARE_HASHES_RE.match(stripped):
            continue

        heading = _HEADING_RE.match(stripped)
        candidate = heading.group(2) if heading else stripped

        # Anything that is not exactly one formula is real text (either a
        # genuine heading or a paragraph), which rules out removal.
        if not _SINGLE_FORMULA_RE.fullmatch(candidate):
            return markdown_content

        formula_count += 1
        if formula_count > 1:
            # More than one formula: the heading may be intentional.
            return markdown_content

        if heading:
            heading_formula = (index, candidate)

    if heading_formula is None:
        # The only formula (if any) was not inside a heading.
        return markdown_content

    line_idx, formula_content = heading_formula
    lines[line_idx] = formula_content
    return '\n'.join(lines)
# Table of (input markdown, expected output, description) triples that the
# runner below feeds through _remove_false_heading_from_single_formula.
test_cases = [
    # --- Heading marker is stripped: the content is one formula in a heading ---
    ("# $$E = mc^2$$", "$$E = mc^2$$", "Single display formula with heading"),
    ("# $x = y$", "$x = y$", "Single inline formula with heading"),
    ("## $$\\frac{a}{b}$$", "$$\\frac{a}{b}$$", "Single formula with level-2 heading"),
    ("### $$\\lambda_{1}$$", "$$\\lambda_{1}$$", "Single formula with level-3 heading"),

    # --- Heading marker is kept: other text is present ---
    (
        "# Introduction\n$$E = mc^2$$",
        "# Introduction\n$$E = mc^2$$",
        "Heading with text + formula (keep heading)",
    ),
    (
        "# Title\nSome text\n$$E = mc^2$$",
        "# Title\nSome text\n$$E = mc^2$$",
        "Heading + text + formula (keep heading)",
    ),
    (
        "$$E = mc^2$$\n# Summary",
        "$$E = mc^2$$\n# Summary",
        "Formula + heading with text (keep heading)",
    ),

    # --- Heading marker is kept: more than one formula ---
    (
        "# $$x = y$$\n$$a = b$$",
        "# $$x = y$$\n$$a = b$$",
        "Multiple formulas (keep heading)",
    ),
    (
        "$$x = y$$\n# $$a = b$$",
        "$$x = y$$\n# $$a = b$$",
        "Two formulas, one with heading (keep heading)",
    ),

    # --- Nothing to strip: the formula is not inside a heading ---
    ("$$E = mc^2$$", "$$E = mc^2$$", "Single formula without heading (no change)"),
    ("$x = y$", "$x = y$", "Single inline formula without heading (no change)"),

    # --- Edge cases ---
    ("", "", "Empty string"),
    ("# ", "# ", "Empty heading"),
    ("#", "#", "Just hash symbol"),
    (
        "# $$E = mc^2$$\n\n",
        "$$E = mc^2$$\n\n",
        "Formula with heading and trailing newlines",
    ),
    (
        "\n\n# $$E = mc^2$$",
        "\n\n$$E = mc^2$$",
        "Formula with heading and leading newlines",
    ),

    # --- Longer LaTeX bodies ---
    (
        "# $$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
        "$$\\int_{0}^{\\infty} e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}$$",
        "Complex integral formula with heading",
    ),
    (
        "# $$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
        "$$\\begin{pmatrix} a & b \\\\ c & d \\end{pmatrix}$$",
        "Matrix formula with heading",
    ),
]
# Run every entry of test_cases through the function, print a per-case
# report, then print pass/fail totals.
banner = "=" * 80
print(banner)
print("Remove False Heading from Single Formula - Test")
print(banner)

passed = 0
failed = 0
for case_number, (input_text, expected, description) in enumerate(test_cases, start=1):
    result = _remove_false_heading_from_single_formula(input_text)
    is_match = result == expected
    if is_match:
        passed += 1
    else:
        failed += 1
    status = "✅ PASS" if is_match else "❌ FAIL"

    print(f"\n{status} Test {case_number}: {description}")
    print(f" Input: {input_text!r}")
    print(f" Expected: {expected!r}")
    print(f" Got: {result!r}")
    if not is_match:
        print(" >>> MISMATCH!")

print("\n" + banner)
print("SUMMARY")
print(banner)
print(f"Total tests: {len(test_cases)}")
print(f"✅ Passed: {passed}")
print(f"❌ Failed: {failed}")
if failed:
    print(f"\n⚠️ {failed} test(s) failed")
else:
    print("\n✅ All tests passed!")
# Condensed demonstration of the four behaviors that matter most, followed by
# a plain-text statement of the decision rule.
print("\n" + "=" * 80)
print("KEY SCENARIOS")
print("=" * 80)
key_scenarios = [
    ("# $$E = mc^2$$", "$$E = mc^2$$", "✅ Remove heading"),
    ("# Introduction\n$$E = mc^2$$", "# Introduction\n$$E = mc^2$$", "❌ Keep heading (has text)"),
    ("# $$x = y$$\n$$a = b$$", "# $$x = y$$\n$$a = b$$", "❌ Keep heading (multiple formulas)"),
    ("$$E = mc^2$$", "$$E = mc^2$$", "→ No change (no heading)"),
]
print("\nBehavior Summary:")
for input_text, expected, explanation in key_scenarios:
    result = _remove_false_heading_from_single_formula(input_text)
    # Both branches used to be empty strings, so the marker never showed
    # whether the scenario actually behaved as expected.
    match = "✓" if result == expected else "✗"
    print(f" {match} {explanation}")
    # Separate input and output; they were previously printed back-to-back.
    print(f" {input_text!r} → {result!r}")
print("\n" + "=" * 80)
print("DECISION LOGIC")
print("=" * 80)
print("""
Remove heading marker ONLY when ALL conditions are met:
1. Exactly ONE formula in the entire content
2. That formula is on a line starting with '#' (heading marker)
3. No other text content exists (only formula and empty lines)
Otherwise: Keep the heading marker as-is.
""")
print("=" * 80)