feat: remove false heading marker from single-formula output

This commit is contained in:
liuyuanchuang
2026-02-05 17:59:54 +08:00
parent cee93ab616
commit 83e9bf0fb1
6 changed files with 1192 additions and 155 deletions

View File

@@ -272,7 +272,87 @@ def _postprocess_markdown(markdown_content: str) -> str:
return f"${_postprocess_math(seg[1:-1])}$"
return seg
return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
# Apply markdown-level postprocessing (after LaTeX processing)
markdown_content = _remove_false_heading_from_single_formula(markdown_content)
return markdown_content
def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
"""Remove false heading markers from single-formula content.
OCR sometimes incorrectly identifies a single formula as a heading by adding '#' prefix.
This function detects and removes the heading marker when:
1. The content contains only one formula (display or inline)
2. The formula line starts with '#' (heading marker)
3. No other non-formula text content exists
Examples:
Input: "# $$E = mc^2$$"
Output: "$$E = mc^2$$"
Input: "# $x = y$"
Output: "$x = y$"
Input: "# Introduction\n$$E = mc^2$$" (has text, keep heading)
Output: "# Introduction\n$$E = mc^2$$"
Args:
markdown_content: Markdown text with potential false headings.
Returns:
Markdown text with false heading markers removed.
"""
if not markdown_content or not markdown_content.strip():
return markdown_content
lines = markdown_content.split('\n')
# Count formulas and heading lines
formula_count = 0
heading_lines = []
has_non_formula_text = False
for i, line in enumerate(lines):
line_stripped = line.strip()
if not line_stripped:
continue
# Check if line starts with heading marker
heading_match = re.match(r'^(#{1,6})\s+(.+)$', line_stripped)
if heading_match:
heading_level = heading_match.group(1)
content = heading_match.group(2)
# Check if the heading content is a formula
if re.fullmatch(r'\$\$?.+\$\$?', content):
# This is a heading with a formula
heading_lines.append((i, heading_level, content))
formula_count += 1
else:
# This is a real heading with text
has_non_formula_text = True
elif re.fullmatch(r'\$\$?.+\$\$?', line_stripped):
# Standalone formula line (not in a heading)
formula_count += 1
elif line_stripped and not re.match(r'^#+\s*$', line_stripped):
# Non-empty, non-heading, non-formula line
has_non_formula_text = True
# Only remove heading markers if:
# 1. There's exactly one formula
# 2. That formula is in a heading line
# 3. There's no other text content
if formula_count == 1 and len(heading_lines) == 1 and not has_non_formula_text:
# Remove the heading marker from the formula
line_idx, heading_level, formula_content = heading_lines[0]
lines[line_idx] = formula_content
return '\n'.join(lines)
class OCRServiceBase(ABC):