feat: add rm fake title
This commit is contained in:
@@ -272,7 +272,87 @@ def _postprocess_markdown(markdown_content: str) -> str:
|
||||
return f"${_postprocess_math(seg[1:-1])}$"
|
||||
return seg
|
||||
|
||||
return _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
|
||||
markdown_content = _MATH_SEGMENT_PATTERN.sub(_fix_segment, markdown_content)
|
||||
|
||||
# Apply markdown-level postprocessing (after LaTeX processing)
|
||||
markdown_content = _remove_false_heading_from_single_formula(markdown_content)
|
||||
|
||||
return markdown_content
|
||||
|
||||
|
||||
# A formula is one `$`/`$$`-delimited span whose interior holds no unescaped
# `$`.  Forbidding an inner `$` is what keeps "$a$ and $b$" (two formulas
# joined by prose) from being mistaken for a single formula.  Opening and
# closing delimiters are deliberately not required to be balanced.
_SINGLE_FORMULA_PATTERN = re.compile(r'\$\$?(?:\\.|[^$\\])+\$\$?')
# ATX heading with content: 1-6 '#', whitespace, then the heading text.
_HEADING_LINE_PATTERN = re.compile(r'(#{1,6})\s+(.+)')
# A line made only of '#' characters (an empty heading); carries no text.
_BARE_HEADING_MARKER_PATTERN = re.compile(r'#+\s*')


def _remove_false_heading_from_single_formula(markdown_content: str) -> str:
    """Remove a false heading marker from single-formula content.

    OCR sometimes misreads a lone formula as a heading and prefixes it with
    '#'. The marker is stripped only when all of the following hold:

    1. The content contains exactly one formula (display or inline).
    2. That formula is the entire content of a heading line.
    3. No other text exists (blank lines and bare '#' lines are ignored).

    A heading such as "# $a$ and $b$" mixes formulas with prose, so it is
    treated as a real heading and left untouched.

    Examples:
        Input:  "# $$E = mc^2$$"
        Output: "$$E = mc^2$$"

        Input:  "# $x = y$"
        Output: "$x = y$"

        Input:  "# Introduction\n$$E = mc^2$$"  (has text, keep heading)
        Output: "# Introduction\n$$E = mc^2$$"

        Input:  "# $a$ and $b$"  (formulas joined by text, keep heading)
        Output: "# $a$ and $b$"

    Args:
        markdown_content: Markdown text with potential false headings.

    Returns:
        The Markdown text with the false heading marker removed, or the
        input unchanged when the conditions above are not met. When the
        marker is removed, the line is replaced by the bare formula, so any
        surrounding whitespace on that line is dropped as well.
    """
    if not markdown_content or not markdown_content.strip():
        return markdown_content

    lines = markdown_content.split('\n')
    formula_count = 0
    formula_headings = []  # (line index, formula text) for '#'-prefixed formulas

    for index, line in enumerate(lines):
        stripped = line.strip()
        if not stripped:
            continue

        heading = _HEADING_LINE_PATTERN.fullmatch(stripped)
        # For a heading, only the text after the marker can be the formula.
        candidate = heading.group(2) if heading else stripped

        if _SINGLE_FORMULA_PATTERN.fullmatch(candidate):
            formula_count += 1
            if heading:
                formula_headings.append((index, candidate))
        elif heading or not _BARE_HEADING_MARKER_PATTERN.fullmatch(stripped):
            # Real text (a textual heading or a prose line) means any heading
            # here may be genuine, so nothing is rewritten.
            return markdown_content

    if formula_count == 1 and len(formula_headings) == 1:
        index, formula = formula_headings[0]
        lines[index] = formula

    return '\n'.join(lines)
|
||||
|
||||
|
||||
class OCRServiceBase(ABC):
|
||||
|
||||
Reference in New Issue
Block a user