refactor: add debug logging
This commit is contained in:
@@ -547,27 +547,35 @@ class MineruOCRService(OCRServiceBase):
|
||||
Returns:
|
||||
Markdown content with formulas recognized by PaddleOCR-VL.
|
||||
"""
|
||||
# Pattern to match image references: ![](images/xxx.png)
|
||||
# Pattern to match image references: ![](images/xxx.png) or ![alt](images/xxx.png)
|
||||
image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")
|
||||
|
||||
if not image_pattern.search(markdown_content):
|
||||
return markdown_content
|
||||
|
||||
print(f"[DEBUG] Found image reference in markdown, triggering PaddleOCR-VL recognition")
|
||||
|
||||
try:
|
||||
# For now, use the entire image for formula recognition
|
||||
# TODO: Extract specific regions if image paths contain coordinates
|
||||
formula_text = self._recognize_formula_with_paddleocr_vl(original_image)
|
||||
|
||||
print(f"[DEBUG] PaddleOCR-VL recognized formula: {formula_text[:100] if formula_text else 'Empty'}...")
|
||||
|
||||
# Replace image references with recognized formulas
|
||||
# Wrap in display math delimiters if not already wrapped
|
||||
if not formula_text.startswith("$$"):
|
||||
if formula_text and not formula_text.startswith("$$"):
|
||||
formula_text = f"$${formula_text}$$"
|
||||
|
||||
markdown_content = image_pattern.sub(formula_text, markdown_content)
|
||||
print(f"[DEBUG] Formula recognition successful, updated markdown")
|
||||
|
||||
except Exception as e:
|
||||
# If formula recognition fails, keep original content
|
||||
print(f"Warning: Formula recognition failed: {e}")
|
||||
# If formula recognition fails, keep original content and log error
|
||||
import traceback
|
||||
|
||||
print(f"[ERROR] Formula recognition failed: {e}")
|
||||
print(f"[ERROR] Traceback: {traceback.format_exc()}")
|
||||
|
||||
return markdown_content
|
||||
|
||||
@@ -622,10 +630,15 @@ class MineruOCRService(OCRServiceBase):
|
||||
if "results" in result and "image" in result["results"]:
|
||||
markdown_content = result["results"]["image"].get("md_content", "")
|
||||
|
||||
print(f"[DEBUG] Markdown content from Mineru: {markdown_content[:200]}...")
|
||||
|
||||
# Check if markdown contains formula image references
|
||||
if "
|
||||
# Use PaddleOCR-VL to recognize the formula
|
||||
markdown_content = self._extract_and_recognize_formulas(markdown_content, image)
|
||||
else:
|
||||
print(f"[DEBUG] No image reference found in markdown")
|
||||
|
||||
# Apply postprocessing to fix OCR errors
|
||||
markdown_content = _postprocess_markdown(markdown_content)
|
||||
|
||||
Reference in New Issue
Block a user