refactor: add debug logging to formula recognition

This commit is contained in:
liuyuanchuang
2026-02-05 20:50:04 +08:00
parent 15986c8966
commit c93eba2839
4 changed files with 74 additions and 7 deletions

View File

@@ -547,27 +547,35 @@ class MineruOCRService(OCRServiceBase):
Returns:
Markdown content with formulas recognized by PaddleOCR-VL.
"""
# Pattern to match image references: ![](images/xxx.png)
# Pattern to match image references: ![](images/xxx.png) or ![](images/xxx.jpg)
image_pattern = re.compile(r"!\[\]\(images/[^)]+\)")
if not image_pattern.search(markdown_content):
return markdown_content
print(f"[DEBUG] Found image reference in markdown, triggering PaddleOCR-VL recognition")
try:
# For now, use the entire image for formula recognition
# TODO: Extract specific regions if image paths contain coordinates
formula_text = self._recognize_formula_with_paddleocr_vl(original_image)
print(f"[DEBUG] PaddleOCR-VL recognized formula: {formula_text[:100] if formula_text else 'Empty'}...")
# Replace image references with recognized formulas
# Wrap in display math delimiters if not already wrapped
if not formula_text.startswith("$$"):
if formula_text and not formula_text.startswith("$$"):
formula_text = f"$${formula_text}$$"
markdown_content = image_pattern.sub(formula_text, markdown_content)
print(f"[DEBUG] Formula recognition successful, updated markdown")
except Exception as e:
# If formula recognition fails, keep original content
print(f"Warning: Formula recognition failed: {e}")
# If formula recognition fails, keep original content and log error
import traceback
print(f"[ERROR] Formula recognition failed: {e}")
print(f"[ERROR] Traceback: {traceback.format_exc()}")
return markdown_content
@@ -622,10 +630,15 @@ class MineruOCRService(OCRServiceBase):
if "results" in result and "image" in result["results"]:
markdown_content = result["results"]["image"].get("md_content", "")
print(f"[DEBUG] Markdown content from Mineru: {markdown_content[:200]}...")
# Check if markdown contains formula image references
if "![](images/" in markdown_content:
print(f"[DEBUG] Detected image reference, calling PaddleOCR-VL...")
# Use PaddleOCR-VL to recognize the formula
markdown_content = self._extract_and_recognize_formulas(markdown_content, image)
else:
print(f"[DEBUG] No image reference found in markdown")
# Apply postprocessing to fix OCR errors
markdown_content = _postprocess_markdown(markdown_content)