From ff820214670f241153b76338ab301120b3564016 Mon Sep 17 00:00:00 2001 From: yoge Date: Thu, 12 Mar 2026 22:30:27 +0800 Subject: [PATCH] optimize: formula is recognize text --- app/services/layout_detector.py | 2 +- app/services/ocr_service.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/app/services/layout_detector.py b/app/services/layout_detector.py index 10a366a..c34a4f4 100644 --- a/app/services/layout_detector.py +++ b/app/services/layout_detector.py @@ -148,7 +148,7 @@ class LayoutDetector: ) ) - mixed_recognition = any(region.type == "text" and region.score > 0.3 for region in regions) + mixed_recognition = any(region.type == "text" and region.score > 0.85 for region in regions) return LayoutInfo(regions=regions, MixedRecognition=mixed_recognition) diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 0124ba0..7b063bf 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -781,11 +781,11 @@ class MineruOCRService(OCRServiceBase): # Task-specific prompts (from GLM-OCR SDK config.yaml) _TASK_PROMPTS: dict[str, str] = { - "text": "Text Recognition. If the content is a formula, please ouput latex code, else output text", + "text": "Text Recognition. If the content is a formula, please ouput display latex code, else output text", "formula": "Formula Recognition:", "table": "Table Recognition:", } -_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please ouput latex code, else output text" +_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please ouput display latex code, else output text" class GLMOCREndToEndService(OCRServiceBase): @@ -874,7 +874,7 @@ class GLMOCREndToEndService(OCRServiceBase): layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0])) # 3. OCR: per-region (parallel) or full-image fallback - if not layout_info.regions: + if not layout_info.regions or (len(layout_info.regions) == 1 and not layout_info.MixedRecognition): # No layout detected → assume it's a formula, use formula recognition logger.info("No layout regions detected, treating image as formula") raw_content = self._call_vllm(image, _TASK_PROMPTS["formula"])