diff --git a/app/core/config.py b/app/core/config.py index e014cef..0f9d13a 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -50,9 +50,7 @@ class Settings(BaseSettings): max_tokens: int = 4096 # Model Paths - pp_doclayout_model_dir: str | None = ( - "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV3" - ) + pp_doclayout_model_dir: str | None = "/home/yoge/.cache/modelscope/hub/models/PaddlePaddle/PP-DocLayoutV3" # Image Processing max_image_size_mb: int = 10 diff --git a/app/services/glm_postprocess.py b/app/services/glm_postprocess.py index 80d0d7e..79beafb 100644 --- a/app/services/glm_postprocess.py +++ b/app/services/glm_postprocess.py @@ -265,7 +265,7 @@ class GLMResultFormatter: # Formula wrapping if label == "formula": content = content.strip() - for s, e in [("$$", "$$"), (r"\[", r"\]"), (r"\(", r"\)")]: + for s, e in [("$$", "$$"), (r"\[", r"\]"), (r"\(", r"\)"), ("$", "$")]: if content.startswith(s): content = content[len(s) :].strip() if content.endswith(e): diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 7b063bf..d4a678c 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -781,11 +781,11 @@ class MineruOCRService(OCRServiceBase): # Task-specific prompts (from GLM-OCR SDK config.yaml) _TASK_PROMPTS: dict[str, str] = { - "text": "Text Recognition. If the content is a formula, please ouput display latex code, else output text", + "text": "Text Recognition. If the content is a formula, please output display latex code, else output text", "formula": "Formula Recognition:", "table": "Table Recognition:", } -_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please ouput display latex code, else output text" +_DEFAULT_PROMPT = "Text Recognition. If the content is a formula, please output display latex code, else output text" class GLMOCREndToEndService(OCRServiceBase): @@ -868,7 +868,8 @@ class GLMOCREndToEndService(OCRServiceBase): """ # 1. Layout detection img_h, img_w = image.shape[:2] - layout_info = self.layout_detector.detect(image) + padded_image = self.image_processor.add_padding(image) + layout_info = self.layout_detector.detect(padded_image) # Sort regions in reading order: top-to-bottom, left-to-right layout_info.regions.sort(key=lambda r: (r.bbox[1], r.bbox[0])) @@ -890,7 +891,7 @@ class GLMOCREndToEndService(OCRServiceBase): if region.type == "figure": continue x1, y1, x2, y2 = (int(c) for c in region.bbox) - cropped = image[y1:y2, x1:x2] + cropped = padded_image[y1:y2, x1:x2] if cropped.size == 0 or cropped.shape[0] < 10 or cropped.shape[1] < 10: logger.warning( "Skipping region idx=%d (label=%s): crop too small %s",