From 5504bbbf1ed24072c8a1ffa93c753458b0481552 Mon Sep 17 00:00:00 2001 From: liuyuanchuang Date: Sat, 7 Feb 2026 21:38:41 +0800 Subject: [PATCH] fix:glm max tokens --- app/api/v1/endpoints/image.py | 36 ++++++++++++++---- app/services/ocr_service.py | 71 ++++++++++++++++++----------------- 2 files changed, 65 insertions(+), 42 deletions(-) diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py index 0074cf2..b992009 100644 --- a/app/api/v1/endpoints/image.py +++ b/app/api/v1/endpoints/image.py @@ -62,20 +62,20 @@ async def process_image_ocr( try: log.info("Starting image OCR processing") - # Preprocess image + # Preprocess image (load only, no padding yet) preprocess_start = time.time() image = image_processor.preprocess( image_url=request.image_url, image_base64=request.image_base64, ) - # Apply padding if enabled (before layout detection) + # Apply padding only for layout detection processed_image = image if image_processor and settings.is_padding: processed_image = image_processor.add_padding(image) preprocess_time = time.time() - preprocess_start - log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s") + log.debug(f"Image loading completed in {preprocess_time:.3f}s") # Layout detection (using padded image if padding is enabled) layout_start = time.time() @@ -83,14 +83,14 @@ async def process_image_ocr( layout_time = time.time() - layout_start log.info(f"Layout detection completed in {layout_time:.3f}s") - # OCR recognition + # OCR recognition (use original image without padding) ocr_start = time.time() if layout_info.MixedRecognition: recognition_method = "MixedRecognition (MinerU)" log.info(f"Using {recognition_method}") - # Convert numpy array to image bytes (image already padded) - success, encoded_image = cv2.imencode(".png", processed_image) + # Convert original image (without padding) to bytes + success, encoded_image = cv2.imencode(".png", image) if not success: raise RuntimeError("Failed to encode image") @@ -100,7 
+100,29 @@ async def process_image_ocr( else: recognition_method = "FormulaOnly (GLMOCR)" log.info(f"Using {recognition_method}") - ocr_result = glmocr_service.recognize(processed_image) + + # Try GLM-OCR first, fall back to MinerU if token limit exceeded + try: + ocr_result = glmocr_service.recognize(image) + except Exception as e: + error_msg = str(e) + # Check if error is due to token limit (max_model_len exceeded) + if "max_model_len" in error_msg or "decoder prompt" in error_msg or "BadRequestError" in error_msg: + log.warning(f"GLM-OCR failed due to token limit: {error_msg}") + log.info("Falling back to MinerU for recognition") + recognition_method = "FormulaOnly (MinerU fallback)" + + # Convert original image to bytes for MinerU + success, encoded_image = cv2.imencode(".png", image) + if not success: + raise RuntimeError("Failed to encode image") + + image_bytes = BytesIO(encoded_image.tobytes()) + image_bytes.seek(0) + ocr_result = mineru_service.recognize(image_bytes) + else: + # Re-raise other errors + raise ocr_time = time.time() - ocr_start total_time = time.time() - preprocess_start diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 18f5b85..8b52015 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -532,50 +532,51 @@ class GLMOCRService(OCRServiceBase): Returns: Dict with 'latex', 'markdown', 'mathml', 'mml' keys. + + Raises: + RuntimeError: If the image cannot be encoded (API client errors propagate unchanged for fallback handling).
""" - try: - # Add padding to image - padded_image = self.image_processor.add_padding(image) + # Add padding to image + padded_image = self.image_processor.add_padding(image) - # Encode image to base64 - success, encoded_image = cv2.imencode(".png", padded_image) - if not success: - raise RuntimeError("Failed to encode image") + # Encode image to base64 + success, encoded_image = cv2.imencode(".png", padded_image) + if not success: + raise RuntimeError("Failed to encode image") - image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8") - image_url = f"data:image/png;base64,{image_base64}" + image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8") + image_url = f"data:image/png;base64,{image_base64}" - # Call OpenAI-compatible API with formula recognition prompt - prompt = "Formula Recognition:" - messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}] + # Call OpenAI-compatible API with formula recognition prompt + prompt = "Formula Recognition:" + messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}] - response = self.openai_client.chat.completions.create( - model="glm-ocr", - messages=messages, - temperature=0.0, - ) + # Don't catch exceptions here - let them propagate for fallback handling + response = self.openai_client.chat.completions.create( + model="glm-ocr", + messages=messages, + temperature=0.0, + ) - markdown_content = response.choices[0].message.content + markdown_content = response.choices[0].message.content - # Process LaTeX delimiters - if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("): - markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$") - markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$") - elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"): - 
markdown_content = f"$${markdown_content}$$" + # Process LaTeX delimiters + if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("): + markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$") + markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$") + elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"): + markdown_content = f"$${markdown_content}$$" - # Apply postprocessing - markdown_content = _postprocess_markdown(markdown_content) - convert_result = self.converter.convert_to_formats(markdown_content) + # Apply postprocessing + markdown_content = _postprocess_markdown(markdown_content) + convert_result = self.converter.convert_to_formats(markdown_content) - return { - "latex": convert_result.latex, - "mathml": convert_result.mathml, - "mml": convert_result.mml, - "markdown": markdown_content, - } - except Exception as e: - raise RuntimeError(f"GLM formula recognition failed: {e}") from e + return { + "latex": convert_result.latex, + "mathml": convert_result.mathml, + "mml": convert_result.mml, + "markdown": markdown_content, + } def recognize(self, image: np.ndarray) -> dict: """Recognize content using GLM-4V.