From 5504bbbf1ed24072c8a1ffa93c753458b0481552 Mon Sep 17 00:00:00 2001
From: liuyuanchuang <yuanchuang_liu@qingsongchou.com>
Date: Sat, 7 Feb 2026 21:38:41 +0800
Subject: [PATCH] fix: fall back to MinerU when GLM-OCR exceeds max tokens

---
 app/api/v1/endpoints/image.py | 36 ++++++++++++++----
 app/services/ocr_service.py   | 71 ++++++++++++++++++-----------------
 2 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/app/api/v1/endpoints/image.py b/app/api/v1/endpoints/image.py
index 0074cf2..b992009 100644
--- a/app/api/v1/endpoints/image.py
+++ b/app/api/v1/endpoints/image.py
@@ -62,20 +62,20 @@ async def process_image_ocr(
     try:
         log.info("Starting image OCR processing")
 
-        # Preprocess image
+        # Preprocess image (load only, no padding yet)
         preprocess_start = time.time()
         image = image_processor.preprocess(
             image_url=request.image_url,
             image_base64=request.image_base64,
         )
 
-        # Apply padding if enabled (before layout detection)
+        # Apply padding only for layout detection
         processed_image = image
         if image_processor and settings.is_padding:
             processed_image = image_processor.add_padding(image)
 
         preprocess_time = time.time() - preprocess_start
-        log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s")
+        log.debug(f"Image loading completed in {preprocess_time:.3f}s")
 
         # Layout detection (using padded image if padding is enabled)
         layout_start = time.time()
@@ -83,14 +83,14 @@ async def process_image_ocr(
         layout_time = time.time() - layout_start
         log.info(f"Layout detection completed in {layout_time:.3f}s")
 
-        # OCR recognition
+        # OCR recognition (use original image without padding)
         ocr_start = time.time()
         if layout_info.MixedRecognition:
             recognition_method = "MixedRecognition (MinerU)"
             log.info(f"Using {recognition_method}")
 
-            # Convert numpy array to image bytes (image already padded)
-            success, encoded_image = cv2.imencode(".png", processed_image)
+            # Convert original image (without padding) to bytes
+            success, encoded_image = cv2.imencode(".png", image)
             if not success:
                 raise RuntimeError("Failed to encode image")
 
@@ -100,7 +100,29 @@ async def process_image_ocr(
         else:
             recognition_method = "FormulaOnly (GLMOCR)"
             log.info(f"Using {recognition_method}")
-            ocr_result = glmocr_service.recognize(processed_image)
+
+            # Try GLM-OCR first; fall back to MinerU if the token limit is exceeded
+            try:
+                ocr_result = glmocr_service.recognize(image)
+            except Exception as e:
+                error_msg = str(e)
+                # Heuristic: treat as a token-limit failure if the message mentions max_model_len, "decoder prompt" or BadRequestError
+                if "max_model_len" in error_msg or "decoder prompt" in error_msg or "BadRequestError" in error_msg:
+                    log.warning(f"GLM-OCR failed due to token limit: {error_msg}")
+                    log.info("Falling back to MinerU for recognition")
+                    recognition_method = "FormulaOnly (MinerU fallback)"
+
+                    # Convert original image to bytes for MinerU
+                    success, encoded_image = cv2.imencode(".png", image)
+                    if not success:
+                        raise RuntimeError("Failed to encode image")
+
+                    image_bytes = BytesIO(encoded_image.tobytes())
+                    image_bytes.seek(0)
+                    ocr_result = mineru_service.recognize(image_bytes)
+                else:
+                    # Re-raise other errors
+                    raise
         ocr_time = time.time() - ocr_start
 
         total_time = time.time() - preprocess_start
diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py
index 18f5b85..8b52015 100644
--- a/app/services/ocr_service.py
+++ b/app/services/ocr_service.py
@@ -532,50 +532,51 @@ class GLMOCRService(OCRServiceBase):
 
         Returns:
             Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
+        
+        Raises:
+            RuntimeError: If the image cannot be encoded. Other errors (e.g. from the API client) propagate unwrapped for fallback handling.
         """
-        try:
-            # Add padding to image
-            padded_image = self.image_processor.add_padding(image)
+        # Add padding to image
+        padded_image = self.image_processor.add_padding(image)
 
-            # Encode image to base64
-            success, encoded_image = cv2.imencode(".png", padded_image)
-            if not success:
-                raise RuntimeError("Failed to encode image")
+        # Encode image to base64
+        success, encoded_image = cv2.imencode(".png", padded_image)
+        if not success:
+            raise RuntimeError("Failed to encode image")
 
-            image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
-            image_url = f"data:image/png;base64,{image_base64}"
+        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
+        image_url = f"data:image/png;base64,{image_base64}"
 
-            # Call OpenAI-compatible API with formula recognition prompt
-            prompt = "Formula Recognition:"
-            messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
+        # Call OpenAI-compatible API with formula recognition prompt
+        prompt = "Formula Recognition:"
+        messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
 
-            response = self.openai_client.chat.completions.create(
-                model="glm-ocr",
-                messages=messages,
-                temperature=0.0,
-            )
+        # Don't catch exceptions here - let them propagate for fallback handling
+        response = self.openai_client.chat.completions.create(
+            model="glm-ocr",
+            messages=messages,
+            temperature=0.0,
+        )
 
-            markdown_content = response.choices[0].message.content
+        markdown_content = response.choices[0].message.content
 
-            # Process LaTeX delimiters
-            if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("):
-                markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$")
-                markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$")
-            elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"):
-                markdown_content = f"$${markdown_content}$$"
+        # Process LaTeX delimiters
+        if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("):
+            markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$")
+            markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$")
+        elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"):
+            markdown_content = f"$${markdown_content}$$"
 
-            # Apply postprocessing
-            markdown_content = _postprocess_markdown(markdown_content)
-            convert_result = self.converter.convert_to_formats(markdown_content)
+        # Apply postprocessing
+        markdown_content = _postprocess_markdown(markdown_content)
+        convert_result = self.converter.convert_to_formats(markdown_content)
 
-            return {
-                "latex": convert_result.latex,
-                "mathml": convert_result.mathml,
-                "mml": convert_result.mml,
-                "markdown": markdown_content,
-            }
-        except Exception as e:
-            raise RuntimeError(f"GLM formula recognition failed: {e}") from e
+        return {
+            "latex": convert_result.latex,
+            "mathml": convert_result.mathml,
+            "mml": convert_result.mml,
+            "markdown": markdown_content,
+        }
 
     def recognize(self, image: np.ndarray) -> dict:
         """Recognize content using GLM-4V.