fix: glm max tokens

This commit is contained in:
liuyuanchuang
2026-02-07 21:38:41 +08:00
parent 1a4d54ce34
commit 5504bbbf1e
2 changed files with 65 additions and 42 deletions

View File

@@ -62,20 +62,20 @@ async def process_image_ocr(
     try:
         log.info("Starting image OCR processing")

-        # Preprocess image
+        # Preprocess image (load only, no padding yet)
         preprocess_start = time.time()
         image = image_processor.preprocess(
             image_url=request.image_url,
             image_base64=request.image_base64,
         )

-        # Apply padding if enabled (before layout detection)
+        # Apply padding only for layout detection
         processed_image = image
         if image_processor and settings.is_padding:
             processed_image = image_processor.add_padding(image)

         preprocess_time = time.time() - preprocess_start
-        log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s")
+        log.debug(f"Image loading completed in {preprocess_time:.3f}s")

         # Layout detection (using padded image if padding is enabled)
         layout_start = time.time()
@@ -83,14 +83,14 @@ async def process_image_ocr(
         layout_time = time.time() - layout_start
         log.info(f"Layout detection completed in {layout_time:.3f}s")

-        # OCR recognition
+        # OCR recognition (use original image without padding)
         ocr_start = time.time()
         if layout_info.MixedRecognition:
             recognition_method = "MixedRecognition (MinerU)"
             log.info(f"Using {recognition_method}")
-            # Convert numpy array to image bytes (image already padded)
-            success, encoded_image = cv2.imencode(".png", processed_image)
+            # Convert original image (without padding) to bytes
+            success, encoded_image = cv2.imencode(".png", image)
             if not success:
                 raise RuntimeError("Failed to encode image")
@@ -100,7 +100,29 @@ async def process_image_ocr(
         else:
             recognition_method = "FormulaOnly (GLMOCR)"
             log.info(f"Using {recognition_method}")
-            ocr_result = glmocr_service.recognize(processed_image)
+            # Try GLM-OCR first, fallback to MinerU if token limit exceeded
+            try:
+                ocr_result = glmocr_service.recognize(image)
+            except Exception as e:
+                error_msg = str(e)
+                # Check if error is due to token limit (max_model_len exceeded)
+                if "max_model_len" in error_msg or "decoder prompt" in error_msg or "BadRequestError" in error_msg:
+                    log.warning(f"GLM-OCR failed due to token limit: {error_msg}")
+                    log.info("Falling back to MinerU for recognition")
+                    recognition_method = "FormulaOnly (MinerU fallback)"
+                    # Convert original image to bytes for MinerU
+                    success, encoded_image = cv2.imencode(".png", image)
+                    if not success:
+                        raise RuntimeError("Failed to encode image")
+                    image_bytes = BytesIO(encoded_image.tobytes())
+                    image_bytes.seek(0)
+                    ocr_result = mineru_service.recognize(image_bytes)
+                else:
+                    # Re-raise other errors
+                    raise

         ocr_time = time.time() - ocr_start
         total_time = time.time() - preprocess_start

View File

@@ -532,50 +532,51 @@ class GLMOCRService(OCRServiceBase):
         Returns:
             Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
+
+        Raises:
+            RuntimeError: If recognition fails (preserves original exception for fallback handling).
         """
-        try:
-            # Add padding to image
-            padded_image = self.image_processor.add_padding(image)
+        # Add padding to image
+        padded_image = self.image_processor.add_padding(image)

-            # Encode image to base64
-            success, encoded_image = cv2.imencode(".png", padded_image)
-            if not success:
-                raise RuntimeError("Failed to encode image")
-            image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
-            image_url = f"data:image/png;base64,{image_base64}"
+        # Encode image to base64
+        success, encoded_image = cv2.imencode(".png", padded_image)
+        if not success:
+            raise RuntimeError("Failed to encode image")
+        image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
+        image_url = f"data:image/png;base64,{image_base64}"

-            # Call OpenAI-compatible API with formula recognition prompt
-            prompt = "Formula Recognition:"
-            messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
-            response = self.openai_client.chat.completions.create(
-                model="glm-ocr",
-                messages=messages,
-                temperature=0.0,
-            )
+        # Call OpenAI-compatible API with formula recognition prompt
+        prompt = "Formula Recognition:"
+        messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
+
+        # Don't catch exceptions here - let them propagate for fallback handling
+        response = self.openai_client.chat.completions.create(
+            model="glm-ocr",
+            messages=messages,
+            temperature=0.0,
+        )
-            markdown_content = response.choices[0].message.content
+        markdown_content = response.choices[0].message.content

-            # Process LaTeX delimiters
-            if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("):
-                markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$")
-                markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$")
-            elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"):
-                markdown_content = f"$${markdown_content}$$"
+        # Process LaTeX delimiters
+        if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("):
+            markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$")
+            markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$")
+        elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"):
+            markdown_content = f"$${markdown_content}$$"

-            # Apply postprocessing
-            markdown_content = _postprocess_markdown(markdown_content)
-            convert_result = self.converter.convert_to_formats(markdown_content)
+        # Apply postprocessing
+        markdown_content = _postprocess_markdown(markdown_content)
+        convert_result = self.converter.convert_to_formats(markdown_content)

-            return {
-                "latex": convert_result.latex,
-                "mathml": convert_result.mathml,
-                "mml": convert_result.mml,
-                "markdown": markdown_content,
-            }
-        except Exception as e:
-            raise RuntimeError(f"GLM formula recognition failed: {e}") from e
+        return {
+            "latex": convert_result.latex,
+            "mathml": convert_result.mathml,
+            "mml": convert_result.mml,
+            "markdown": markdown_content,
+        }

     def recognize(self, image: np.ndarray) -> dict:
         """Recognize content using GLM-4V.