fix: GLM max tokens (fall back to MinerU when GLM-OCR exceeds the token limit)
This commit is contained in:
@@ -62,20 +62,20 @@ async def process_image_ocr(
|
|||||||
try:
|
try:
|
||||||
log.info("Starting image OCR processing")
|
log.info("Starting image OCR processing")
|
||||||
|
|
||||||
# Preprocess image
|
# Preprocess image (load only, no padding yet)
|
||||||
preprocess_start = time.time()
|
preprocess_start = time.time()
|
||||||
image = image_processor.preprocess(
|
image = image_processor.preprocess(
|
||||||
image_url=request.image_url,
|
image_url=request.image_url,
|
||||||
image_base64=request.image_base64,
|
image_base64=request.image_base64,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Apply padding if enabled (before layout detection)
|
# Apply padding only for layout detection
|
||||||
processed_image = image
|
processed_image = image
|
||||||
if image_processor and settings.is_padding:
|
if image_processor and settings.is_padding:
|
||||||
processed_image = image_processor.add_padding(image)
|
processed_image = image_processor.add_padding(image)
|
||||||
|
|
||||||
preprocess_time = time.time() - preprocess_start
|
preprocess_time = time.time() - preprocess_start
|
||||||
log.debug(f"Image preprocessing completed in {preprocess_time:.3f}s")
|
log.debug(f"Image loading completed in {preprocess_time:.3f}s")
|
||||||
|
|
||||||
# Layout detection (using padded image if padding is enabled)
|
# Layout detection (using padded image if padding is enabled)
|
||||||
layout_start = time.time()
|
layout_start = time.time()
|
||||||
@@ -83,14 +83,14 @@ async def process_image_ocr(
|
|||||||
layout_time = time.time() - layout_start
|
layout_time = time.time() - layout_start
|
||||||
log.info(f"Layout detection completed in {layout_time:.3f}s")
|
log.info(f"Layout detection completed in {layout_time:.3f}s")
|
||||||
|
|
||||||
# OCR recognition
|
# OCR recognition (use original image without padding)
|
||||||
ocr_start = time.time()
|
ocr_start = time.time()
|
||||||
if layout_info.MixedRecognition:
|
if layout_info.MixedRecognition:
|
||||||
recognition_method = "MixedRecognition (MinerU)"
|
recognition_method = "MixedRecognition (MinerU)"
|
||||||
log.info(f"Using {recognition_method}")
|
log.info(f"Using {recognition_method}")
|
||||||
|
|
||||||
# Convert numpy array to image bytes (image already padded)
|
# Convert original image (without padding) to bytes
|
||||||
success, encoded_image = cv2.imencode(".png", processed_image)
|
success, encoded_image = cv2.imencode(".png", image)
|
||||||
if not success:
|
if not success:
|
||||||
raise RuntimeError("Failed to encode image")
|
raise RuntimeError("Failed to encode image")
|
||||||
|
|
||||||
@@ -100,7 +100,29 @@ async def process_image_ocr(
|
|||||||
else:
|
else:
|
||||||
recognition_method = "FormulaOnly (GLMOCR)"
|
recognition_method = "FormulaOnly (GLMOCR)"
|
||||||
log.info(f"Using {recognition_method}")
|
log.info(f"Using {recognition_method}")
|
||||||
ocr_result = glmocr_service.recognize(processed_image)
|
|
||||||
|
# Try GLM-OCR first, fallback to MinerU if token limit exceeded
|
||||||
|
try:
|
||||||
|
ocr_result = glmocr_service.recognize(image)
|
||||||
|
except Exception as e:
|
||||||
|
error_msg = str(e)
|
||||||
|
# Check if error looks like a token limit failure (max_model_len exceeded); note that any BadRequestError message also matches
|
||||||
|
if "max_model_len" in error_msg or "decoder prompt" in error_msg or "BadRequestError" in error_msg:
|
||||||
|
log.warning(f"GLM-OCR failed due to token limit: {error_msg}")
|
||||||
|
log.info("Falling back to MinerU for recognition")
|
||||||
|
recognition_method = "FormulaOnly (MinerU fallback)"
|
||||||
|
|
||||||
|
# Convert original image to bytes for MinerU
|
||||||
|
success, encoded_image = cv2.imencode(".png", image)
|
||||||
|
if not success:
|
||||||
|
raise RuntimeError("Failed to encode image")
|
||||||
|
|
||||||
|
image_bytes = BytesIO(encoded_image.tobytes())
|
||||||
|
image_bytes.seek(0)
|
||||||
|
ocr_result = mineru_service.recognize(image_bytes)
|
||||||
|
else:
|
||||||
|
# Re-raise other errors
|
||||||
|
raise
|
||||||
ocr_time = time.time() - ocr_start
|
ocr_time = time.time() - ocr_start
|
||||||
|
|
||||||
total_time = time.time() - preprocess_start
|
total_time = time.time() - preprocess_start
|
||||||
|
|||||||
@@ -532,50 +532,51 @@ class GLMOCRService(OCRServiceBase):
|
|||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
|
Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If the image cannot be encoded. Errors from the API call are not wrapped; they propagate unchanged for fallback handling.
|
||||||
"""
|
"""
|
||||||
try:
|
# Add padding to image
|
||||||
# Add padding to image
|
padded_image = self.image_processor.add_padding(image)
|
||||||
padded_image = self.image_processor.add_padding(image)
|
|
||||||
|
|
||||||
# Encode image to base64
|
# Encode image to base64
|
||||||
success, encoded_image = cv2.imencode(".png", padded_image)
|
success, encoded_image = cv2.imencode(".png", padded_image)
|
||||||
if not success:
|
if not success:
|
||||||
raise RuntimeError("Failed to encode image")
|
raise RuntimeError("Failed to encode image")
|
||||||
|
|
||||||
image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
|
image_base64 = base64.b64encode(encoded_image.tobytes()).decode("utf-8")
|
||||||
image_url = f"data:image/png;base64,{image_base64}"
|
image_url = f"data:image/png;base64,{image_base64}"
|
||||||
|
|
||||||
# Call OpenAI-compatible API with formula recognition prompt
|
# Call OpenAI-compatible API with formula recognition prompt
|
||||||
prompt = "Formula Recognition:"
|
prompt = "Formula Recognition:"
|
||||||
messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
|
messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
|
||||||
|
|
||||||
response = self.openai_client.chat.completions.create(
|
# Don't catch exceptions here - let them propagate for fallback handling
|
||||||
model="glm-ocr",
|
response = self.openai_client.chat.completions.create(
|
||||||
messages=messages,
|
model="glm-ocr",
|
||||||
temperature=0.0,
|
messages=messages,
|
||||||
)
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
markdown_content = response.choices[0].message.content
|
markdown_content = response.choices[0].message.content
|
||||||
|
|
||||||
# Process LaTeX delimiters
|
# Process LaTeX delimiters
|
||||||
if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("):
|
if markdown_content.startswith(r"\[") or markdown_content.startswith(r"\("):
|
||||||
markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$")
|
markdown_content = markdown_content.replace(r"\[", "$$").replace(r"\(", "$$")
|
||||||
markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$")
|
markdown_content = markdown_content.replace(r"\]", "$$").replace(r"\)", "$$")
|
||||||
elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"):
|
elif not markdown_content.startswith("$$") and not markdown_content.startswith("$"):
|
||||||
markdown_content = f"$${markdown_content}$$"
|
markdown_content = f"$${markdown_content}$$"
|
||||||
|
|
||||||
# Apply postprocessing
|
# Apply postprocessing
|
||||||
markdown_content = _postprocess_markdown(markdown_content)
|
markdown_content = _postprocess_markdown(markdown_content)
|
||||||
convert_result = self.converter.convert_to_formats(markdown_content)
|
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"latex": convert_result.latex,
|
"latex": convert_result.latex,
|
||||||
"mathml": convert_result.mathml,
|
"mathml": convert_result.mathml,
|
||||||
"mml": convert_result.mml,
|
"mml": convert_result.mml,
|
||||||
"markdown": markdown_content,
|
"markdown": markdown_content,
|
||||||
}
|
}
|
||||||
except Exception as e:
|
|
||||||
raise RuntimeError(f"GLM formula recognition failed: {e}") from e
|
|
||||||
|
|
||||||
def recognize(self, image: np.ndarray) -> dict:
|
def recognize(self, image: np.ndarray) -> dict:
|
||||||
"""Recognize content using GLM-4V.
|
"""Recognize content using GLM-4V.
|
||||||
|
|||||||
Reference in New Issue
Block a user