feat: optimize format conversion
This commit is contained in:
@@ -17,13 +17,31 @@ settings = get_settings()
|
||||
|
||||
_COMMANDS_NEED_SPACE = {
|
||||
# operators / calculus
|
||||
"cdot", "times", "div", "pm", "mp",
|
||||
"int", "iint", "iiint", "oint", "sum", "prod", "lim",
|
||||
"cdot",
|
||||
"times",
|
||||
"div",
|
||||
"pm",
|
||||
"mp",
|
||||
"int",
|
||||
"iint",
|
||||
"iiint",
|
||||
"oint",
|
||||
"sum",
|
||||
"prod",
|
||||
"lim",
|
||||
# common functions
|
||||
"sin", "cos", "tan", "cot", "sec", "csc",
|
||||
"log", "ln", "exp",
|
||||
"sin",
|
||||
"cos",
|
||||
"tan",
|
||||
"cot",
|
||||
"sec",
|
||||
"csc",
|
||||
"log",
|
||||
"ln",
|
||||
"exp",
|
||||
# misc
|
||||
"partial", "nabla",
|
||||
"partial",
|
||||
"nabla",
|
||||
}
|
||||
|
||||
_MATH_SEGMENT_PATTERN = re.compile(r"\$\$.*?\$\$|\$.*?\$", re.DOTALL)
|
||||
@@ -58,7 +76,7 @@ def _split_glued_command_token(token: str) -> str:
|
||||
if not best:
|
||||
return token
|
||||
|
||||
suffix = body[len(best):]
|
||||
suffix = body[len(best) :]
|
||||
if not suffix:
|
||||
return token
|
||||
|
||||
@@ -118,11 +136,11 @@ class OCRService(OCRServiceBase):
|
||||
image_processor: Image processor instance.
|
||||
"""
|
||||
self.vl_server_url = vl_server_url or settings.paddleocr_vl_url
|
||||
self.layout_detector = layout_detector
|
||||
self.layout_detector = layout_detector
|
||||
self.image_processor = image_processor
|
||||
self.converter = converter
|
||||
|
||||
def _get_pipeline(self):
|
||||
def _get_pipeline(self):
|
||||
"""Get or create PaddleOCR-VL pipeline.
|
||||
|
||||
Returns:
|
||||
@@ -159,12 +177,13 @@ class OCRService(OCRServiceBase):
|
||||
markdown_content += res.markdown.get("markdown_texts", "")
|
||||
|
||||
markdown_content = _postprocess_markdown(markdown_content)
|
||||
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||
|
||||
return {
|
||||
"markdown": markdown_content,
|
||||
"latex": convert_result.latex,
|
||||
"mathml": convert_result.mathml,
|
||||
"mml": convert_result.mml,
|
||||
}
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Mixed recognition failed: {e}") from e
|
||||
@@ -196,6 +215,7 @@ class OCRService(OCRServiceBase):
|
||||
return {
|
||||
"latex": convert_result.latex,
|
||||
"mathml": convert_result.mathml,
|
||||
"mml": convert_result.mml,
|
||||
"markdown": markdown_content,
|
||||
}
|
||||
except Exception as e:
|
||||
@@ -220,7 +240,7 @@ class OCRService(OCRServiceBase):
|
||||
|
||||
class MineruOCRService(OCRServiceBase):
|
||||
"""Service for OCR using local file_parse API."""
|
||||
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_url: str = "http://127.0.0.1:8000/file_parse",
|
||||
@@ -228,7 +248,7 @@ class MineruOCRService(OCRServiceBase):
|
||||
converter: Optional[Converter] = None,
|
||||
):
|
||||
"""Initialize Local API service.
|
||||
|
||||
|
||||
Args:
|
||||
api_url: URL of the local file_parse API endpoint.
|
||||
converter: Optional converter instance for format conversion.
|
||||
@@ -236,13 +256,13 @@ class MineruOCRService(OCRServiceBase):
|
||||
self.api_url = api_url
|
||||
self.image_processor = image_processor
|
||||
self.converter = converter
|
||||
|
||||
|
||||
def recognize(self, image: np.ndarray) -> dict:
|
||||
"""Recognize content using local file_parse API.
|
||||
|
||||
|
||||
Args:
|
||||
image: Input image as numpy array in BGR format.
|
||||
|
||||
|
||||
Returns:
|
||||
Dict with 'markdown', 'latex', 'mathml', 'mml' keys.
|
||||
"""
|
||||
@@ -251,78 +271,71 @@ class MineruOCRService(OCRServiceBase):
|
||||
image = self.image_processor.add_padding(image)
|
||||
|
||||
# Convert numpy array to image bytes
|
||||
success, encoded_image = cv2.imencode('.png', image)
|
||||
success, encoded_image = cv2.imencode(".png", image)
|
||||
if not success:
|
||||
raise RuntimeError("Failed to encode image")
|
||||
|
||||
|
||||
image_bytes = BytesIO(encoded_image.tobytes())
|
||||
|
||||
|
||||
# Prepare multipart form data
|
||||
files = {
|
||||
'files': ('image.png', image_bytes, 'image/png')
|
||||
}
|
||||
|
||||
files = {"files": ("image.png", image_bytes, "image/png")}
|
||||
|
||||
data = {
|
||||
'return_middle_json': 'false',
|
||||
'return_model_output': 'false',
|
||||
'return_md': 'true',
|
||||
'return_images': 'false',
|
||||
'end_page_id': '99999',
|
||||
'start_page_id': '0',
|
||||
'lang_list': 'en',
|
||||
'server_url': 'string',
|
||||
'return_content_list': 'false',
|
||||
'backend': 'hybrid-auto-engine',
|
||||
'table_enable': 'true',
|
||||
'response_format_zip': 'false',
|
||||
'formula_enable': 'true',
|
||||
'parse_method': 'ocr'
|
||||
"return_middle_json": "false",
|
||||
"return_model_output": "false",
|
||||
"return_md": "true",
|
||||
"return_images": "false",
|
||||
"end_page_id": "99999",
|
||||
"start_page_id": "0",
|
||||
"lang_list": "en",
|
||||
"server_url": "string",
|
||||
"return_content_list": "false",
|
||||
"backend": "hybrid-auto-engine",
|
||||
"table_enable": "true",
|
||||
"response_format_zip": "false",
|
||||
"formula_enable": "true",
|
||||
"parse_method": "ocr",
|
||||
}
|
||||
|
||||
|
||||
# Make API request
|
||||
response = requests.post(
|
||||
self.api_url,
|
||||
files=files,
|
||||
data=data,
|
||||
headers={'accept': 'application/json'},
|
||||
timeout=30
|
||||
)
|
||||
response = requests.post(self.api_url, files=files, data=data, headers={"accept": "application/json"}, timeout=30)
|
||||
response.raise_for_status()
|
||||
|
||||
|
||||
result = response.json()
|
||||
|
||||
|
||||
# Extract markdown content from response
|
||||
markdown_content = ""
|
||||
if 'results' in result and 'image' in result['results']:
|
||||
markdown_content = result['results']['image'].get('md_content', '')
|
||||
if "results" in result and "image" in result["results"]:
|
||||
markdown_content = result["results"]["image"].get("md_content", "")
|
||||
|
||||
# markdown_content = _postprocess_markdown(markdown_content)
|
||||
|
||||
|
||||
# Convert to other formats if converter is available
|
||||
latex = ""
|
||||
mathml = ""
|
||||
mml = ""
|
||||
if self.converter and markdown_content:
|
||||
convert_result = self.converter.convert_to_formats(markdown_content)
|
||||
latex = convert_result.latex
|
||||
mathml = convert_result.mathml
|
||||
|
||||
mml = convert_result.mml
|
||||
|
||||
return {
|
||||
"markdown": markdown_content,
|
||||
"latex": latex,
|
||||
"mathml": mathml,
|
||||
"mml": mml,
|
||||
}
|
||||
|
||||
|
||||
except requests.RequestException as e:
|
||||
raise RuntimeError(f"Local API request failed: {e}") from e
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Recognition failed: {e}") from e
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
mineru_service = MineruOCRService()
|
||||
image = cv2.imread("test/complex_formula.png")
|
||||
image_numpy = np.array(image)
|
||||
ocr_result = mineru_service.recognize(image_numpy)
|
||||
print(ocr_result)
|
||||
print(ocr_result)
|
||||
|
||||
Reference in New Issue
Block a user