fix: image as element

This commit is contained in:
liuyuanchuang
2026-02-09 22:18:30 +08:00
parent 5504bbbf1e
commit 7799e39298
5 changed files with 308 additions and 3 deletions

View File

@@ -143,7 +143,7 @@ def _clean_latex_syntax_spaces(expr: str) -> str:
# Remove spaces everywhere else (e.g., x \in -> x\in is fine)
# Strategy: remove spaces before \ and between non-command chars,
# but preserve the space after \command when followed by a non-\ char
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
cleaned = re.sub(r"\s+(?=\\)", "", content) # remove space before \cmd
cleaned = re.sub(r"(?<!\\)(?<![a-zA-Z])\s+", "", cleaned) # remove space after non-letter non-\
return f"{operator}{{{cleaned}}}"
@@ -532,7 +532,7 @@ class GLMOCRService(OCRServiceBase):
Returns:
Dict with 'latex', 'markdown', 'mathml', 'mml' keys.
Raises:
RuntimeError: If recognition fails (preserves original exception for fallback handling).
"""
@@ -637,7 +637,7 @@ class MineruOCRService(OCRServiceBase):
messages = [{"role": "user", "content": [{"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": prompt}]}]
response = self.openai_client.chat.completions.create(
model="PaddleOCR-VL-0.9B",
model="glm-ocr",
messages=messages,
temperature=0.0,
)
@@ -714,6 +714,9 @@ class MineruOCRService(OCRServiceBase):
if "results" in result and "image" in result["results"]:
markdown_content = result["results"]["image"].get("md_content", "")
if "![](images/" in markdown_content:
markdown_content = self._extract_and_recognize_formulas(markdown_content, image)
# Apply postprocessing to fix OCR errors
markdown_content = _postprocess_markdown(markdown_content)

100
create_table.py Normal file
View File

@@ -0,0 +1,100 @@
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, Border, Side
# Build a one-sheet workbook that lays five measurement tables side by side,
# two columns per category (expression, result), and save it as 数据表.xlsx.
wb = Workbook()
sheet = wb.active
sheet.title = "数据表"

# Header row: bold, centred titles in row 1.
headers = ["类别:电动", "结果", "类别:电工", "结果", "类别:黑板", "结果", "类别:小", "结果", "类别:大", "结果"]
for position, title in enumerate(headers, start=1):
    header_cell = sheet.cell(row=1, column=position, value=title)
    header_cell.font = Font(bold=True)
    header_cell.alignment = Alignment(horizontal="center")

# Measurement tables; each entry is [subtraction expression, result].
data_电动 = [
    ["2615 - 243", 2372], ["2633 - 244", 2389],
    ["2542 - 243", 2299], ["2453 - 369", 2084],
    ["2670 - 244", 2426], ["2416 - 357", 2059],
    ["2574 - 239", 2335], ["2641 - 243", 2398],
    ["2640 - 242", 2398], ["2257 - 244", 2013],
    ["3114 - 410", 2704], ["2815 - 412", 2403],
    ["2545 - 243", 2302], ["2299 - 243", 2056],
]
data_电工 = [
    ["2443 - 20", 2423],
    ["2460 - 13", 2447],
    ["2339 - 17", 2322],
    ["2405 - 13", 2392],
    ["2661 - 16", 2645],
    ["3065 - 15", 3050],
    ["2912 - 14", 2898],
    ["2883 - 14", 2869],
]
data_黑板 = [
    ["1902 - 14", 1888], ["2319 - 14", 2305],
    ["2041 - 16", 2025], ["2451 - 13", 2438],
    ["1993 - 14", 1979], ["1841 - 18", 1823],
    ["2083 - 16", 2067], ["2009 - 18", 1991],
    ["2001 - 16", 1985], ["2014 - 14", 2000],
    ["2073 - 17", 2056], ["2008 - 15", 1993],
    ["2030 - 16", 2014], ["1928 - 22", 1906],
    ["1991 - 19", 1972], ["2235 - 12", 2223],
    ["2335 - 19", 2316], ["1920 - 16", 1904],
    ["1942 - 16", 1926], ["1974 - 19", 1955],
    ["1863 - 16", 1847], ["512 - 15", 497],
]
data_小 = [
    ["1881 - 20", 1861],
    ["2055 - 22", 2033],
    ["2034 - 20", 2014],
    ["1981 - 17", 1964],
    ["1629 - 12", 1617],
    ["913 - 18", 895],
    ["1842 - 19", 1823],
    ["1891 - 20", 1871],
]
data_大 = [
    ["1931 - 20", 1911],
    ["1775 - 11", 1764],
    ["1885 - 21", 1864],
    ["120 - 18", 102],
    ["1675 - 13", 1662],
]

# Data starts on row 2. Tables have different lengths, so shorter ones simply
# leave their cells untouched once they run out of entries.
tables = (data_电动, data_电工, data_黑板, data_小, data_大)
row = 2
max_rows = max(len(table) for table in tables)
for offset in range(max_rows):
    for slot, table in enumerate(tables):
        if offset >= len(table):
            continue
        expression, result = table[offset]
        # Table k occupies columns 2k+1 (expression) and 2k+2 (result).
        first_column = 2 * slot + 1
        sheet.cell(row=row + offset, column=first_column, value=expression)
        sheet.cell(row=row + offset, column=first_column + 1, value=result)

# Same width for the ten populated columns, A through J.
for letter in "ABCDEFGHIJ":
    sheet.column_dimensions[letter].width = 15

wb.save("数据表.xlsx")
print("Excel文件已创建: 数据表.xlsx")

91
create_table_pandas.py Normal file
View File

@@ -0,0 +1,91 @@
import sys
# NOTE(review): this prepends a hard-coded Homebrew Python 3.13 site-packages
# directory to the import path before pandas is imported. The path is
# machine-specific — confirm it is still needed before sharing this script.
sys.path.insert(0, "/opt/homebrew/lib/python3.13/site-packages")
import pandas as pd
# Measurement tables, one per category. Each entry is a
# [subtraction expression, result] pair.
data_电动 = [
    ["2615 - 243", 2372], ["2633 - 244", 2389],
    ["2542 - 243", 2299], ["2453 - 369", 2084],
    ["2670 - 244", 2426], ["2416 - 357", 2059],
    ["2574 - 239", 2335], ["2641 - 243", 2398],
    ["2640 - 242", 2398], ["2257 - 244", 2013],
    ["3114 - 410", 2704], ["2815 - 412", 2403],
    ["2545 - 243", 2302], ["2299 - 243", 2056],
]
data_电工 = [
    ["2443 - 20", 2423],
    ["2460 - 13", 2447],
    ["2339 - 17", 2322],
    ["2405 - 13", 2392],
    ["2661 - 16", 2645],
    ["3065 - 15", 3050],
    ["2912 - 14", 2898],
    ["2883 - 14", 2869],
]
data_黑板 = [
    ["1902 - 14", 1888], ["2319 - 14", 2305],
    ["2041 - 16", 2025], ["2451 - 13", 2438],
    ["1993 - 14", 1979], ["1841 - 18", 1823],
    ["2083 - 16", 2067], ["2009 - 18", 1991],
    ["2001 - 16", 1985], ["2014 - 14", 2000],
    ["2073 - 17", 2056], ["2008 - 15", 1993],
    ["2030 - 16", 2014], ["1928 - 22", 1906],
    ["1991 - 19", 1972], ["2235 - 12", 2223],
    ["2335 - 19", 2316], ["1920 - 16", 1904],
    ["1942 - 16", 1926], ["1974 - 19", 1955],
    ["1863 - 16", 1847], ["512 - 15", 497],
]
data_小 = [
    ["1881 - 20", 1861],
    ["2055 - 22", 2033],
    ["2034 - 20", 2014],
    ["1981 - 17", 1964],
    ["1629 - 12", 1617],
    ["913 - 18", 895],
    ["1842 - 19", 1823],
    ["1891 - 20", 1871],
]
data_大 = [
    ["1931 - 20", 1911],
    ["1775 - 11", 1764],
    ["1885 - 21", 1864],
    ["120 - 18", 102],
    ["1675 - 13", 1662],
]

# Length of the longest table; shorter tables are padded up to this length.
max_len = max(map(len, (data_电动, data_电工, data_黑板, data_小, data_大)))
def pad_list(lst, length):
    """Return a copy of *lst* extended with ``["", ""]`` rows up to *length*.

    Args:
        lst: List of two-item rows to pad. It is not modified.
        length: Desired minimum number of rows in the result.

    Returns:
        A new list holding the rows of *lst* followed by enough ``["", ""]``
        rows to reach *length*. When *lst* already has *length* rows or more,
        the result is an unpadded copy.
    """
    missing = length - len(lst)
    # Build a fresh pair per padding row: ``[["", ""]] * n`` would repeat one
    # shared list object, so editing one padding row would edit all of them.
    return lst + [["", ""] for _ in range(missing)]
# Bring every table up to max_len rows so all DataFrame columns have the same
# length. The right-hand tuple is read before any name is rebound.
data_电动, data_电工, data_黑板, data_小, data_大 = (
    pad_list(table, max_len)
    for table in (data_电动, data_电工, data_黑板, data_小, data_大)
)

# (column title, source table, index within each pair), in output order.
# The result columns carry numeric suffixes so every title is unique.
column_specs = [
    ("类别:电动", data_电动, 0),
    ("结果_1", data_电动, 1),
    ("类别:电工", data_电工, 0),
    ("结果_2", data_电工, 1),
    ("类别:黑板", data_黑板, 0),
    ("结果_3", data_黑板, 1),
    ("类别:小", data_小, 0),
    ("结果_4", data_小, 1),
    ("类别:大", data_大, 0),
    ("结果_5", data_大, 1),
]
df = pd.DataFrame(
    {title: [pair[index] for pair in table] for title, table, index in column_specs}
)

# Swap the "" padding for missing values before exporting.
df = df.replace("", pd.NA)

# Write the sheet without the index column.
df.to_excel("数据表.xlsx", index=False, sheet_name="数据表")
print("Excel文件已创建: 数据表.xlsx")

88
csv_to_xlsx.py Normal file
View File

@@ -0,0 +1,88 @@
#!/usr/bin/env python3
import csv
# Convert the CSV file into an .xlsx workbook. xlsxwriter is used when it is
# installed; otherwise the script falls back to openpyxl, and if neither can be
# imported it prints a notice and leaves the CSV as it is.
csv_file = "数据表.csv"
xlsx_file = "数据表.xlsx"

# Only the imports sit inside the try bodies, so an ImportError raised while
# converting is never mistaken for "library not installed".
try:
    import xlsxwriter
except ImportError:
    print("xlsxwriter not found, trying openpyxl...")
    try:
        from openpyxl import Workbook
        from openpyxl.styles import Font, Alignment
    except ImportError:
        print("Neither xlsxwriter nor openpyxl is available.")
        print(f"CSV file has been created: {csv_file}")
        print("You can manually convert it to Excel format.")
    else:
        wb = Workbook()
        ws = wb.active
        ws.title = "数据表"

        # newline="" is how the csv module expects reader input to be opened.
        with open(csv_file, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f)
            for row_idx, row in enumerate(reader, start=1):
                for col_idx, value in enumerate(row, start=1):
                    cell = ws.cell(row=row_idx, column=col_idx)
                    # Store integer-looking text as a number; anything else
                    # (including the empty string) is kept as text.
                    try:
                        cell.value = int(value) if value else value
                    except ValueError:
                        cell.value = value
                    # Every cell is centred; the header row is also bold.
                    if row_idx == 1:
                        cell.font = Font(bold=True)
                    cell.alignment = Alignment(horizontal="center")

        # Fixed width for the first ten columns (A-J).
        for col in range(1, 11):
            ws.column_dimensions[chr(64 + col)].width = 15

        wb.save(xlsx_file)
        print(f"Excel file created: {xlsx_file}")
else:
    workbook = xlsxwriter.Workbook(xlsx_file)
    worksheet = workbook.add_worksheet("数据表")

    # Header cells are bold and centred; data cells are centred only.
    bold = workbook.add_format({"bold": True, "align": "center"})
    center = workbook.add_format({"align": "center"})

    # newline="" is how the csv module expects reader input to be opened.
    with open(csv_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        for row_idx, row in enumerate(reader):
            for col_idx, value in enumerate(row):
                if row_idx == 0:  # Header row
                    worksheet.write(row_idx, col_idx, value, bold)
                elif not value:
                    # Empty cells are written without a format.
                    worksheet.write(row_idx, col_idx, value)
                else:
                    # Integer-looking text becomes a number; the rest stays text.
                    try:
                        num_value = int(value)
                    except ValueError:
                        worksheet.write(row_idx, col_idx, value, center)
                    else:
                        worksheet.write_number(row_idx, col_idx, num_value, center)

    # Fixed width for the first ten columns.
    for col in range(10):
        worksheet.set_column(col, col, 15)

    workbook.close()
    print(f"Excel file created: {xlsx_file}")

23
数据表.csv Normal file
View File

@@ -0,0 +1,23 @@
类别:电动,结果,类别:电工,结果,类别:黑板,结果,类别:小,结果,类别:大,结果
2615 - 243,2372,2443 - 20,2423,1902 - 14,1888,1881 - 20,1861,1931 - 20,1911
2633 - 244,2389,2460 - 13,2447,2319 - 14,2305,2055 - 22,2033,1775 - 11,1764
2542 - 243,2299,2339 - 17,2322,2041 - 16,2025,2034 - 20,2014,1885 - 21,1864
2453 - 369,2084,2405 - 13,2392,2451 - 13,2438,1981 - 17,1964,120 - 18,102
2670 - 244,2426,2661 - 16,2645,1993 - 14,1979,1629 - 12,1617,1675 - 13,1662
2416 - 357,2059,3065 - 15,3050,1841 - 18,1823,913 - 18,895,,
2574 - 239,2335,2912 - 14,2898,2083 - 16,2067,1842 - 19,1823,,
2641 - 243,2398,2883 - 14,2869,2009 - 18,1991,1891 - 20,1871,,
2640 - 242,2398,,,2001 - 16,1985,,,,
2257 - 244,2013,,,2014 - 14,2000,,,,
3114 - 410,2704,,,2073 - 17,2056,,,,
2815 - 412,2403,,,2008 - 15,1993,,,,
2545 - 243,2302,,,2030 - 16,2014,,,,
2299 - 243,2056,,,1928 - 22,1906,,,,
,,,,1991 - 19,1972,,,,
,,,,2235 - 12,2223,,,,
,,,,2335 - 19,2316,,,,
,,,,1920 - 16,1904,,,,
,,,,1942 - 16,1926,,,,
,,,,1974 - 19,1955,,,,
,,,,1863 - 16,1847,,,,
,,,,512 - 15,497,,,,
1 类别:电动,结果,类别:电工,结果,类别:黑板,结果,类别:小,结果,类别:大,结果
2 2615 - 243,2372,2443 - 20,2423,1902 - 14,1888,1881 - 20,1861,1931 - 20,1911
3 2633 - 244,2389,2460 - 13,2447,2319 - 14,2305,2055 - 22,2033,1775 - 11,1764
4 2542 - 243,2299,2339 - 17,2322,2041 - 16,2025,2034 - 20,2014,1885 - 21,1864
5 2453 - 369,2084,2405 - 13,2392,2451 - 13,2438,1981 - 17,1964,120 - 18,102
6 2670 - 244,2426,2661 - 16,2645,1993 - 14,1979,1629 - 12,1617,1675 - 13,1662
7 2416 - 357,2059,3065 - 15,3050,1841 - 18,1823,913 - 18,895,,
8 2574 - 239,2335,2912 - 14,2898,2083 - 16,2067,1842 - 19,1823,,
9 2641 - 243,2398,2883 - 14,2869,2009 - 18,1991,1891 - 20,1871,,
10 2640 - 242,2398,,,2001 - 16,1985,,,,
11 2257 - 244,2013,,,2014 - 14,2000,,,,
12 3114 - 410,2704,,,2073 - 17,2056,,,,
13 2815 - 412,2403,,,2008 - 15,1993,,,,
14 2545 - 243,2302,,,2030 - 16,2014,,,,
15 2299 - 243,2056,,,1928 - 22,1906,,,,
16 ,,,,1991 - 19,1972,,,,
17 ,,,,2235 - 12,2223,,,,
18 ,,,,2335 - 19,2316,,,,
19 ,,,,1920 - 16,1904,,,,
20 ,,,,1942 - 16,1926,,,,
21 ,,,,1974 - 19,1955,,,,
22 ,,,,1863 - 16,1847,,,,
23 ,,,,512 - 15,497,,,,