web demo加入了katex支持, 不再需要本地安装xelatex渲染器
This commit is contained in:
@@ -9,5 +9,4 @@ tensorboardX
|
|||||||
nltk
|
nltk
|
||||||
python-multipart
|
python-multipart
|
||||||
|
|
||||||
pdf2image
|
|
||||||
augraphy
|
augraphy
|
||||||
120
src/web.py
120
src/web.py
@@ -2,14 +2,11 @@ import os
|
|||||||
import io
|
import io
|
||||||
import base64
|
import base64
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
|
||||||
import subprocess
|
|
||||||
import shutil
|
import shutil
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
import re
|
||||||
|
|
||||||
from PIL import Image, ImageChops
|
from PIL import Image
|
||||||
from pathlib import Path
|
|
||||||
from pdf2image import convert_from_path
|
|
||||||
from models.ocr_model.utils.inference import inference
|
from models.ocr_model.utils.inference import inference
|
||||||
from models.ocr_model.model.TexTeller import TexTeller
|
from models.ocr_model.model.TexTeller import TexTeller
|
||||||
|
|
||||||
@@ -69,6 +66,19 @@ def get_model():
|
|||||||
def get_tokenizer():
|
def get_tokenizer():
|
||||||
return TexTeller.get_tokenizer(os.environ['TOKENIZER_DIR'])
|
return TexTeller.get_tokenizer(os.environ['TOKENIZER_DIR'])
|
||||||
|
|
||||||
|
def to_katex(formula: str) -> str:
|
||||||
|
res = formula
|
||||||
|
res = re.sub(r'\\mbox\{([^}]*)\}', r'\1', res)
|
||||||
|
res = re.sub(r'boldmath\$(.*?)\$', r'bm{\1}', res)
|
||||||
|
res = re.sub(r'\\\[(.*?)\\\]', r'\1\\newline', res)
|
||||||
|
|
||||||
|
pattern = r'(\\(?:left|middle|right|big|Big|bigg|Bigg|bigl|Bigl|biggl|Biggl|bigm|Bigm|biggm|Biggm|bigr|Bigr|biggr|Biggr))\{([^}]*)\}'
|
||||||
|
replacement = r'\1\2'
|
||||||
|
res = re.sub(pattern, replacement, res)
|
||||||
|
if res.endswith(r'\newline'):
|
||||||
|
res = res[:-8]
|
||||||
|
return res
|
||||||
|
|
||||||
def get_image_base64(img_file):
|
def get_image_base64(img_file):
|
||||||
buffered = io.BytesIO()
|
buffered = io.BytesIO()
|
||||||
img_file.seek(0)
|
img_file.seek(0)
|
||||||
@@ -76,55 +86,12 @@ def get_image_base64(img_file):
|
|||||||
img.save(buffered, format="PNG")
|
img.save(buffered, format="PNG")
|
||||||
return base64.b64encode(buffered.getvalue()).decode()
|
return base64.b64encode(buffered.getvalue()).decode()
|
||||||
|
|
||||||
def rendering(formula: str, out_img_path: Path) -> bool:
|
|
||||||
build_dir = out_img_path / 'build'
|
|
||||||
build_dir.mkdir(exist_ok=True, parents=True)
|
|
||||||
f = build_dir / 'formula.tex'
|
|
||||||
f.touch(exist_ok=True)
|
|
||||||
f.write_text(tex.format(formula=formula))
|
|
||||||
|
|
||||||
p = subprocess.Popen([
|
|
||||||
'xelatex',
|
|
||||||
f'-output-directory={build_dir}',
|
|
||||||
'-interaction=nonstopmode',
|
|
||||||
'-halt-on-error',
|
|
||||||
f'{f}'
|
|
||||||
])
|
|
||||||
p.communicate()
|
|
||||||
return p.returncode == 0
|
|
||||||
|
|
||||||
def pdf_to_pngbytes(pdf_path):
|
|
||||||
images = convert_from_path(pdf_path, dpi=400,first_page=1, last_page=1)
|
|
||||||
trimmed_images = trim(images[0])
|
|
||||||
png_image_bytes = io.BytesIO()
|
|
||||||
trimmed_images.save(png_image_bytes, format='PNG')
|
|
||||||
png_image_bytes.seek(0)
|
|
||||||
return png_image_bytes
|
|
||||||
|
|
||||||
def trim(im):
|
|
||||||
bg = Image.new(im.mode, im.size, im.getpixel((0,0)))
|
|
||||||
diff = ImageChops.difference(im, bg)
|
|
||||||
diff = ImageChops.add(diff, diff, 2.0, -100)
|
|
||||||
bbox = diff.getbbox()
|
|
||||||
if bbox:
|
|
||||||
return im.crop(bbox)
|
|
||||||
return im
|
|
||||||
|
|
||||||
|
|
||||||
model = get_model()
|
model = get_model()
|
||||||
tokenizer = get_tokenizer()
|
tokenizer = get_tokenizer()
|
||||||
# check if xelatex is installed
|
|
||||||
xelatex_installed = os.system('which xelatex > /dev/null 2>&1') == 0
|
|
||||||
|
|
||||||
if "start" not in st.session_state:
|
if "start" not in st.session_state:
|
||||||
st.session_state["start"] = 1
|
st.session_state["start"] = 1
|
||||||
|
|
||||||
if xelatex_installed:
|
|
||||||
st.toast('Hooray!', icon='🎉')
|
st.toast('Hooray!', icon='🎉')
|
||||||
time.sleep(0.5)
|
|
||||||
st.toast("Xelatex have been detected.", icon='✅')
|
|
||||||
else:
|
|
||||||
st.error('xelatex is not installed. Please install it before using TexTeller.')
|
|
||||||
|
|
||||||
|
|
||||||
# ============================ pages =============================== #
|
# ============================ pages =============================== #
|
||||||
@@ -133,11 +100,6 @@ st.markdown(html_string, unsafe_allow_html=True)
|
|||||||
|
|
||||||
uploaded_file = st.file_uploader("",type=['jpg', 'png', 'pdf'])
|
uploaded_file = st.file_uploader("",type=['jpg', 'png', 'pdf'])
|
||||||
|
|
||||||
if xelatex_installed:
|
|
||||||
st.caption('🥳 Xelatex have been detected, rendered image will be displayed in the web page.')
|
|
||||||
else:
|
|
||||||
st.caption('😭 Xelatex is not detected, please check the resulting latex code by yourself, or check ... to have your xelatex setup ready.')
|
|
||||||
|
|
||||||
if uploaded_file:
|
if uploaded_file:
|
||||||
img = Image.open(uploaded_file)
|
img = Image.open(uploaded_file)
|
||||||
|
|
||||||
@@ -170,60 +132,18 @@ if uploaded_file:
|
|||||||
|
|
||||||
with st.spinner("Predicting..."):
|
with st.spinner("Predicting..."):
|
||||||
uploaded_file.seek(0)
|
uploaded_file.seek(0)
|
||||||
TeXTeller_result = inference(
|
TexTeller_result = inference(
|
||||||
model,
|
model,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
[png_file_path],
|
[png_file_path],
|
||||||
True if os.environ['USE_CUDA'] == 'True' else False,
|
True if os.environ['USE_CUDA'] == 'True' else False,
|
||||||
int(os.environ['NUM_BEAM'])
|
int(os.environ['NUM_BEAM'])
|
||||||
)[0]
|
)[0]
|
||||||
if not xelatex_installed:
|
st.success('Completed!', icon="✅")
|
||||||
st.markdown(fail_gif_html, unsafe_allow_html=True)
|
|
||||||
st.warning('Unable to find xelatex to render image. Please check the prediction results yourself.', icon="🤡")
|
|
||||||
txt = st.text_area(
|
|
||||||
":red[Predicted formula]",
|
|
||||||
TeXTeller_result,
|
|
||||||
height=150,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
is_successed = rendering(TeXTeller_result, Path(temp_dir))
|
|
||||||
if is_successed:
|
|
||||||
# st.code(TeXTeller_result, language='latex')
|
|
||||||
|
|
||||||
img_base64 = get_image_base64(pdf_to_pngbytes(Path(temp_dir) / 'build' / 'formula.pdf'))
|
|
||||||
st.markdown(suc_gif_html, unsafe_allow_html=True)
|
st.markdown(suc_gif_html, unsafe_allow_html=True)
|
||||||
st.success('Successfully rendered!', icon="✅")
|
katex_res = to_katex(TexTeller_result)
|
||||||
txt = st.text_area(
|
st.text_area(":red[Predicted formula]", katex_res, height=150)
|
||||||
":red[Predicted formula]",
|
st.latex(katex_res)
|
||||||
TeXTeller_result,
|
|
||||||
height=150,
|
|
||||||
)
|
|
||||||
# st.latex(TeXTeller_result)
|
|
||||||
st.markdown(f"""
|
|
||||||
<style>
|
|
||||||
.centered-container {{
|
|
||||||
text-align: center;
|
|
||||||
}}
|
|
||||||
.centered-image {{
|
|
||||||
display: block;
|
|
||||||
margin-left: auto;
|
|
||||||
margin-right: auto;
|
|
||||||
max-width: 500px;
|
|
||||||
max-height: 500px;
|
|
||||||
}}
|
|
||||||
</style>
|
|
||||||
<div class="centered-container">
|
|
||||||
<img src="data:image/png;base64,{img_base64}" class="centered-image" alt="Input image">
|
|
||||||
</div>
|
|
||||||
""", unsafe_allow_html=True)
|
|
||||||
else:
|
|
||||||
st.markdown(fail_gif_html, unsafe_allow_html=True)
|
|
||||||
st.error('Rendering failed. You can try using a higher resolution image or splitting the multi line formula into a single line for better results.', icon="❌")
|
|
||||||
txt = st.text_area(
|
|
||||||
":red[Predicted formula]",
|
|
||||||
TeXTeller_result,
|
|
||||||
height=150,
|
|
||||||
)
|
|
||||||
|
|
||||||
shutil.rmtree(temp_dir)
|
shutil.rmtree(temp_dir)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user