diff --git a/src/models/ocr_model/utils/to_katex.py b/src/models/ocr_model/utils/to_katex.py index e7295bf..b6166dc 100644 --- a/src/models/ocr_model/utils/to_katex.py +++ b/src/models/ocr_model/utils/to_katex.py @@ -54,16 +54,58 @@ def change(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, ne result += input_str[i:start] i = start - if old_inst != new_inst and old_inst in result: + if old_inst != new_inst and (old_inst + old_surr_l) in result: return change(result, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r) else: return result +def find_substring_positions(string, substring): + positions = [match.start() for match in re.finditer(re.escape(substring), string)] + return positions + + +def rm_dollar_surr(content): + pattern = re.compile(r'\\[a-zA-Z]+\$.*?\$|\$.*?\$') + matches = pattern.findall(content) + + for match in matches: + if not re.match(r'\\[a-zA-Z]+', match): + new_match = match.strip('$') + content = content.replace(match, ' ' + new_match + ' ') + + return content + + +def change_all(input_str, old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r): + pos = find_substring_positions(input_str, old_inst + old_surr_l) + res = list(input_str) + for p in pos[::-1]: + res[p:] = list(change(''.join(res[p:]), old_inst, new_inst, old_surr_l, old_surr_r, new_surr_l, new_surr_r)) + res = ''.join(res) + return res + + def to_katex(formula: str) -> str: res = formula - res = change(res, r'\mbox ', r'', r'{', r'}', r'', r'') - res = change(res, r'\mbox', r'', r'{', r'}', r'', r'') + # remove mbox surrounding + res = change_all(res, r'\mbox ', r' ', r'{', r'}', r'', r'') + res = change_all(res, r'\mbox', r' ', r'{', r'}', r'', r'') + # remove hbox surrounding + res = re.sub(r'\\hbox to ?-? ?\d+\.\d+(pt)?\{', r'\\hbox{', res) + res = change_all(res, r'\hbox', r' ', r'{', r'}', r'', r' ') + # remove raise surrounding + res = re.sub(r'\\raise ?-? ?\d+\.\d+(pt)?', r' ', res) + # remove makebox + res = re.sub(r'\\makebox ?\[\d+\.\d+(pt)?\]\{', r'\\makebox{', res) + res = change_all(res, r'\makebox', r' ', r'{', r'}', r'', r' ') + # remove vbox surrounding, scalebox surrounding + res = re.sub(r'\\raisebox\{-? ?\d+\.\d+(pt)?\}\{', r'\\raisebox{', res) + res = re.sub(r'\\scalebox\{-? ?\d+\.\d+(pt)?\}\{', r'\\scalebox{', res) + res = change_all(res, r'\scalebox', r' ', r'{', r'}', r'', r' ') + res = change_all(res, r'\raisebox', r' ', r'{', r'}', r'', r' ') + res = change_all(res, r'\vbox', r' ', r'{', r'}', r'', r' ') + origin_instructions = [ r'\Huge', @@ -77,10 +119,14 @@ def to_katex(formula: str) -> str: r'\tiny' ] for (old_ins, new_ins) in zip(origin_instructions, origin_instructions): - res = change(res, old_ins, new_ins, r'$', r'$', '{', '}') - res = change(res, r'\boldmath ', r'\bm', r'$', r'$', r'{', r'}') - res = change(res, r'\boldmath', r'\bm', r'$', r'$', r'{', r'}') - res = change(res, r'\scriptsize', r'\scriptsize', r'$', r'$', r'{', r'}') + res = change_all(res, old_ins, new_ins, r'$', r'$', '{', '}') + res = change_all(res, r'\boldmath ', r'\bm', r'{', r'}', r'{', r'}') + res = change_all(res, r'\boldmath', r'\bm', r'{', r'}', r'{', r'}') + res = change_all(res, r'\boldmath ', r'\bm', r'$', r'$', r'{', r'}') + res = change_all(res, r'\boldmath', r'\bm', r'$', r'$', r'{', r'}') + res = change_all(res, r'\scriptsize', r'\scriptsize', r'$', r'$', r'{', r'}') + res = change_all(res, r'\emph', r'\textit', r'{', r'}', r'{', r'}') + res = change_all(res, r'\emph ', r'\textit', r'{', r'}', r'{', r'}') origin_instructions = [ r'\left', @@ -104,10 +150,31 @@ def to_katex(formula: str) -> str: r'\Biggr' ] for origin_ins in origin_instructions: - res = change(res, origin_ins, origin_ins, r'{', r'}', r'', r'') + res = change_all(res, origin_ins, origin_ins, r'{', r'}', r'', r'') res = re.sub(r'\\\[(.*?)\\\]', r'\1\\newline', res) if res.endswith(r'\newline'): res = res[:-8] - return res + + # remove multiple spaces + res = re.sub(r'(\\,){1,}', ' ', res) + res = re.sub(r'(\\!){1,}', ' ', res) + res = re.sub(r'(\\;){1,}', ' ', res) + res = re.sub(r'(\\:){1,}', ' ', res) + res = re.sub(r'\\vspace\{.*?}', '', res) + + # merge consecutive text + def merge_texts(match): + texts = match.group(0) + merged_content = ''.join(re.findall(r'\\text\{([^}]*)\}', texts)) + return f'\\text{{{merged_content}}}' + res = re.sub(r'(\\text\{[^}]*\}\s*){2,}', merge_texts, res) + + res = res.replace(r'\bf ', '') + res = rm_dollar_surr(res) + + # remove extra spaces (keeping only one) + res = re.sub(r' +', ' ', res) + + return res.strip()