fix: optimize title to formula
This commit is contained in:
@@ -34,13 +34,7 @@ def find_consecutive_repeat(s: str, min_unit_len: int = 10, min_repeats: int = 1
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
pattern = re.compile(
|
pattern = re.compile(
|
||||||
r"(.{"
|
r"(.{" + str(min_unit_len) + "," + str(max_unit_len) + r"}?)\1{" + str(min_repeats - 1) + ",}",
|
||||||
+ str(min_unit_len)
|
|
||||||
+ ","
|
|
||||||
+ str(max_unit_len)
|
|
||||||
+ r"}?)\1{"
|
|
||||||
+ str(min_repeats - 1)
|
|
||||||
+ ",}",
|
|
||||||
re.DOTALL,
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
match = pattern.search(s)
|
match = pattern.search(s)
|
||||||
@@ -74,9 +68,7 @@ def clean_repeated_content(
|
|||||||
if count >= line_threshold and (count / total_lines) >= 0.8:
|
if count >= line_threshold and (count / total_lines) >= 0.8:
|
||||||
for i, line in enumerate(lines):
|
for i, line in enumerate(lines):
|
||||||
if line == common:
|
if line == common:
|
||||||
consecutive = sum(
|
consecutive = sum(1 for j in range(i, min(i + 3, len(lines))) if lines[j] == common)
|
||||||
1 for j in range(i, min(i + 3, len(lines))) if lines[j] == common
|
|
||||||
)
|
|
||||||
if consecutive >= 3:
|
if consecutive >= 3:
|
||||||
original_lines = content.split("\n")
|
original_lines = content.split("\n")
|
||||||
non_empty_count = 0
|
non_empty_count = 0
|
||||||
@@ -113,6 +105,11 @@ def clean_formula_number(number_content: str) -> str:
|
|||||||
# GLMResultFormatter
|
# GLMResultFormatter
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Matches content that consists *entirely* of a display-math block and nothing else.
|
||||||
|
# Used to detect when a text/heading region was actually recognised as a formula by vLLM,
|
||||||
|
# so we can correct the label before heading prefixes (## …) are applied.
|
||||||
|
_PURE_DISPLAY_FORMULA_RE = re.compile(
    # Exactly one display-math block ($$…$$ or \[…\]) surrounded only by whitespace.
    # The body is matched one character at a time behind a negative lookahead so it
    # can never run across a closing delimiter; otherwise mixed content such as
    # "$$a$$ some text $$b$$" would be accepted as a single "pure" formula.
    r"^\s*(?:\$\$(?:(?!\$\$)[\s\S])+\$\$|\\\[(?:(?!\\\])[\s\S])+\\\])\s*$"
)
|
||||||
|
|
||||||
# Label → canonical category mapping (mirrors GLM-OCR label_visualization_mapping)
|
# Label → canonical category mapping (mirrors GLM-OCR label_visualization_mapping)
|
||||||
_LABEL_TO_CATEGORY: dict[str, str] = {
|
_LABEL_TO_CATEGORY: dict[str, str] = {
|
||||||
# text
|
# text
|
||||||
@@ -173,6 +170,19 @@ class GLMResultFormatter:
|
|||||||
item["native_label"] = item.get("native_label", item.get("label", "text"))
|
item["native_label"] = item.get("native_label", item.get("label", "text"))
|
||||||
item["label"] = self._map_label(item.get("label", "text"), item["native_label"])
|
item["label"] = self._map_label(item.get("label", "text"), item["native_label"])
|
||||||
|
|
||||||
|
# Label correction: layout may say "text" (or a heading like "paragraph_title")
|
||||||
|
# but vLLM recognised the content as a formula and returned $$…$$. Without
|
||||||
|
# correction the heading prefix (##) would be prepended to the math block,
|
||||||
|
# producing broken output like "## $$ \mathbf{y}=… $$".
|
||||||
|
raw_content = (item.get("content") or "").strip()
|
||||||
|
if item["label"] == "text" and _PURE_DISPLAY_FORMULA_RE.match(raw_content):
|
||||||
|
logger.debug(
|
||||||
|
"Label corrected text (native=%s) → formula: pure display-formula detected",
|
||||||
|
item["native_label"],
|
||||||
|
)
|
||||||
|
item["label"] = "formula"
|
||||||
|
item["native_label"] = "display_formula"
|
||||||
|
|
||||||
item["content"] = self._format_content(
|
item["content"] = self._format_content(
|
||||||
item.get("content") or "",
|
item.get("content") or "",
|
||||||
item["label"],
|
item["label"],
|
||||||
@@ -262,9 +272,7 @@ class GLMResultFormatter:
|
|||||||
content = content[: -len(e)].strip()
|
content = content[: -len(e)].strip()
|
||||||
break
|
break
|
||||||
if not content:
|
if not content:
|
||||||
logger.warning(
|
logger.warning("Skipping formula region with empty content after stripping delimiters")
|
||||||
"Skipping formula region with empty content after stripping delimiters"
|
|
||||||
)
|
|
||||||
return ""
|
return ""
|
||||||
content = "$$\n" + content + "\n$$"
|
content = "$$\n" + content + "\n$$"
|
||||||
|
|
||||||
@@ -314,9 +322,7 @@ class GLMResultFormatter:
|
|||||||
formula_content = items[i + 1].get("content", "")
|
formula_content = items[i + 1].get("content", "")
|
||||||
merged_block = deepcopy(items[i + 1])
|
merged_block = deepcopy(items[i + 1])
|
||||||
if formula_content.endswith("\n$$"):
|
if formula_content.endswith("\n$$"):
|
||||||
merged_block["content"] = (
|
merged_block["content"] = formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
|
||||||
formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
|
|
||||||
)
|
|
||||||
merged.append(merged_block)
|
merged.append(merged_block)
|
||||||
skip.add(i + 1)
|
skip.add(i + 1)
|
||||||
continue # always skip the formula_number block itself
|
continue # always skip the formula_number block itself
|
||||||
@@ -328,9 +334,7 @@ class GLMResultFormatter:
|
|||||||
formula_content = block.get("content", "")
|
formula_content = block.get("content", "")
|
||||||
merged_block = deepcopy(block)
|
merged_block = deepcopy(block)
|
||||||
if formula_content.endswith("\n$$"):
|
if formula_content.endswith("\n$$"):
|
||||||
merged_block["content"] = (
|
merged_block["content"] = formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
|
||||||
formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$"
|
|
||||||
)
|
|
||||||
merged.append(merged_block)
|
merged.append(merged_block)
|
||||||
skip.add(i + 1)
|
skip.add(i + 1)
|
||||||
continue
|
continue
|
||||||
@@ -390,9 +394,7 @@ class GLMResultFormatter:
|
|||||||
block["index"] = i
|
block["index"] = i
|
||||||
return merged
|
return merged
|
||||||
|
|
||||||
def _format_bullet_points(
|
def _format_bullet_points(self, items: list[dict], left_align_threshold: float = 10.0) -> list[dict]:
|
||||||
self, items: list[dict], left_align_threshold: float = 10.0
|
|
||||||
) -> list[dict]:
|
|
||||||
"""Add missing bullet prefix when a text block is sandwiched between two bullet items."""
|
"""Add missing bullet prefix when a text block is sandwiched between two bullet items."""
|
||||||
if len(items) < 3:
|
if len(items) < 3:
|
||||||
return items
|
return items
|
||||||
@@ -422,10 +424,7 @@ class GLMResultFormatter:
|
|||||||
if not (cur_bbox and prev_bbox and nxt_bbox):
|
if not (cur_bbox and prev_bbox and nxt_bbox):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if (
|
if abs(cur_bbox[0] - prev_bbox[0]) <= left_align_threshold and abs(cur_bbox[0] - nxt_bbox[0]) <= left_align_threshold:
|
||||||
abs(cur_bbox[0] - prev_bbox[0]) <= left_align_threshold
|
|
||||||
and abs(cur_bbox[0] - nxt_bbox[0]) <= left_align_threshold
|
|
||||||
):
|
|
||||||
cur["content"] = "- " + cur_content
|
cur["content"] = "- " + cur_content
|
||||||
|
|
||||||
return items
|
return items
|
||||||
|
|||||||
Reference in New Issue
Block a user