diff --git a/app/services/glm_postprocess.py b/app/services/glm_postprocess.py index a893d04..80d0d7e 100644 --- a/app/services/glm_postprocess.py +++ b/app/services/glm_postprocess.py @@ -34,13 +34,7 @@ def find_consecutive_repeat(s: str, min_unit_len: int = 10, min_repeats: int = 1 return None pattern = re.compile( - r"(.{" - + str(min_unit_len) - + "," - + str(max_unit_len) - + r"}?)\1{" - + str(min_repeats - 1) - + ",}", + r"(.{" + str(min_unit_len) + "," + str(max_unit_len) + r"}?)\1{" + str(min_repeats - 1) + ",}", re.DOTALL, ) match = pattern.search(s) @@ -74,9 +68,7 @@ def clean_repeated_content( if count >= line_threshold and (count / total_lines) >= 0.8: for i, line in enumerate(lines): if line == common: - consecutive = sum( - 1 for j in range(i, min(i + 3, len(lines))) if lines[j] == common - ) + consecutive = sum(1 for j in range(i, min(i + 3, len(lines))) if lines[j] == common) if consecutive >= 3: original_lines = content.split("\n") non_empty_count = 0 @@ -113,6 +105,11 @@ def clean_formula_number(number_content: str) -> str: # GLMResultFormatter # --------------------------------------------------------------------------- +# Matches content that consists *entirely* of a display-math block and nothing else. +# Used to detect when a text/heading region was actually recognised as a formula by vLLM, +# so we can correct the label before heading prefixes (## …) are applied. +_PURE_DISPLAY_FORMULA_RE = re.compile(r"^\s*(?:\$\$[\s\S]+?\$\$|\\\[[\s\S]+?\\\])\s*$") + # Label → canonical category mapping (mirrors GLM-OCR label_visualization_mapping) _LABEL_TO_CATEGORY: dict[str, str] = { # text @@ -173,6 +170,19 @@ class GLMResultFormatter: item["native_label"] = item.get("native_label", item.get("label", "text")) item["label"] = self._map_label(item.get("label", "text"), item["native_label"]) + # Label correction: layout may say "text" (or a heading like "paragraph_title") + # but vLLM recognised the content as a formula and returned $$…$$. Without + # correction the heading prefix (##) would be prepended to the math block, + # producing broken output like "## $$ \mathbf{y}=… $$". + raw_content = (item.get("content") or "").strip() + if item["label"] == "text" and _PURE_DISPLAY_FORMULA_RE.match(raw_content): + logger.debug( + "Label corrected text (native=%s) → formula: pure display-formula detected", + item["native_label"], + ) + item["label"] = "formula" + item["native_label"] = "display_formula" + item["content"] = self._format_content( item.get("content") or "", item["label"], @@ -262,9 +272,7 @@ class GLMResultFormatter: content = content[: -len(e)].strip() break if not content: - logger.warning( - "Skipping formula region with empty content after stripping delimiters" - ) + logger.warning("Skipping formula region with empty content after stripping delimiters") return "" content = "$$\n" + content + "\n$$" @@ -314,9 +322,7 @@ class GLMResultFormatter: formula_content = items[i + 1].get("content", "") merged_block = deepcopy(items[i + 1]) if formula_content.endswith("\n$$"): - merged_block["content"] = ( - formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$" - ) + merged_block["content"] = formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$" merged.append(merged_block) skip.add(i + 1) continue # always skip the formula_number block itself @@ -328,9 +334,7 @@ class GLMResultFormatter: formula_content = block.get("content", "") merged_block = deepcopy(block) if formula_content.endswith("\n$$"): - merged_block["content"] = ( - formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$" - ) + merged_block["content"] = formula_content[:-3] + f" \\tag{{{num_clean}}}\n$$" merged.append(merged_block) skip.add(i + 1) continue @@ -390,9 +394,7 @@ class GLMResultFormatter: block["index"] = i return merged - def _format_bullet_points( - self, items: list[dict], left_align_threshold: float = 10.0 - ) -> list[dict]: + def _format_bullet_points(self, items: list[dict], left_align_threshold: float = 10.0) -> list[dict]: """Add missing bullet prefix when a text block is sandwiched between two bullet items.""" if len(items) < 3: return items @@ -422,10 +424,7 @@ class GLMResultFormatter: if not (cur_bbox and prev_bbox and nxt_bbox): continue - if ( - abs(cur_bbox[0] - prev_bbox[0]) <= left_align_threshold - and abs(cur_bbox[0] - nxt_bbox[0]) <= left_align_threshold - ): + if abs(cur_bbox[0] - prev_bbox[0]) <= left_align_threshold and abs(cur_bbox[0] - nxt_bbox[0]) <= left_align_threshold: cur["content"] = "- " + cur_content return items