Files
doc_processer/tests/services/test_glm_postprocess.py
liuyuanchuang 30d2c2f45b fix: remove padding from GLMOCREndToEndService and clean up ruff violations
- Drop image padding in GLMOCREndToEndService.recognize(); use raw image directly
- Fix F821 undefined `padded` references replaced with `image`
- Fix F601 duplicate dict key "≠" in converter
- Fix F841 unused `image_cls_ids` variable in layout_postprocess
- Fix E702 semicolon-separated statements in layout_postprocess
- Fix UP031 percent-format replaced with f-string in logging_config
- Auto-fix 44 additional ruff violations (import order, UP035/UP045/UP006, F401, F541)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 19:52:22 +08:00

210 lines
6.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from app.services.glm_postprocess import (
GLMResultFormatter,
clean_formula_number,
clean_repeated_content,
find_consecutive_repeat,
)
def test_find_consecutive_repeat_truncates_when_threshold_met():
    """Ten back-to-back copies of a unit, followed by a tail, yield the unit."""
    unit = "abcdefghij"
    text = unit * 10 + "tail"

    assert find_consecutive_repeat(text) == unit
def test_find_consecutive_repeat_returns_none_when_below_threshold():
    """Nine copies of the same unit are not reported as a repeat."""
    nine_copies = "abcdefghij" * 9

    result = find_consecutive_repeat(nine_copies)

    assert result is None
def test_clean_repeated_content_handles_consecutive_and_line_level_repeats():
    """Cover the three paths: inline repeat, repeated lines, and untouched text."""
    # Inline repetition collapses to a single copy of the repeated unit.
    unit = "abcdefghij"
    assert clean_repeated_content(unit * 10 + "tail") == unit

    # Ten identical lines followed by a different one, with line_threshold=10.
    lines = ["same line"] * 10
    lines.append("other")
    assert clean_repeated_content("\n".join(lines), line_threshold=10) == "same line\n"

    # Text without repetition is returned as-is.
    plain = "normal text"
    assert clean_repeated_content(plain) == plain
def test_clean_formula_number_strips_wrapping_parentheses():
    """Wrapping parentheses are removed; bare numbers are left unchanged."""
    cases = (
        ("(1)", "1"),
        ("2.1", "2.1"),
        ("3", "3"),
    )

    for raw, expected in cases:
        assert clean_formula_number(raw) == expected
def test_clean_content_removes_literal_tabs_and_long_repeat_noise():
    """Check _clean_content on input mixing literal ``\\t`` text, dots, and a long repeat.

    The raw strings below are backslash-t character pairs, not tab characters.
    """
    leading = r"\t\t"
    dots = "·" * 5
    repeated = "abcdefghij" * 205
    trailing = r"\t"
    noisy = leading + dots + repeated + trailing

    cleaned = GLMResultFormatter()._clean_content(noisy)

    # Result begins with three dots, ends on one copy of the unit, and
    # carries no literal backslash-t sequence.
    assert cleaned.startswith("···")
    assert cleaned.endswith("abcdefghij")
    assert r"\t" not in cleaned
def test_format_content_handles_titles_formula_text_and_newlines():
    """Table-driven check of _format_content for titles, a formula, and bullet text."""
    formatter = GLMResultFormatter()

    # (content, label, native_label, expected markdown)
    cases = [
        ("Intro", "text", "doc_title", "# Intro"),
        ("- Section", "text", "paragraph_title", "## Section"),
        (r"\[x+y\]", "formula", "display_formula", "$$\nx+y\n$$"),
        ("· item\nnext", "text", "text", "- item\n\nnext"),
    ]

    for content, label, native_label, expected in cases:
        assert formatter._format_content(content, label, native_label) == expected
def test_merge_formula_numbers_merges_before_and_after_formula():
    """A formula_number block is folded into an adjacent formula as ``\\tag{n}``.

    Three inputs are exercised: number before the formula, number after the
    formula, and a number with no formula at all (expected to produce ``[]``).
    """

    def number_block(index, content):
        # Fresh dict per call so no input is shared between invocations.
        return {
            "index": index,
            "label": "text",
            "native_label": "formula_number",
            "content": content,
        }

    def formula_block(index, content="$$\nx+y\n$$"):
        return {
            "index": index,
            "label": "formula",
            "native_label": "display_formula",
            "content": content,
        }

    formatter = GLMResultFormatter()

    before = formatter._merge_formula_numbers([number_block(0, "(1)"), formula_block(1)])
    after = formatter._merge_formula_numbers([formula_block(0), number_block(1, "(2)")])
    untouched = formatter._merge_formula_numbers([number_block(0, "(3)")])

    assert before == [formula_block(0, "$$\nx+y \\tag{1}\n$$")]
    assert after == [formula_block(0, "$$\nx+y \\tag{2}\n$$")]
    assert untouched == []
def test_merge_text_blocks_joins_hyphenated_words_when_wordfreq_accepts(monkeypatch):
    """With zipf_frequency stubbed to 3.0, "inter-" + "national" merge into one block."""

    def stub_zipf_frequency(word, lang):
        # Same parameter names as the original stub, in case of keyword calls.
        return 3.0

    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", stub_zipf_frequency)

    blocks = [
        {"index": 0, "label": "text", "native_label": "text", "content": "inter-"},
        {"index": 1, "label": "text", "native_label": "text", "content": "national"},
    ]
    expected = [
        {"index": 0, "label": "text", "native_label": "text", "content": "international"},
    ]

    merged = GLMResultFormatter()._merge_text_blocks(blocks)

    assert merged == expected
def test_merge_text_blocks_skips_invalid_merge(monkeypatch):
    """With zipf_frequency stubbed to 1.0, "inter-" and "National" stay separate."""

    def stub_zipf_frequency(word, lang):
        # Same parameter names as the original stub, in case of keyword calls.
        return 1.0

    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", stub_zipf_frequency)

    first = {"index": 0, "label": "text", "native_label": "text", "content": "inter-"}
    second = {"index": 1, "label": "text", "native_label": "text", "content": "National"}

    merged = GLMResultFormatter()._merge_text_blocks([first, second])

    # Compare against freshly written literals rather than the input dicts.
    assert merged == [
        {"index": 0, "label": "text", "native_label": "text", "content": "inter-"},
        {"index": 1, "label": "text", "native_label": "text", "content": "National"},
    ]
def test_format_bullet_points_infers_missing_middle_bullet():
    """An un-bulleted item between two bulleted ones gains a "- " prefix.

    All three items carry a four-value bbox_2d; their first coordinates are
    10, 12 and 11.
    """
    first = {"native_label": "text", "content": "- first", "bbox_2d": [10, 0, 50, 10]}
    middle = {"native_label": "text", "content": "second", "bbox_2d": [12, 12, 52, 22]}
    last = {"native_label": "text", "content": "- third", "bbox_2d": [11, 24, 51, 34]}

    formatted = GLMResultFormatter()._format_bullet_points([first, middle, last])

    assert formatted[1]["content"] == "- second"
def test_format_bullet_points_skips_when_bbox_missing():
    """The middle item keeps its content when its bbox_2d is an empty list."""
    first = {"native_label": "text", "content": "- first", "bbox_2d": [10, 0, 50, 10]}
    middle = {"native_label": "text", "content": "second", "bbox_2d": []}
    last = {"native_label": "text", "content": "- third", "bbox_2d": [11, 24, 51, 34]}

    formatted = GLMResultFormatter()._format_bullet_points([first, middle, last])

    assert formatted[1]["content"] == "second"
def test_process_runs_full_pipeline_and_skips_empty_content():
    """Run process() over five regions and check three fragments of the output.

    The input holds a doc title, a formula number, a display formula, a
    figure, and a text region with empty content. The assertions cover the
    title, the tagged formula and the figure placeholder. The empty region is
    part of the input but is not asserted on directly.
    """

    def make_region(index, label, native_label, content, bbox_2d):
        return {
            "index": index,
            "label": label,
            "native_label": native_label,
            "content": content,
            "bbox_2d": bbox_2d,
        }

    regions = [
        make_region(0, "text", "doc_title", "Doc Title", [0, 0, 100, 30]),
        make_region(1, "text", "formula_number", "(1)", [80, 50, 100, 60]),
        make_region(2, "formula", "display_formula", "x+y", [0, 40, 100, 80]),
        make_region(3, "figure", "image", "figure placeholder", [0, 80, 100, 120]),
        make_region(4, "text", "text", "", [0, 120, 100, 150]),
    ]

    output = GLMResultFormatter().process(regions)

    expected_fragments = (
        "# Doc Title",
        "$$\nx+y \\tag{1}\n$$",
        "![](bbox=[0, 80, 100, 120])",
    )
    for fragment in expected_fragments:
        assert fragment in output