- Drop image padding in GLMOCREndToEndService.recognize(); use the raw image directly
- Fix F821: undefined `padded` references replaced with `image`
- Fix F601: duplicate dict key "≠" in converter
- Fix F841: unused `image_cls_ids` variable in layout_postprocess
- Fix E702: semicolon-separated statements in layout_postprocess
- Fix UP031: percent-format replaced with f-string in logging_config
- Auto-fix 44 additional ruff violations (import order, UP035/UP045/UP006, F401, F541)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
210 lines
6.7 KiB
Python
210 lines
6.7 KiB
Python
from app.services.glm_postprocess import (
|
||
GLMResultFormatter,
|
||
clean_formula_number,
|
||
clean_repeated_content,
|
||
find_consecutive_repeat,
|
||
)
|
||
|
||
|
||
def test_find_consecutive_repeat_truncates_when_threshold_met():
    """Ten back-to-back copies of a unit plus a tail should yield the bare unit."""
    unit = "abcdefghij"
    text = unit * 10 + "tail"

    result = find_consecutive_repeat(text)

    assert result == unit
|
||
|
||
|
||
def test_find_consecutive_repeat_returns_none_when_below_threshold():
    """Nine back-to-back copies of the unit are expected to produce None."""
    nine_copies = "abcdefghij" * 9

    result = find_consecutive_repeat(nine_copies)

    assert result is None
|
||
|
||
|
||
def test_clean_repeated_content_handles_consecutive_and_line_level_repeats():
    """Check consecutive-repeat, line-repeat and pass-through inputs in turn."""
    # A unit repeated ten times followed by a tail collapses to the unit.
    consecutive = "abcdefghij" * 10 + "tail"
    assert clean_repeated_content(consecutive) == "abcdefghij"

    # Ten identical lines followed by one different line.
    lines = ["same line"] * 10
    lines.append("other")
    line_level = "\n".join(lines)
    assert clean_repeated_content(line_level, line_threshold=10) == "same line\n"

    # Text without repetition is expected back unchanged.
    plain = "normal text"
    assert clean_repeated_content(plain) == plain
|
||
|
||
|
||
def test_clean_formula_number_strips_wrapping_parentheses():
    """Parenthesised numbers lose their wrapper; a bare number is kept as is."""
    cases = (
        ("(1)", "1"),
        ("(2.1)", "2.1"),
        ("3", "3"),
    )

    for raw, expected in cases:
        assert clean_formula_number(raw) == expected
|
||
|
||
|
||
def test_clean_content_removes_literal_tabs_and_long_repeat_noise():
    """Feed `_clean_content` literal backslash-t markers, dots and a long repeat.

    The input is built from raw-string backslash-t sequences (not real tab
    characters), five middle dots, and a ten-character unit repeated 205 times.
    """
    literal_tab = r"\t"
    pieces = [
        r"\t\t",
        "·" * 5,
        "abcdefghij" * 205,
        literal_tab,
    ]

    cleaned = GLMResultFormatter()._clean_content("".join(pieces))

    assert cleaned.startswith("···")
    assert cleaned.endswith("abcdefghij")
    assert literal_tab not in cleaned
|
||
|
||
|
||
def test_format_content_handles_titles_formula_text_and_newlines():
    """Table-driven check of `_format_content` for four label combinations."""
    fmt = GLMResultFormatter()

    # (positional arguments passed to _format_content, expected result)
    cases = [
        (("Intro", "text", "doc_title"), "# Intro"),
        (("- Section", "text", "paragraph_title"), "## Section"),
        ((r"\[x+y\]", "formula", "display_formula"), "$$\nx+y\n$$"),
        (("· item\nnext", "text", "text"), "- item\n\nnext"),
    ]

    for args, expected in cases:
        assert fmt._format_content(*args) == expected
|
||
|
||
|
||
def test_merge_formula_numbers_merges_before_and_after_formula():
    """Cover a number before a formula, after a formula, and on its own.

    Expected results per the assertions below: the number is folded into the
    formula as a tag with index 0 in both orderings, and a lone formula number
    produces an empty list.
    """

    def number_block(index, content):
        # Fresh dict per call so no block object is shared between invocations.
        return {
            "index": index,
            "label": "text",
            "native_label": "formula_number",
            "content": content,
        }

    def formula_block(index, content):
        return {
            "index": index,
            "label": "formula",
            "native_label": "display_formula",
            "content": content,
        }

    fmt = GLMResultFormatter()

    number_first = fmt._merge_formula_numbers(
        [number_block(0, "(1)"), formula_block(1, "$$\nx+y\n$$")]
    )
    number_last = fmt._merge_formula_numbers(
        [formula_block(0, "$$\nx+y\n$$"), number_block(1, "(2)")]
    )
    number_alone = fmt._merge_formula_numbers([number_block(0, "(3)")])

    assert number_first == [formula_block(0, "$$\nx+y \\tag{1}\n$$")]
    assert number_last == [formula_block(0, "$$\nx+y \\tag{2}\n$$")]
    assert number_alone == []
|
||
|
||
|
||
def test_merge_text_blocks_joins_hyphenated_words_when_wordfreq_accepts(monkeypatch):
    """With `zipf_frequency` patched to 3.0, "inter-" + "national" should join."""

    def fake_zipf(word, lang):
        return 3.0

    def text_block(index, content):
        return {"index": index, "label": "text", "native_label": "text", "content": content}

    fmt = GLMResultFormatter()
    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", fake_zipf)

    blocks = [text_block(0, "inter-"), text_block(1, "national")]
    result = fmt._merge_text_blocks(blocks)

    assert result == [text_block(0, "international")]
|
||
]
|
||
|
||
|
||
def test_merge_text_blocks_skips_invalid_merge(monkeypatch):
    """With `zipf_frequency` patched to 1.0 and a capitalised second fragment,
    both blocks are expected back unmerged."""

    def fake_zipf(word, lang):
        return 1.0

    def text_block(index, content):
        return {"index": index, "label": "text", "native_label": "text", "content": content}

    fmt = GLMResultFormatter()
    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", fake_zipf)

    result = fmt._merge_text_blocks([text_block(0, "inter-"), text_block(1, "National")])

    # Expected value is built from fresh dicts rather than reusing the inputs.
    assert result == [text_block(0, "inter-"), text_block(1, "National")]
|
||
]
|
||
|
||
|
||
def test_format_bullet_points_infers_missing_middle_bullet():
    """An unbulleted item between two bulleted ones should gain a "- " prefix."""
    # (content, bbox_2d) for three consecutive text items.
    rows = (
        ("- first", [10, 0, 50, 10]),
        ("second", [12, 12, 52, 22]),
        ("- third", [11, 24, 51, 34]),
    )
    items = [
        {"native_label": "text", "content": content, "bbox_2d": bbox}
        for content, bbox in rows
    ]

    result = GLMResultFormatter()._format_bullet_points(items)

    middle = result[1]
    assert middle["content"] == "- second"
|
||
|
||
|
||
def test_format_bullet_points_skips_when_bbox_missing():
    """A middle item with an empty `bbox_2d` should keep its content unchanged."""
    # (content, bbox_2d); the middle item deliberately has an empty bbox.
    rows = (
        ("- first", [10, 0, 50, 10]),
        ("second", []),
        ("- third", [11, 24, 51, 34]),
    )
    items = [
        {"native_label": "text", "content": content, "bbox_2d": bbox}
        for content, bbox in rows
    ]

    result = GLMResultFormatter()._format_bullet_points(items)

    middle = result[1]
    assert middle["content"] == "second"
|
||
|
||
|
||
def test_process_runs_full_pipeline_and_skips_empty_content():
    """Run `process` over five regions and check fragments of its output.

    The regions are: a document title, a formula number, a display formula,
    a figure, and a text region with empty content.
    """
    # (label, native_label, content, bbox_2d); "index" is the list position.
    specs = [
        ("text", "doc_title", "Doc Title", [0, 0, 100, 30]),
        ("text", "formula_number", "(1)", [80, 50, 100, 60]),
        ("formula", "display_formula", "x+y", [0, 40, 100, 80]),
        ("figure", "image", "figure placeholder", [0, 80, 100, 120]),
        ("text", "text", "", [0, 120, 100, 150]),
    ]
    regions = [
        {
            "index": position,
            "label": label,
            "native_label": native_label,
            "content": content,
            "bbox_2d": bbox,
        }
        for position, (label, native_label, content, bbox) in enumerate(specs)
    ]

    output = GLMResultFormatter().process(regions)

    assert "# Doc Title" in output
    assert "$$\nx+y \\tag{1}\n$$" in output
    # NOTE(review): if `output` is a str, an empty-string membership check is
    # always true and verifies nothing — confirm the intended expected fragment.
    assert "" in output
|