feat add glm-ocr core

2026-03-09 16:51:06 +08:00
parent d74130914c
commit 6dfaf9668b
17 changed files with 1687 additions and 140 deletions
--- a/tests/services/test_glm_postprocess.py
+++ b/tests/services/test_glm_postprocess.py
@@ -0,0 +1,199 @@
+from app.services.glm_postprocess import (
+    GLMResultFormatter,
+    clean_formula_number,
+    clean_repeated_content,
+    find_consecutive_repeat,
+)
+
+
+def test_find_consecutive_repeat_truncates_when_threshold_met():
+    """Ten back-to-back copies of a unit (plus a tail) yield the single unit."""
+    unit = "abcdefghij"
+    text = unit * 10 + "tail"
+
+    result = find_consecutive_repeat(text)
+
+    assert result == "abcdefghij"
+
+
+def test_find_consecutive_repeat_returns_none_when_below_threshold():
+    """Nine copies of the unit are not reported as a repeat."""
+    nine_copies = "abcdefghij" * 9
+
+    assert find_consecutive_repeat(nine_copies) is None
+
+
+def test_clean_repeated_content_handles_consecutive_and_line_level_repeats():
+    """Cover the consecutive-repeat, repeated-line and pass-through cases."""
+    # Consecutive repeat: ten copies plus a tail come back as one copy.
+    consecutive = "abcdefghij" * 10 + "tail"
+    assert clean_repeated_content(consecutive) == "abcdefghij"
+
+    # Line-level repeat: ten identical lines followed by a different one.
+    lines = ["same line"] * 10
+    lines.append("other")
+    line_repeated = "\n".join(lines)
+    assert clean_repeated_content(line_repeated, line_threshold=10) == "same line\n"
+
+    # Text without any repetition is returned as-is.
+    assert clean_repeated_content("normal text") == "normal text"
+
+
+def test_clean_formula_number_strips_wrapping_parentheses():
+    """ASCII and full-width parentheses are removed; a bare number is kept."""
+    # raw formula number -> expected cleaned value
+    expectations = [
+        ("(1)", "1"),
+        ("（2.1）", "2.1"),
+        ("3", "3"),
+    ]
+
+    for raw, expected in expectations:
+        assert clean_formula_number(raw) == expected
+
+
+def test_clean_content_removes_literal_tabs_and_long_repeat_noise():
+    """Literal backslash-t markers and a long repeated run are cleaned away."""
+    formatter = GLMResultFormatter()
+    # Raw strings: each marker is the two characters backslash + "t", not a tab.
+    leading_markers = r"\t\t"
+    trailing_marker = r"\t"
+    dot_run = "·" * 5
+    repeated_run = "abcdefghij" * 205
+    noisy = leading_markers + dot_run + repeated_run + trailing_marker
+
+    cleaned = formatter._clean_content(noisy)
+
+    # The dot run is expected to survive as three dots at the start.
+    assert cleaned.startswith("···")
+    assert cleaned.endswith("abcdefghij")
+    assert r"\t" not in cleaned
+
+
+def test_format_content_handles_titles_formula_text_and_newlines():
+    """Check the text produced for titles, a display formula and plain text."""
+    formatter = GLMResultFormatter()
+
+    # (positional arguments, expected formatted output)
+    cases = [
+        (("Intro", "text", "doc_title"), "# Intro"),
+        (("- Section", "text", "paragraph_title"), "## Section"),
+        ((r"\[x+y\]", "formula", "display_formula"), "$$\nx+y\n$$"),
+        (("· item\nnext", "text", "text"), "- item\n\nnext"),
+    ]
+
+    for args, expected in cases:
+        assert formatter._format_content(*args) == expected
+
+
+def test_merge_formula_numbers_merges_before_and_after_formula():
+    """A number adjacent to a display formula is folded into it as a tag.
+
+    Covers the number appearing before the formula, after the formula, and
+    on its own (where the expected result is an empty list).
+    """
+    formatter = GLMResultFormatter()
+
+    def number_block(index, content):
+        # Fresh dict on every call so no input is shared between scenarios.
+        return {
+            "index": index,
+            "label": "text",
+            "native_label": "formula_number",
+            "content": content,
+        }
+
+    def formula_block(index, content="$$\nx+y\n$$"):
+        return {
+            "index": index,
+            "label": "formula",
+            "native_label": "display_formula",
+            "content": content,
+        }
+
+    before = formatter._merge_formula_numbers([number_block(0, "(1)"), formula_block(1)])
+    after = formatter._merge_formula_numbers([formula_block(0), number_block(1, "(2)")])
+    untouched = formatter._merge_formula_numbers([number_block(0, "(3)")])
+
+    # In both merged cases a single formula block with index 0 is expected.
+    assert before == [formula_block(0, "$$\nx+y \\tag{1}\n$$")]
+    assert after == [formula_block(0, "$$\nx+y \\tag{2}\n$$")]
+    assert untouched == []
+
+
+def test_merge_text_blocks_joins_hyphenated_words_when_wordfreq_accepts(monkeypatch):
+    """A block ending in a hyphen is joined with the next one into one word.
+
+    The word-frequency lookup is stubbed to return 3.0 for every word.
+    """
+    formatter = GLMResultFormatter()
+
+    def fake_zipf_frequency(word, lang):
+        return 3.0
+
+    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
+    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", fake_zipf_frequency)
+
+    first_half = {"index": 0, "label": "text", "native_label": "text", "content": "inter-"}
+    second_half = {"index": 1, "label": "text", "native_label": "text", "content": "national"}
+
+    merged = formatter._merge_text_blocks([first_half, second_half])
+
+    expected = {"index": 0, "label": "text", "native_label": "text", "content": "international"}
+    assert merged == [expected]
+
+
+def test_merge_text_blocks_skips_invalid_merge(monkeypatch):
+    """Both blocks come back unchanged when the merge is not accepted.
+
+    NOTE(review): this input differs from the accepting test in two ways (the
+    stubbed frequency is 1.0 and the second block starts with a capital
+    letter), so the test does not show which of the two blocks the merge.
+    """
+    formatter = GLMResultFormatter()
+
+    def fake_zipf_frequency(word, lang):
+        return 1.0
+
+    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
+    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", fake_zipf_frequency)
+
+    def make_blocks():
+        # Built fresh for input and expectation so they never share objects.
+        return [
+            {"index": 0, "label": "text", "native_label": "text", "content": "inter-"},
+            {"index": 1, "label": "text", "native_label": "text", "content": "National"},
+        ]
+
+    merged = formatter._merge_text_blocks(make_blocks())
+
+    assert merged == make_blocks()
+
+
+def test_format_bullet_points_infers_missing_middle_bullet():
+    """An unmarked item between two "- " items gets a "- " prefix.
+
+    All three items carry a bounding box; their left edges are 10, 12 and 11.
+    """
+    formatter = GLMResultFormatter()
+    # (content, bbox_2d) for three consecutive text items
+    rows = [
+        ("- first", [10, 0, 50, 10]),
+        ("second", [12, 12, 52, 22]),
+        ("- third", [11, 24, 51, 34]),
+    ]
+    items = [
+        {"native_label": "text", "content": content, "bbox_2d": bbox}
+        for content, bbox in rows
+    ]
+
+    formatted = formatter._format_bullet_points(items)
+
+    middle = formatted[1]
+    assert middle["content"] == "- second"
+
+
+def test_format_bullet_points_skips_when_bbox_missing():
+    """The middle item keeps its content when its bounding box is empty."""
+    formatter = GLMResultFormatter()
+    # (content, bbox_2d); the middle item has an empty bounding box
+    rows = [
+        ("- first", [10, 0, 50, 10]),
+        ("second", []),
+        ("- third", [11, 24, 51, 34]),
+    ]
+    items = [
+        {"native_label": "text", "content": content, "bbox_2d": bbox}
+        for content, bbox in rows
+    ]
+
+    formatted = formatter._format_bullet_points(items)
+
+    middle = formatted[1]
+    assert middle["content"] == "second"
+
+
+def test_process_runs_full_pipeline_and_skips_empty_content():
+    """Run ``process`` on a small mixed page and check the rendered fragments.
+
+    The input holds a title, a formula number, a display formula, an image
+    and a text region with empty content.
+
+    NOTE(review): the empty region is part of the input, but no assertion
+    below checks that it is skipped - consider adding one.
+    """
+    formatter = GLMResultFormatter()
+    # One tuple per region: (label, native_label, content, bbox_2d).
+    # Each region's "index" is its position in this list.
+    specs = [
+        ("text", "doc_title", "Doc Title", [0, 0, 100, 30]),
+        ("text", "formula_number", "(1)", [80, 50, 100, 60]),
+        ("formula", "display_formula", "x+y", [0, 40, 100, 80]),
+        ("figure", "image", "figure placeholder", [0, 80, 100, 120]),
+        ("text", "text", "", [0, 120, 100, 150]),
+    ]
+    regions = [
+        {
+            "index": index,
+            "label": label,
+            "native_label": native_label,
+            "content": content,
+            "bbox_2d": bbox,
+        }
+        for index, (label, native_label, content, bbox) in enumerate(specs)
+    ]
+
+    output = formatter.process(regions)
+
+    expected_fragments = (
+        "# Doc Title",
+        "$$\nx+y \\tag{1}\n$$",
+        "![](bbox=[0, 80, 100, 120])",
+    )
+    for fragment in expected_fragments:
+        assert fragment in output