from app.services.glm_postprocess import (
    GLMResultFormatter,
    clean_formula_number,
    clean_repeated_content,
    find_consecutive_repeat,
)


def _block(index, label, native_label, content, **extra):
    """Build a region dict with the keys these tests hand to the formatter.

    Extra keyword arguments (for example ``bbox_2d``) are appended after the
    four core keys.
    """
    return {
        "index": index,
        "label": label,
        "native_label": native_label,
        "content": content,
        **extra,
    }


def test_find_consecutive_repeat_truncates_when_threshold_met():
    """Ten consecutive copies of a unit followed by a tail yield the unit."""
    unit = "abcdefghij"
    text = unit * 10 + "tail"
    assert find_consecutive_repeat(text) == unit


def test_find_consecutive_repeat_returns_none_when_below_threshold():
    """Nine consecutive copies of the unit are reported as no repeat."""
    nine_copies = "abcdefghij" * 9
    assert find_consecutive_repeat(nine_copies) is None


def test_clean_repeated_content_handles_consecutive_and_line_level_repeats():
    """Cover inline repetition, repeated lines, and ordinary text."""
    # Inline repetition: only a single copy of the unit is expected back.
    inline_repeat = "abcdefghij" * 10 + "tail"
    assert clean_repeated_content(inline_repeat) == "abcdefghij"

    # Line-level repetition: ten identical lines followed by a different one.
    lines = ["same line"] * 10
    lines.append("other")
    repeated_lines = "\n".join(lines)
    cleaned_lines = clean_repeated_content(repeated_lines, line_threshold=10)
    assert cleaned_lines == "same line\n"

    # Text without repetition comes back unchanged.
    assert clean_repeated_content("normal text") == "normal text"


def test_clean_formula_number_strips_wrapping_parentheses():
    """Parenthesised numbers lose their wrapper; bare numbers are kept."""
    cases = (
        ("(1)", "1"),
        ("(2.1)", "2.1"),
        ("3", "3"),
    )
    for raw, expected in cases:
        assert clean_formula_number(raw) == expected


def test_clean_content_removes_literal_tabs_and_long_repeat_noise():
    """Literal backslash-t sequences and a long repeated run are cleaned up."""
    formatter = GLMResultFormatter()
    # Raw strings: these are the two characters backslash + "t", not a tab.
    pieces = [r"\t\t", "·" * 5, "abcdefghij" * 205, r"\t"]
    noisy = "".join(pieces)

    cleaned = formatter._clean_content(noisy)

    assert cleaned.startswith("···")
    assert cleaned.endswith("abcdefghij")
    assert r"\t" not in cleaned


def test_format_content_handles_titles_formula_text_and_newlines():
    """Check the rendering of titles, display formulas, and bullet text."""
    formatter = GLMResultFormatter()
    # Each case: positional arguments for _format_content -> expected result.
    cases = [
        (("Intro", "text", "doc_title"), "# Intro"),
        (("- Section", "text", "paragraph_title"), "## Section"),
        ((r"\[x+y\]", "formula", "display_formula"), "$$\nx+y\n$$"),
        (("· item\nnext", "text", "text"), "- item\n\nnext"),
    ]
    for args, expected in cases:
        assert formatter._format_content(*args) == expected


def test_merge_formula_numbers_merges_before_and_after_formula():
    """A formula number next to a formula becomes a ``\\tag`` on that formula."""
    formatter = GLMResultFormatter()
    formula_body = "$$\nx+y\n$$"

    number_first = formatter._merge_formula_numbers(
        [
            _block(0, "text", "formula_number", "(1)"),
            _block(1, "formula", "display_formula", formula_body),
        ]
    )
    number_last = formatter._merge_formula_numbers(
        [
            _block(0, "formula", "display_formula", formula_body),
            _block(1, "text", "formula_number", "(2)"),
        ]
    )
    # A formula number with no neighbouring formula: expected result is empty.
    lone_number = formatter._merge_formula_numbers(
        [_block(0, "text", "formula_number", "(3)")]
    )

    assert number_first == [
        _block(0, "formula", "display_formula", "$$\nx+y \\tag{1}\n$$")
    ]
    assert number_last == [
        _block(0, "formula", "display_formula", "$$\nx+y \\tag{2}\n$$")
    ]
    assert lone_number == []


def test_merge_text_blocks_joins_hyphenated_words_when_wordfreq_accepts(monkeypatch):
    """With a zipf frequency stub of 3.0, "inter-" + "national" is joined."""

    def stub_frequency(word, lang):
        return 3.0

    formatter = GLMResultFormatter()
    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", stub_frequency)

    blocks = [
        _block(0, "text", "text", "inter-"),
        _block(1, "text", "text", "national"),
    ]
    merged = formatter._merge_text_blocks(blocks)

    assert merged == [_block(0, "text", "text", "international")]


def test_merge_text_blocks_skips_invalid_merge(monkeypatch):
    """With a frequency stub of 1.0 and a capitalised tail, nothing merges."""

    def stub_frequency(word, lang):
        return 1.0

    formatter = GLMResultFormatter()
    monkeypatch.setattr("app.services.glm_postprocess._WORDFREQ_AVAILABLE", True)
    monkeypatch.setattr("app.services.glm_postprocess.zipf_frequency", stub_frequency)

    merged = formatter._merge_text_blocks(
        [
            _block(0, "text", "text", "inter-"),
            _block(1, "text", "text", "National"),
        ]
    )

    expected = [
        _block(0, "text", "text", "inter-"),
        _block(1, "text", "text", "National"),
    ]
    assert merged == expected


def test_format_bullet_points_infers_missing_middle_bullet():
    """An unmarked item between two bullets with close bboxes gains a marker."""
    formatter = GLMResultFormatter()
    rows = [
        ("- first", [10, 0, 50, 10]),
        ("second", [12, 12, 52, 22]),
        ("- third", [11, 24, 51, 34]),
    ]
    items = [
        {"native_label": "text", "content": content, "bbox_2d": bbox}
        for content, bbox in rows
    ]

    formatted = formatter._format_bullet_points(items)

    middle = formatted[1]
    assert middle["content"] == "- second"


def test_format_bullet_points_skips_when_bbox_missing():
    """An empty ``bbox_2d`` on the middle item leaves its content unchanged."""
    formatter = GLMResultFormatter()
    rows = [
        ("- first", [10, 0, 50, 10]),
        ("second", []),
        ("- third", [11, 24, 51, 34]),
    ]
    items = [
        {"native_label": "text", "content": content, "bbox_2d": bbox}
        for content, bbox in rows
    ]

    formatted = formatter._format_bullet_points(items)

    middle = formatted[1]
    assert middle["content"] == "second"


def test_process_runs_full_pipeline_and_skips_empty_content():
    """Run ``process`` over mixed regions and check the rendered fragments.

    The input holds a document title, a formula number, a display formula,
    an image, and a text region with empty content.
    """
    formatter = GLMResultFormatter()
    regions = [
        _block(0, "text", "doc_title", "Doc Title", bbox_2d=[0, 0, 100, 30]),
        _block(1, "text", "formula_number", "(1)", bbox_2d=[80, 50, 100, 60]),
        _block(2, "formula", "display_formula", "x+y", bbox_2d=[0, 40, 100, 80]),
        _block(3, "figure", "image", "figure placeholder", bbox_2d=[0, 80, 100, 120]),
        _block(4, "text", "text", "", bbox_2d=[0, 120, 100, 150]),
    ]

    output = formatter.process(regions)

    # NOTE(review): the test name promises that empty content is skipped, but
    # nothing below checks it — consider asserting on the empty region once
    # the output format is confirmed.
    expected_fragments = (
        "# Doc Title",
        "$$\nx+y \\tag{1}\n$$",
        "![](bbox=[0, 80, 100, 120])",
    )
    for fragment in expected_fragments:
        assert fragment in output