from app.ingest.chunker import chunk_text


def _numbered_words(count):
    """Return the placeholder tokens ``w0`` .. ``w{count-1}`` as a list."""
    return [f"w{i}" for i in range(count)]


def test_chunk_short_text_single_chunk():
    """A text far below the size limit is expected back as one unchanged chunk."""
    sentence = "Das ist ein kurzer Text mit wenigen Worten."
    result = chunk_text(sentence, size_words=500, overlap_words=50, page=1)

    assert len(result) == 1
    assert result[0].text == sentence
    assert result[0].page == 1


def test_chunk_size_and_overlap():
    """1200 words at size 500 / overlap 50 should give three overlapping chunks."""
    text = " ".join(_numbered_words(1200))
    result = chunk_text(text, size_words=500, overlap_words=50, page=1)

    # Step is size - overlap = 450, so chunks start at 0, 450 and 900.
    assert len(result) == 3

    # The first chunk may not exceed the requested size.
    leading = result[0].text.split()
    assert len(leading) <= 500

    # The tail of chunk 0 (50 words) must reappear as the head of chunk 1.
    following = result[1].text.split()
    assert leading[-50:] == following[:50]


def test_chunk_respects_sentence_boundary_in_lookback_window():
    """A sentence end inside the lookback window should terminate the chunk early.

    The test assumes the lookback window is the last 20% of the chunk
    (words 400-500) and places the sentence end at index 479.
    """
    tokens = _numbered_words(600)
    tokens[479] = "ende."
    result = chunk_text(" ".join(tokens), size_words=500, overlap_words=50, page=1)

    # The cut is expected at the sentence boundary, not at word 500.
    head = result[0].text.split()
    assert head[-1] == "ende."
    assert len(head) == 480


def test_chunk_no_sentence_boundary_in_window_falls_back_to_word_count():
    """Without any sentence end, the first chunk should be exactly 500 words."""
    text = " ".join(_numbered_words(600))
    result = chunk_text(text, size_words=500, overlap_words=50, page=1)

    head = result[0].text.split()
    assert len(head) == 500


def test_chunk_empty_text_returns_empty_list():
    """Empty input is expected to produce an empty list."""
    result = chunk_text("", size_words=500, overlap_words=50, page=1)
    assert result == []


def test_chunk_carries_page_number():
    """The ``page`` argument should be carried onto the resulting chunk."""
    result = chunk_text("hallo welt", size_words=500, overlap_words=50, page=7)
    first = result[0]
    assert first.page == 7


def test_chunk_overlap_geq_size_does_not_loop_forever():
    """With overlap equal to size the call must still return bounded chunks."""
    text = " ".join(_numbered_words(60))
    result = chunk_text(text, size_words=10, overlap_words=10, page=1)

    assert len(result) >= 5
    # No chunk may be longer than the requested size.
    assert not any(len(piece.text.split()) > 10 for piece in result)


def test_chunk_ignores_sentence_boundary_outside_lookback_window():
    """A sentence end before the lookback window should not shorten the chunk.

    Index 399 lies just before the 400-500 window that the lookback test
    assumes, so the full 500 words are expected.
    """
    tokens = _numbered_words(600)
    tokens[399] = "ende."
    result = chunk_text(" ".join(tokens), size_words=500, overlap_words=50, page=1)

    head = result[0].text.split()
    assert len(head) == 500