test: regression tests fuer overlap>=size und boundary ausserhalb lookback

This commit is contained in:
2026-05-04 22:13:34 +02:00
parent 2f2024f168
commit 5e44495676

View File

@@ -52,3 +52,19 @@ def test_chunk_empty_text_returns_empty_list():
def test_chunk_carries_page_number():
    """Check that the ``page`` passed to ``chunk_text`` is on the first chunk."""
    expected_page = 7
    result = chunk_text(
        "hallo welt", size_words=500, overlap_words=50, page=expected_page
    )
    first = result[0]
    assert first.page == expected_page
def test_chunk_overlap_geq_size_does_not_loop_forever():
    """Regression test for ``overlap_words`` equal to ``size_words``.

    Sixty words are chunked with size 10 and overlap 10. The call has to
    return (an implementation that never advances would hang here), produce
    at least five chunks, and no chunk may contain more than ten words.
    """
    limit = 10
    text = " ".join(f"w{n}" for n in range(60))
    result = chunk_text(text, size_words=limit, overlap_words=limit, page=1)
    assert len(result) >= 5
    # Check chunk by chunk; stops at the first oversized chunk.
    for piece in result:
        assert len(piece.text.split()) <= limit
def test_chunk_ignores_sentence_boundary_outside_lookback_window():
    """Regression test: a far-away sentence end must not shorten the chunk.

    Of 600 words, only the one at index 399 ends with a period. The first
    chunk is still expected to contain the full 500 words.
    """
    tokens = [f"w{n}" for n in range(600)]
    # NOTE(review): index 399 is presumably outside the chunker's lookback
    # window (per the test name) -- confirm against chunk_text.
    tokens[399] = "ende."
    result = chunk_text(
        " ".join(tokens), size_words=500, overlap_words=50, page=1
    )
    first_len = len(result[0].text.split())
    assert first_len == 500