test: regression tests fuer overlap>=size und boundary ausserhalb lookback
This commit is contained in:
@@ -52,3 +52,19 @@ def test_chunk_empty_text_returns_empty_list():
|
||||
def test_chunk_carries_page_number():
    """The first chunk exposes the ``page`` value passed to ``chunk_text``."""
    result = chunk_text(
        "hallo welt",
        size_words=500,
        overlap_words=50,
        page=7,
    )
    first_chunk = result[0]
    assert first_chunk.page == 7
|
||||
|
||||
|
||||
def test_chunk_overlap_geq_size_does_not_loop_forever():
    """Chunking returns when ``overlap_words`` equals ``size_words``.

    Feeds 60 whitespace-separated words with a size and overlap of 10 words
    each, then checks that at least five chunks come back and that none of
    them holds more than 10 words.
    """
    text = " ".join(f"w{i}" for i in range(60))

    pieces = chunk_text(text, size_words=10, overlap_words=10, page=1)

    assert len(pieces) >= 5
    # No chunk may exceed the requested size, whatever the overlap is.
    assert all(len(piece.text.split()) <= 10 for piece in pieces)
|
||||
|
||||
|
||||
def test_chunk_ignores_sentence_boundary_outside_lookback_window():
    """A sentence end far before the size limit does not shorten the chunk.

    The only word ending in a period sits at index 399 of 600 words, i.e.
    100 words before the 500-word limit. The first chunk is still expected
    to contain the full 500 words.
    """
    # Same word list as "w0".."w599", except that index 399 ends a sentence.
    tokens = ["ende." if i == 399 else f"w{i}" for i in range(600)]

    pieces = chunk_text(
        " ".join(tokens),
        size_words=500,
        overlap_words=50,
        page=1,
    )

    first_chunk_word_count = len(pieces[0].text.split())
    assert first_chunk_word_count == 500
|
||||
|
||||
Reference in New Issue
Block a user