71 lines
2.5 KiB
Python
71 lines
2.5 KiB
Python
from app.ingest.chunker import chunk_text
|
|
|
|
|
|
def test_chunk_short_text_single_chunk():
    """A text far shorter than `size_words` yields exactly one chunk.

    The single chunk must carry the input text unchanged and the page
    number that was passed in.
    """
    sentence = "Das ist ein kurzer Text mit wenigen Worten."
    result = chunk_text(sentence, size_words=500, overlap_words=50, page=1)

    assert len(result) == 1
    only_chunk = result[0]
    assert only_chunk.text == sentence
    assert only_chunk.page == 1
|
|
|
|
|
|
def test_chunk_size_and_overlap():
    """1200 plain words are split into three chunks that overlap by 50 words."""
    text = " ".join(f"w{i}" for i in range(1200))
    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)

    # 1200 words, size 500, overlap 50 → step 450 → starts at 0, 450, 900 → 3 chunks
    assert len(chunks) == 3

    # The first chunk holds at most 500 words.
    first_words = chunks[0].text.split()
    assert len(first_words) <= 500

    # Overlap: the trailing 50 words of chunk 0 reappear as the leading
    # 50 words of chunk 1.
    second_words = chunks[1].text.split()
    assert first_words[-50:] == second_words[:50]
|
|
|
|
|
|
def test_chunk_respects_sentence_boundary_in_lookback_window():
    """The first chunk is cut at a sentence end found near the size limit.

    The input has 600 words; the word at index 479 ends a sentence, which
    places it inside the last 20% of the 500-word window (words 400-500).
    """
    sentence_end_index = 479
    tokens = [
        "ende." if idx == sentence_end_index else f"w{idx}"
        for idx in range(600)
    ]
    chunks = chunk_text(
        " ".join(tokens), size_words=500, overlap_words=50, page=1
    )

    # The first chunk should stop at the sentence boundary, not at word 500.
    head_words = chunks[0].text.split()
    assert head_words[-1] == "ende."
    assert len(head_words) == 480
|
|
|
|
|
|
def test_chunk_no_sentence_boundary_in_window_falls_back_to_word_count():
    """Without any sentence end, the first chunk is cut purely by word count."""
    text = " ".join(f"w{i}" for i in range(600))
    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)

    # No sentence-ending token anywhere → the first chunk has exactly 500 words.
    head_words = chunks[0].text.split()
    assert len(head_words) == 500
|
|
|
|
|
|
def test_chunk_empty_text_returns_empty_list():
    """An empty input string produces an empty list of chunks."""
    result = chunk_text("", size_words=500, overlap_words=50, page=1)
    assert result == []
|
|
|
|
|
|
def test_chunk_carries_page_number():
    """The `page` argument is propagated onto the produced chunk."""
    result = chunk_text("hallo welt", size_words=500, overlap_words=50, page=7)
    first_chunk = result[0]
    assert first_chunk.page == 7
|
|
|
|
|
|
def test_chunk_overlap_geq_size_does_not_loop_forever():
    """Chunking still terminates when `overlap_words` equals `size_words`.

    With 60 words and a size of 10, at least five chunks are expected and
    none of them may exceed 10 words.
    """
    text = " ".join(f"w{i}" for i in range(60))
    pieces = chunk_text(text, size_words=10, overlap_words=10, page=1)

    assert len(pieces) >= 5
    word_counts = [len(piece.text.split()) for piece in pieces]
    assert all(count <= 10 for count in word_counts)
|
|
|
|
|
|
def test_chunk_ignores_sentence_boundary_outside_lookback_window():
    """A sentence end at word index 399 does not shorten the first chunk.

    The test expects the full 500 words, i.e. the boundary at index 399 is
    treated as lying outside the window in which a cut may be moved.
    """
    sentence_end_index = 399
    tokens = [
        "ende." if idx == sentence_end_index else f"w{idx}"
        for idx in range(600)
    ]
    chunks = chunk_text(
        " ".join(tokens), size_words=500, overlap_words=50, page=1
    )

    head_words = chunks[0].text.split()
    assert len(head_words) == 500
|