Files
rag-ingestor/tests/test_chunker.py
Jean-Luc Makiola 4b9280a972
All checks were successful
CI / ci (push) Successful in 37s
Release / release (push) Successful in 53s
test: ungenutzte imports entfernen (ruff F401)
2026-05-07 16:06:55 +02:00

71 lines
2.5 KiB
Python

from app.ingest.chunker import chunk_text
def test_chunk_short_text_single_chunk():
    """A text far shorter than ``size_words`` comes back as one unmodified chunk."""
    sentence = "Das ist ein kurzer Text mit wenigen Worten."
    result = chunk_text(sentence, size_words=500, overlap_words=50, page=1)

    assert len(result) == 1
    only_chunk = result[0]
    # Both the text and the page number must survive unchanged.
    assert only_chunk.text == sentence
    assert only_chunk.page == 1
def test_chunk_size_and_overlap():
    """1200 boundary-free words yield three chunks sharing a 50-word overlap."""
    text = " ".join(f"w{i}" for i in range(1200))
    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)

    # size 500 with overlap 50 gives a step of 450, so chunks are expected
    # to start at words 0, 450 and 900.
    assert len(chunks) == 3

    # The first chunk must not exceed the configured size.
    head_words = chunks[0].text.split()
    assert len(head_words) <= 500

    # The last 50 words of chunk 0 reappear as the first 50 words of chunk 1.
    next_words = chunks[1].text.split()
    assert head_words[-50:] == next_words[:50]
def test_chunk_respects_sentence_boundary_in_lookback_window():
    """The first chunk ends at a sentence end located late in the size window."""
    # 600 words; index 479 carries a sentence-ending period, which falls in
    # the last 20% of the 500-word window (words 400-500).
    tokens = [f"w{i}" for i in range(600)]
    tokens[479] = "ende."
    chunks = chunk_text(" ".join(tokens), size_words=500, overlap_words=50, page=1)

    # Expect the cut right after "ende." instead of at the 500-word limit.
    leading = chunks[0].text.split()
    assert leading[-1] == "ende."
    assert len(leading) == 480
def test_chunk_no_sentence_boundary_in_window_falls_back_to_word_count():
    """Without any sentence end, the first chunk holds exactly ``size_words`` words."""
    text = " ".join(f"w{i}" for i in range(600))
    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)

    # No token ends a sentence, so the cut is expected at the plain word limit.
    first_chunk_words = chunks[0].text.split()
    assert len(first_chunk_words) == 500
def test_chunk_empty_text_returns_empty_list():
    """An empty input string produces an empty list of chunks."""
    result = chunk_text("", size_words=500, overlap_words=50, page=1)
    assert result == []
def test_chunk_carries_page_number():
    """The ``page`` argument is carried onto the first produced chunk."""
    result = chunk_text("hallo welt", size_words=500, overlap_words=50, page=7)
    first_chunk = result[0]
    assert first_chunk.page == 7
def test_chunk_overlap_geq_size_does_not_loop_forever():
    """An overlap equal to the chunk size still terminates with size-bounded chunks."""
    text = " ".join(f"w{i}" for i in range(60))
    # overlap_words == size_words: the call has to return for this test to
    # reach its assertions at all.
    chunks = chunk_text(text, size_words=10, overlap_words=10, page=1)

    assert len(chunks) >= 5
    # No chunk may be longer than the configured size.
    assert not any(len(chunk.text.split()) > 10 for chunk in chunks)
def test_chunk_ignores_sentence_boundary_outside_lookback_window():
    """A sentence end at word index 399 does not shorten the 500-word first chunk."""
    tokens = [f"w{i}" for i in range(600)]
    # Index 399 is the 400th word; the test expects it to be too early to
    # serve as a cut point.
    tokens[399] = "ende."
    chunks = chunk_text(" ".join(tokens), size_words=500, overlap_words=50, page=1)

    first_chunk_words = chunks[0].text.split()
    assert len(first_chunk_words) == 500