feat: word-based chunker mit sentence-boundary look-back
This commit is contained in:
62
app/ingest/chunker.py
Normal file
62
app/ingest/chunker.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
# Characters that mark the end of a sentence. _find_sentence_boundary passes
# this tuple to str.endswith, so any token ending in one of them counts as a
# sentence end (abbreviations such as "z.B." included), while a terminator
# followed by a closing quote or bracket does not.
SENTENCE_END_CHARS = (".", "!", "?")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class Chunk:
|
||||||
|
text: str
|
||||||
|
page: int
|
||||||
|
|
||||||
|
|
||||||
|
def _find_sentence_boundary(words: list[str], window_start: int) -> int | None:
|
||||||
|
"""Return index of last word ending with a sentence terminator within
|
||||||
|
[window_start, len(words)), or None if no boundary found.
|
||||||
|
|
||||||
|
The returned index is the inclusive end-index of the sentence: the chunk
|
||||||
|
will include words[: idx + 1].
|
||||||
|
"""
|
||||||
|
for i in range(len(words) - 1, window_start - 1, -1):
|
||||||
|
if words[i].endswith(SENTENCE_END_CHARS):
|
||||||
|
return i
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text(text: str, size_words: int, overlap_words: int, page: int) -> list[Chunk]:
    """Split text into chunks of at most ``size_words`` words with overlap.

    The text is tokenised with ``str.split()``, so runs of whitespace collapse
    and each chunk's text is its words re-joined with single spaces.

    A full-size window is cut after the last sentence-ending word found in the
    final 20% of that window (at least one word is searched); when there is
    none, the chunk is exactly ``size_words`` long. A trailing window shorter
    than ``size_words`` is emitted as is. The next chunk starts
    ``overlap_words`` before the end of the previous one, but always advances
    by at least one word.

    Args:
        text: Raw text to split.
        size_words: Maximum number of words per chunk; must be >= 1.
        overlap_words: Number of words shared by consecutive chunks; must
            be >= 0.
        page: Page number stored on every produced chunk.

    Returns:
        The chunks in reading order; an empty list for empty or
        whitespace-only text.

    Raises:
        ValueError: If ``size_words`` is smaller than 1 or ``overlap_words``
            is negative.
    """
    # Reject nonsensical arguments up front: a non-positive window size breaks
    # the boundary search, and a negative overlap would skip words between
    # consecutive chunks without any signal to the caller.
    if size_words < 1:
        raise ValueError(f"size_words must be >= 1, got {size_words}")
    if overlap_words < 0:
        raise ValueError(f"overlap_words must be >= 0, got {overlap_words}")

    words = text.split()
    if not words:
        return []
    if len(words) <= size_words:
        return [Chunk(text=" ".join(words), page=page)]

    # Last 20% of a window, in whole words, but never an empty search range.
    lookback_window = max(1, size_words // 5)
    total = len(words)
    chunks: list[Chunk] = []
    start = 0

    while start < total:
        hard_end = min(start + size_words, total)
        end = hard_end

        # Only full-size windows are searched for a sentence boundary.
        if hard_end - start == size_words:
            window_start = hard_end - lookback_window
            # Hand over just the look-back slice (not the whole prefix) so the
            # work per chunk does not grow with the position in the text. The
            # returned index is relative to window_start.
            boundary = _find_sentence_boundary(words[window_start:hard_end], 0)
            if boundary is not None:
                end = window_start + boundary + 1

        chunks.append(Chunk(text=" ".join(words[start:end]), page=page))

        if end >= total:
            break

        # Step back by the overlap, but guarantee forward progress even when
        # the overlap is as large as the chunk that was just emitted.
        start = max(end - overlap_words, start + 1)

    return chunks
|
||||||
54
tests/test_chunker.py
Normal file
54
tests/test_chunker.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
from app.ingest.chunker import chunk_text, Chunk
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_short_text_single_chunk():
    """A text shorter than the window comes back as exactly one chunk."""
    sentence = "Das ist ein kurzer Text mit wenigen Worten."
    result = chunk_text(sentence, size_words=500, overlap_words=50, page=1)

    assert len(result) == 1
    only_chunk = result[0]
    assert only_chunk.text == sentence
    assert only_chunk.page == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_size_and_overlap():
    """Windows without sentence ends are full-size and overlap by 50 words."""
    words = [f"w{i}" for i in range(1200)]
    text = " ".join(words)
    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)
    chunk_words = [chunk.text.split() for chunk in chunks]

    # 1200 words, size 500, overlap 50 → step 450 → windows 0-500, 450-950,
    # 900-1200. No word ends a sentence, so every cut is a hard cut.
    assert [len(part) for part in chunk_words] == [500, 500, 300]

    # Every adjacent pair shares exactly the configured overlap.
    for previous, following in zip(chunk_words, chunk_words[1:]):
        assert previous[-50:] == following[:50]

    # Nothing is lost at either end of the text.
    assert chunk_words[0][0] == "w0"
    assert chunk_words[-1][-1] == "w1199"
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_respects_sentence_boundary_in_lookback_window():
    """A sentence end inside the look-back window ends the chunk early."""
    # 600 words with a sentence ending at index 479, inside the last 20% of
    # the first window (indices 400-499).
    words = [f"w{i}" for i in range(600)]
    words[479] = "ende."
    text = " ".join(words)
    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)

    # First chunk ends at the sentence boundary, not at word 500.
    first_chunk_words = chunks[0].text.split()
    assert first_chunk_words[-1] == "ende."
    assert len(first_chunk_words) == 480

    # The overlap is measured from the actual cut, so the follow-up chunk
    # repeats the 50 words before the boundary and runs to the end of the text.
    assert len(chunks) == 2
    second_chunk_words = chunks[1].text.split()
    assert second_chunk_words[:50] == first_chunk_words[-50:]
    assert second_chunk_words[-1] == "w599"
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_no_sentence_boundary_in_window_falls_back_to_word_count():
    """Without any sentence terminator the first chunk is cut at size_words."""
    text = " ".join(f"w{i}" for i in range(600))
    result = chunk_text(text, size_words=500, overlap_words=50, page=1)

    # No word ends a sentence, so the first chunk holds exactly 500 words.
    first_chunk_words = result[0].text.split()
    assert len(first_chunk_words) == 500
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_empty_text_returns_empty_list():
    """Empty and whitespace-only input both produce no chunks."""
    assert chunk_text("", size_words=500, overlap_words=50, page=1) == []
    # Whitespace-only text contains no words either.
    assert chunk_text(" \n\t ", size_words=500, overlap_words=50, page=1) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_carries_page_number():
    """The page argument is copied onto the produced chunk."""
    result = chunk_text("hallo welt", size_words=500, overlap_words=50, page=7)
    first_chunk = result[0]
    assert first_chunk.page == 7
|
||||||
Reference in New Issue
Block a user