diff --git a/app/ingest/chunker.py b/app/ingest/chunker.py
new file mode 100644
index 0000000..bfe6c2e
--- /dev/null
+++ b/app/ingest/chunker.py
@@ -0,0 +1,114 @@
+"""Word-based text chunking with overlap and sentence-aware chunk ends."""
+
+from dataclasses import dataclass
+
+
+SENTENCE_END_CHARS = (".", "!", "?")
+
+# Closing quotes/brackets that may directly follow a sentence terminator,
+# e.g. the final word of: He said "stop."
+_TRAILING_CLOSERS = "\"')]}»«“”’‘"
+
+# Share of the chunk window (counted from its end) searched for a sentence end.
+_LOOKBACK_FRACTION = 0.2
+
+
+@dataclass(frozen=True)
+class Chunk:
+    """A piece of text together with the page number it was taken from."""
+
+    text: str
+    page: int
+
+
+def _find_sentence_boundary(
+    words: list[str], window_start: int, window_end: int | None = None
+) -> int | None:
+    """Return the index of the last sentence-ending word in a window.
+
+    Scans ``words[window_start:window_end]`` from right to left. A word ends
+    a sentence when it ends with one of ``SENTENCE_END_CHARS``, optionally
+    followed by closing quotes or brackets.
+
+    Args:
+        words: Whitespace-separated tokens of the text.
+        window_start: First index (inclusive) to inspect; negative values
+            are clamped to 0.
+        window_end: Index one past the last word to inspect; defaults to
+            ``len(words)`` and is clamped to it.
+
+    Returns:
+        The inclusive end index of the sentence (a chunk ending there covers
+        ``words[: idx + 1]``), or ``None`` if the window holds no boundary.
+    """
+    last = len(words) if window_end is None else min(window_end, len(words))
+    # Clamp so a negative start cannot wrap around to the end of the list.
+    first = max(window_start, 0)
+    for i in range(last - 1, first - 1, -1):
+        if words[i].rstrip(_TRAILING_CLOSERS).endswith(SENTENCE_END_CHARS):
+            return i
+    return None
+
+
+def chunk_text(text: str, size_words: int, overlap_words: int, page: int) -> list[Chunk]:
+    """Split text into chunks of at most ``size_words`` words with overlap.
+
+    Words are whitespace-separated tokens; chunk text is re-joined with single
+    spaces. A full-size chunk ends at the last sentence boundary found in the
+    final 20% of its ``size_words`` window when there is one; otherwise it
+    ends at exactly ``size_words`` words. The next chunk starts
+    ``overlap_words`` words before the end of the previous one, but always at
+    least one word after the previous start.
+
+    Args:
+        text: Text to split.
+        size_words: Maximum number of words per chunk; must be >= 1.
+        overlap_words: Number of words shared between consecutive chunks;
+            must be >= 0.
+        page: Page number stored on every returned chunk.
+
+    Returns:
+        The chunks in reading order; an empty list for empty or
+        whitespace-only text.
+
+    Raises:
+        ValueError: If ``size_words`` < 1 or ``overlap_words`` < 0.
+    """
+    if size_words < 1:
+        raise ValueError(f"size_words must be >= 1, got {size_words}")
+    if overlap_words < 0:
+        raise ValueError(f"overlap_words must be >= 0, got {overlap_words}")
+
+    words = text.split()
+    if not words:
+        return []
+    if len(words) <= size_words:
+        return [Chunk(text=" ".join(words), page=page)]
+
+    chunks: list[Chunk] = []
+    total = len(words)
+    lookback_window = max(1, int(size_words * _LOOKBACK_FRACTION))
+    start = 0
+
+    while start < total:
+        hard_end = min(start + size_words, total)
+        end = hard_end
+        # Only a full-size window is trimmed to a sentence end; the shorter
+        # final window always runs to the end of the text.
+        if hard_end - start == size_words:
+            boundary = _find_sentence_boundary(
+                words, hard_end - lookback_window, hard_end
+            )
+            if boundary is not None:
+                end = boundary + 1
+
+        chunks.append(Chunk(text=" ".join(words[start:end]), page=page))
+
+        if end >= total:
+            break
+
+        # max() guarantees forward progress even if overlap_words is as large
+        # as the chunk that was just emitted.
+        start = max(end - overlap_words, start + 1)
+
+    return chunks
diff --git a/tests/test_chunker.py b/tests/test_chunker.py
new file mode 100644
index 0000000..0a3b248
--- /dev/null
+++ b/tests/test_chunker.py
@@ -0,0 +1,79 @@
+import pytest
+
+from app.ingest.chunker import chunk_text, Chunk
+
+
+def test_chunk_short_text_single_chunk():
+    text = "Das ist ein kurzer Text mit wenigen Worten."
+    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)
+    assert len(chunks) == 1
+    assert chunks[0].text == text
+    assert chunks[0].page == 1
+
+
+def test_chunk_size_and_overlap():
+    words = [f"w{i}" for i in range(1200)]
+    text = " ".join(words)
+    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)
+
+    # 1200 words, size 500, overlap 50 → step 450 → starts at 0, 450, 900 → 3 chunks
+    assert len(chunks) == 3
+    # First chunk has up to 500 words
+    assert len(chunks[0].text.split()) <= 500
+    # Overlap: last 50 words of chunk 0 are first 50 words of chunk 1
+    last_50_of_first = chunks[0].text.split()[-50:]
+    first_50_of_second = chunks[1].text.split()[:50]
+    assert last_50_of_first == first_50_of_second
+
+
+def test_chunk_respects_sentence_boundary_in_lookback_window():
+    # 600 words, with a sentence ending around word 480 (within last 20% = words 400-500)
+    words = [f"w{i}" for i in range(600)]
+    words[479] = "ende."
+    text = " ".join(words)
+    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)
+
+    # First chunk should end at the sentence boundary, not at word 500
+    first_chunk_words = chunks[0].text.split()
+    assert first_chunk_words[-1] == "ende."
+    assert len(first_chunk_words) == 480
+
+
+def test_chunk_sentence_boundary_followed_by_closing_quote():
+    words = [f"w{i}" for i in range(600)]
+    words[479] = 'ende."'
+    text = " ".join(words)
+    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)
+
+    assert chunks[0].text.split()[-1] == 'ende."'
+    assert len(chunks[0].text.split()) == 480
+
+
+def test_chunk_no_sentence_boundary_in_window_falls_back_to_word_count():
+    words = [f"w{i}" for i in range(600)]
+    text = " ".join(words)
+    chunks = chunk_text(text, size_words=500, overlap_words=50, page=1)
+    # No sentence-end → exactly 500 words in first chunk
+    assert len(chunks[0].text.split()) == 500
+
+
+def test_chunk_empty_text_returns_empty_list():
+    assert chunk_text("", size_words=500, overlap_words=50, page=1) == []
+
+
+def test_chunk_whitespace_only_text_returns_empty_list():
+    assert chunk_text(" \n\t ", size_words=500, overlap_words=50, page=1) == []
+
+
+def test_chunk_carries_page_number():
+    chunks = chunk_text("hallo welt", size_words=500, overlap_words=50, page=7)
+    assert chunks[0].page == 7
+
+
+@pytest.mark.parametrize(
+    ("size_words", "overlap_words"),
+    [(0, 0), (-5, 0), (500, -1)],
+)
+def test_chunk_rejects_invalid_size_or_overlap(size_words, overlap_words):
+    with pytest.raises(ValueError):
+        chunk_text("hallo welt", size_words=size_words, overlap_words=overlap_words, page=1)