feat: word-based chunker mit sentence-boundary look-back

2026-05-04 22:11:14 +02:00
parent 0224581587
commit 2f2024f168
2 changed files with 116 additions and 0 deletions
--- a/app/ingest/chunker.py
+++ b/app/ingest/chunker.py
@@ -0,0 +1,62 @@
+from dataclasses import dataclass
+
+
+# Characters that mark the end of a sentence when they are the last character
+# of a whitespace-separated word. Kept as a tuple so it can be passed straight
+# to str.endswith(), which accepts a tuple of suffixes.
+SENTENCE_END_CHARS = (".", "!", "?")
+
+
+@dataclass(frozen=True)
+class Chunk:
+    """Immutable piece of chunked text tagged with a page number."""
+
+    # The chunk's words joined by single spaces.
+    text: str
+    # Page value supplied by the caller of the chunker for this text.
+    page: int
+
+
+def _find_sentence_boundary(words: list[str], window_start: int) -> int | None:
+    """Locate the last sentence-ending word at or after ``window_start``.
+
+    Scans ``words`` backwards, from its final element down to index
+    ``window_start`` (inclusive), and stops at the first word whose last
+    character is one of ``SENTENCE_END_CHARS``.
+
+    Returns:
+        The index of that word. It is an inclusive end index, i.e. the
+        sentence is covered by ``words[: idx + 1]``. ``None`` is returned
+        when no word in the scanned range ends a sentence.
+    """
+    idx = len(words) - 1
+    while idx >= window_start:
+        if words[idx].endswith(SENTENCE_END_CHARS):
+            return idx
+        idx -= 1
+    return None
+
+
+def chunk_text(text: str, size_words: int, overlap_words: int, page: int) -> list[Chunk]:
+    """Split text into chunks of at most ``size_words`` words with overlap.
+
+    Words are whitespace-separated tokens (``str.split()``); every chunk's
+    text is its words re-joined with single spaces, so the original
+    whitespace is not preserved.
+
+    A full-size chunk is cut right after the last word ending in a sentence
+    terminator found in the final 20% (at least one word) of its
+    ``size_words`` window; when that window holds no such word the chunk is
+    cut at exactly ``size_words`` words. Consecutive chunks share up to
+    ``overlap_words`` words, but each chunk starts at least one word after
+    the previous one so the loop always makes progress.
+
+    Args:
+        text: Text to split. Empty or whitespace-only text yields ``[]``.
+        size_words: Maximum number of words per chunk; must be >= 1.
+        overlap_words: Number of trailing words of a chunk repeated at the
+            start of the next one; must be >= 0.
+        page: Page value stored on every returned chunk.
+
+    Returns:
+        The chunks in document order. Text with at most ``size_words`` words
+        yields a single chunk.
+
+    Raises:
+        ValueError: If ``size_words`` is less than 1 or ``overlap_words`` is
+            negative.
+    """
+    # A non-positive size used to fail deep inside the boundary search with an
+    # IndexError (or index from the wrong end of the list); reject it up front.
+    if size_words < 1:
+        raise ValueError(f"size_words must be >= 1, got {size_words}")
+    # A negative overlap would move the next start past `end` and silently
+    # drop the words in between.
+    if overlap_words < 0:
+        raise ValueError(f"overlap_words must be >= 0, got {overlap_words}")
+
+    words = text.split()
+    if not words:
+        return []
+
+    total = len(words)
+    if total <= size_words:
+        return [Chunk(text=" ".join(words), page=page)]
+
+    chunks: list[Chunk] = []
+    start = 0
+    lookback_window = max(1, int(size_words * 0.2))
+
+    while start < total:
+        hard_end = min(start + size_words, total)
+        end = hard_end
+        # Only a full-size window is trimmed back to a sentence boundary; a
+        # shorter final window is taken as-is.
+        if hard_end - start == size_words:
+            window_start = hard_end - lookback_window
+            # Hand the helper just the look-back window instead of copying the
+            # whole prefix on every iteration; its result is relative to the
+            # window, so shift it back to an absolute index.
+            boundary = _find_sentence_boundary(words[window_start:hard_end], 0)
+            if boundary is not None:
+                end = window_start + boundary + 1
+
+        chunks.append(Chunk(text=" ".join(words[start:end]), page=page))
+
+        if end >= total:
+            break
+
+        # Step forward to end - overlap, but never less than start + 1, so a
+        # large overlap cannot stall the loop.
+        start = max(end - overlap_words, start + 1)
+
+    return chunks