feat: word-based chunker with sentence-boundary look-back
This commit is contained in:
62
app/ingest/chunker.py
Normal file
62
app/ingest/chunker.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
# Characters that mark the end of a sentence when they are the last character
# of a whitespace-separated word. Kept as a tuple so it can be passed directly
# to str.endswith.
SENTENCE_END_CHARS = (".", "!", "?")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Chunk:
    """Immutable piece of chunked text tagged with its page.

    Attributes:
        text: The chunk content; the chunker builds it by joining words with
            single spaces.
        page: The page value the caller supplied for the source text.
    """

    text: str
    page: int
|
||||
|
||||
|
||||
def _find_sentence_boundary(words: list[str], window_start: int) -> int | None:
|
||||
"""Return index of last word ending with a sentence terminator within
|
||||
[window_start, len(words)), or None if no boundary found.
|
||||
|
||||
The returned index is the inclusive end-index of the sentence: the chunk
|
||||
will include words[: idx + 1].
|
||||
"""
|
||||
for i in range(len(words) - 1, window_start - 1, -1):
|
||||
if words[i].endswith(SENTENCE_END_CHARS):
|
||||
return i
|
||||
return None
|
||||
|
||||
|
||||
def chunk_text(
    text: str,
    size_words: int,
    overlap_words: int,
    page: int,
    *,
    lookback_ratio: float = 0.2,
) -> list[Chunk]:
    """Split text into chunks of at most ``size_words`` words with overlap.

    The text is tokenised on whitespace. Each full-size window is cut at the
    last sentence boundary found in its trailing look-back region (the final
    ``lookback_ratio`` share of the window, at least one word); if no
    boundary is found there, the window is cut at exactly ``size_words``.
    The next chunk starts ``overlap_words`` before the previous chunk's end,
    but always at least one word after the previous start so the loop makes
    progress even when the overlap is as large as the chunk.

    Args:
        text: Source text; whitespace-only or empty text yields no chunks.
        size_words: Maximum number of words per chunk; must be >= 1.
        overlap_words: Number of words shared between consecutive chunks;
            must be >= 0.
        page: Page value stored on every produced chunk.
        lookback_ratio: Fraction of ``size_words`` (0..1) searched backwards
            from the window end for a sentence boundary.

    Returns:
        The chunks in document order. Text that fits in one window is
        returned as a single chunk.

    Raises:
        ValueError: If ``size_words`` < 1, ``overlap_words`` < 0, or
            ``lookback_ratio`` is outside [0, 1].
    """
    # Fail fast on arguments that previously crashed with IndexError
    # (size_words <= 0) or silently dropped words (negative overlap).
    if size_words < 1:
        raise ValueError(f"size_words must be >= 1, got {size_words}")
    if overlap_words < 0:
        raise ValueError(f"overlap_words must be >= 0, got {overlap_words}")
    if not 0 <= lookback_ratio <= 1:
        raise ValueError(
            f"lookback_ratio must be between 0 and 1, got {lookback_ratio}"
        )

    words = text.split()
    if not words:
        return []

    total = len(words)
    if total <= size_words:
        return [Chunk(text=" ".join(words), page=page)]

    lookback_window = max(1, int(size_words * lookback_ratio))
    chunks: list[Chunk] = []
    start = 0

    while start < total:
        hard_end = min(start + size_words, total)
        end = hard_end

        # Only full-size windows are trimmed to a sentence boundary; the
        # short tail window is emitted as-is.
        if hard_end - start == size_words:
            search_start = hard_end - lookback_window
            # Slice just the look-back region instead of the whole prefix,
            # so the per-chunk cost does not grow with document length.
            relative = _find_sentence_boundary(words[search_start:hard_end], 0)
            if relative is not None:
                end = search_start + relative + 1

        chunks.append(Chunk(text=" ".join(words[start:end]), page=page))

        if end >= total:
            break

        # Step back by the overlap, but never stall: advance by at least one
        # word even if the overlap covers the whole chunk.
        start = max(end - overlap_words, start + 1)

    return chunks
|
||||
Reference in New Issue
Block a user