Files
rag-ingestor/app/ingest/chunker.py

63 lines
1.9 KiB
Python

from dataclasses import dataclass
# Characters that mark the end of a sentence. Kept as a tuple so the constant
# can be handed directly to ``str.endswith``, which accepts a tuple of suffixes.
SENTENCE_END_CHARS = tuple(".!?")
@dataclass(frozen=True)
class Chunk:
    """Immutable piece of chunked text together with its page value."""

    # Words of the chunk joined by single spaces.
    text: str
    # Page value supplied by the caller and carried through unchanged.
    page: int
def _find_sentence_boundary(words: list[str], window_start: int) -> int | None:
    """Return the index of the last sentence-ending word in a window.

    Scans ``words`` backwards from its final element down to ``window_start``
    and stops at the first word that ends with one of ``SENTENCE_END_CHARS``.

    Args:
        words: Tokens to search.
        window_start: Lowest index to consider. Negative values are treated
            as 0 so the scan never leaves the valid index range.

    Returns:
        The inclusive end-index of the sentence (a caller that wants the
        sentence takes ``words[: idx + 1]``), or ``None`` if no word in
        ``[window_start, len(words))`` ends with a sentence terminator.
    """
    # Clamp the lower bound: without it a negative window_start lets the loop
    # step into negative indices, which re-tests words from the tail and can
    # raise IndexError once the index drops below -len(words).
    lowest = max(window_start, 0)
    for i in range(len(words) - 1, lowest - 1, -1):
        if words[i].endswith(SENTENCE_END_CHARS):
            return i
    return None
def chunk_text(text: str, size_words: int, overlap_words: int, page: int) -> list[Chunk]:
    """Split text into chunks of at most ``size_words`` words with overlap.

    The text is tokenised on whitespace. Every full-size window ends at the
    last sentence boundary found in its final 20% (at least one word) when
    such a boundary exists; otherwise it ends at exactly ``size_words`` words.
    Consecutive chunks share up to ``overlap_words`` trailing/leading words.

    Args:
        text: Raw text to split; whitespace runs are collapsed to one space.
        size_words: Maximum number of words per chunk. Must be at least 1.
        overlap_words: Number of words the next chunk re-uses from the end of
            the previous one. Must not be negative. If it is so large that the
            next chunk would not advance, the start moves forward by one word
            instead.
        page: Page value stored on every produced chunk.

    Returns:
        The chunks in document order; an empty list for blank text.

    Raises:
        ValueError: If ``size_words`` is less than 1 or ``overlap_words`` is
            negative.
    """
    # Reject nonsensical sizes up front: size_words == 0 used to fail with an
    # opaque IndexError, and a negative overlap silently dropped words
    # between consecutive chunks.
    if size_words < 1:
        raise ValueError(f"size_words must be >= 1, got {size_words}")
    if overlap_words < 0:
        raise ValueError(f"overlap_words must be >= 0, got {overlap_words}")

    words = text.split()
    if not words:
        return []
    total = len(words)
    if total <= size_words:
        return [Chunk(text=" ".join(words), page=page)]

    chunks: list[Chunk] = []
    # Size of the tail region of each window searched for a sentence end.
    lookback_window = max(1, int(size_words * 0.2))
    start = 0
    while start < total:
        hard_end = min(start + size_words, total)
        end = hard_end
        # Only full-size windows are trimmed to a sentence boundary; the
        # shorter final window always runs to the end of the text.
        if hard_end - start == size_words:
            search_start = hard_end - lookback_window
            # Slice just the look-back window rather than the whole prefix so
            # the per-chunk cost does not grow with the position in the text.
            relative = _find_sentence_boundary(words[search_start:hard_end], 0)
            if relative is not None:
                end = search_start + relative + 1
        chunks.append(Chunk(text=" ".join(words[start:end]), page=page))
        if end >= total:
            break
        # Step back by the overlap, but always advance by at least one word
        # so the loop terminates even when overlap_words >= chunk length.
        start = max(end - overlap_words, start + 1)
    return chunks