# rag-ingestor/app/ingest/chunker.py

from dataclasses import dataclass


# Characters that mark the end of a sentence. Kept as a tuple so it can be
# passed directly to str.endswith(), which tests all of them in one call.
SENTENCE_END_CHARS = (".", "!", "?")


@dataclass(frozen=True)
class Chunk:
    """An immutable piece of chunked text tagged with a page value.

    The dataclass is frozen, so fields cannot be reassigned after construction.
    """
    # Chunk content; chunk_text builds it by joining words with single spaces.
    text: str
    # Value of chunk_text's `page` argument, stored unchanged
    # (presumably the source page number -- confirm against callers).
    page: int


def _find_sentence_boundary(words: list[str], window_start: int) -> int | None:
    """Return index of last word ending with a sentence terminator within
    [window_start, len(words)), or None if no boundary found.

    The returned index is the inclusive end-index of the sentence: the chunk
    will include words[: idx + 1].
    """
    for i in range(len(words) - 1, window_start - 1, -1):
        if words[i].endswith(SENTENCE_END_CHARS):
            return i
    return None


def chunk_text(text: str, size_words: int, overlap_words: int, page: int) -> list[Chunk]:
    """Split text into overlapping chunks of at most ``size_words`` words.

    The text is split on whitespace and each chunk's words are re-joined with
    single spaces, so the original spacing and line breaks are not preserved.

    A chunk that fills the whole ``size_words`` window is cut at the last
    sentence boundary found in the final 20% of that window (at least one
    word) when there is one; otherwise it ends at exactly ``size_words``
    words. A trailing chunk shorter than the window is emitted as-is.

    Args:
        text: Text to split. Empty or whitespace-only text yields no chunks.
        size_words: Maximum number of words per chunk; must be >= 1.
        overlap_words: Number of words at the end of one chunk that are
            repeated at the start of the next; must be >= 0. If it is at
            least as large as the chunk just produced, the next chunk starts
            one word after the previous start so progress is always made.
        page: Value stored on every returned ``Chunk``.

    Returns:
        The chunks in document order; an empty list for blank text.

    Raises:
        ValueError: If ``size_words`` is less than 1 or ``overlap_words`` is
            negative.
    """
    # Fail fast on bad configuration: a non-positive size used to surface as an
    # opaque IndexError, and a negative overlap silently skipped words between
    # consecutive chunks.
    if size_words < 1:
        raise ValueError(f"size_words must be >= 1, got {size_words}")
    if overlap_words < 0:
        raise ValueError(f"overlap_words must be >= 0, got {overlap_words}")

    words = text.split()
    if not words:
        return []
    if len(words) <= size_words:
        return [Chunk(text=" ".join(words), page=page)]

    chunks: list[Chunk] = []
    total = len(words)
    lookback_window = max(1, int(size_words * 0.2))
    start = 0

    while start < total:
        hard_end = min(start + size_words, total)
        end = hard_end

        # Only a full window is trimmed back to a sentence boundary; a short
        # trailing window is kept whole.
        if hard_end - start == size_words:
            search_start = hard_end - lookback_window
            # Slice just the lookback tail rather than words[:hard_end], which
            # copied the whole prefix on every iteration. The helper's result
            # is relative to the slice, so shift it back by search_start.
            boundary = _find_sentence_boundary(words[search_start:hard_end], 0)
            if boundary is not None:
                end = search_start + boundary + 1

        chunks.append(Chunk(text=" ".join(words[start:end]), page=page))

        if end >= total:
            break

        # Step back by the overlap, but always advance at least one word so the
        # loop terminates even when the overlap covers the whole chunk.
        start = max(end - overlap_words, start + 1)

    return chunks