from dataclasses import dataclass SENTENCE_END_CHARS = (".", "!", "?") @dataclass(frozen=True) class Chunk: text: str page: int def _find_sentence_boundary(words: list[str], window_start: int) -> int | None: """Return index of last word ending with a sentence terminator within [window_start, len(words)), or None if no boundary found. The returned index is the inclusive end-index of the sentence: the chunk will include words[: idx + 1]. """ for i in range(len(words) - 1, window_start - 1, -1): if words[i].endswith(SENTENCE_END_CHARS): return i return None def chunk_text(text: str, size_words: int, overlap_words: int, page: int) -> list[Chunk]: """Split text into ≤size_words chunks with overlap_words overlap. Each chunk ends at the last sentence boundary in the final 20% of the `size_words` window when possible; otherwise it ends at exactly `size_words`. """ if not text.strip(): return [] words = text.split() if len(words) <= size_words: return [Chunk(text=" ".join(words), page=page)] chunks: list[Chunk] = [] start = 0 lookback_window = max(1, int(size_words * 0.2)) while start < len(words): hard_end = min(start + size_words, len(words)) # Search for sentence boundary in last 20% of the window if hard_end - start == size_words: boundary_search_start = hard_end - lookback_window boundary = _find_sentence_boundary(words[: hard_end], boundary_search_start) end = boundary + 1 if boundary is not None else hard_end else: end = hard_end chunks.append(Chunk(text=" ".join(words[start:end]), page=page)) if end >= len(words): break # Step forward: end - overlap, but never less than start + 1 next_start = max(end - overlap_words, start + 1) start = next_start return chunks