chore: add vector/chunk count mismatch check and rename page to page_num in pipeline

This commit is contained in:
2026-05-04 22:35:17 +02:00
parent 02c8f5d338
commit 61e00028e8

View File

@@ -93,6 +93,13 @@ async def process_file(file_path: str, event_type: EventType) -> None:
logger.exception("embed failed", extra={"event": "embed_failed", "file": file_path, "error": str(exc)}) logger.exception("embed failed", extra={"event": "embed_failed", "file": file_path, "error": str(exc)})
return return
if len(vectors) != len(chunks):
logger.error(
"vector/chunk count mismatch",
extra={"event": "embed_failed", "file": file_path, "vectors": len(vectors), "chunks": len(chunks)},
)
return
now_iso = datetime.now(timezone.utc).isoformat() now_iso = datetime.now(timezone.utc).isoformat()
file_name = PurePosixPath(file_path).name file_name = PurePosixPath(file_path).name
points = [ points = [
@@ -105,13 +112,13 @@ async def process_file(file_path: str, event_type: EventType) -> None:
"semester": metadata.semester, "semester": metadata.semester,
"fach": metadata.fach, "fach": metadata.fach,
"typ": metadata.typ, "typ": metadata.typ,
"page": page, "page": page_num,
"chunk_index": idx, "chunk_index": idx,
"text": text, "text": text,
"ingested_at": now_iso, "ingested_at": now_iso,
}, },
) )
for vec, (text, page, idx) in zip(vectors, chunks) for vec, (text, page_num, idx) in zip(vectors, chunks)
] ]
qdrant = _qdrant_client() qdrant = _qdrant_client()