chore: vector/chunk count assertion and page_num rename in pipeline
This commit is contained in:
@@ -93,6 +93,13 @@ async def process_file(file_path: str, event_type: EventType) -> None:
|
||||
logger.exception("embed failed", extra={"event": "embed_failed", "file": file_path, "error": str(exc)})
|
||||
return
|
||||
|
||||
if len(vectors) != len(chunks):
|
||||
logger.error(
|
||||
"vector/chunk count mismatch",
|
||||
extra={"event": "embed_failed", "file": file_path, "vectors": len(vectors), "chunks": len(chunks)},
|
||||
)
|
||||
return
|
||||
|
||||
now_iso = datetime.now(timezone.utc).isoformat()
|
||||
file_name = PurePosixPath(file_path).name
|
||||
points = [
|
||||
@@ -105,13 +112,13 @@ async def process_file(file_path: str, event_type: EventType) -> None:
|
||||
"semester": metadata.semester,
|
||||
"fach": metadata.fach,
|
||||
"typ": metadata.typ,
|
||||
"page": page,
|
||||
"page": page_num,
|
||||
"chunk_index": idx,
|
||||
"text": text,
|
||||
"ingested_at": now_iso,
|
||||
},
|
||||
)
|
||||
for vec, (text, page, idx) in zip(vectors, chunks)
|
||||
for vec, (text, page_num, idx) in zip(vectors, chunks)
|
||||
]
|
||||
|
||||
qdrant = _qdrant_client()
|
||||
|
||||
Reference in New Issue
Block a user