chore: vector/chunk count assertion and page_num rename in pipeline
This commit is contained in:
@@ -93,6 +93,13 @@ async def process_file(file_path: str, event_type: EventType) -> None:
|
|||||||
logger.exception("embed failed", extra={"event": "embed_failed", "file": file_path, "error": str(exc)})
|
logger.exception("embed failed", extra={"event": "embed_failed", "file": file_path, "error": str(exc)})
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if len(vectors) != len(chunks):
|
||||||
|
logger.error(
|
||||||
|
"vector/chunk count mismatch",
|
||||||
|
extra={"event": "embed_failed", "file": file_path, "vectors": len(vectors), "chunks": len(chunks)},
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
now_iso = datetime.now(timezone.utc).isoformat()
|
now_iso = datetime.now(timezone.utc).isoformat()
|
||||||
file_name = PurePosixPath(file_path).name
|
file_name = PurePosixPath(file_path).name
|
||||||
points = [
|
points = [
|
||||||
@@ -105,13 +112,13 @@ async def process_file(file_path: str, event_type: EventType) -> None:
|
|||||||
"semester": metadata.semester,
|
"semester": metadata.semester,
|
||||||
"fach": metadata.fach,
|
"fach": metadata.fach,
|
||||||
"typ": metadata.typ,
|
"typ": metadata.typ,
|
||||||
"page": page,
|
"page": page_num,
|
||||||
"chunk_index": idx,
|
"chunk_index": idx,
|
||||||
"text": text,
|
"text": text,
|
||||||
"ingested_at": now_iso,
|
"ingested_at": now_iso,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
for vec, (text, page, idx) in zip(vectors, chunks)
|
for vec, (text, page_num, idx) in zip(vectors, chunks)
|
||||||
]
|
]
|
||||||
|
|
||||||
qdrant = _qdrant_client()
|
qdrant = _qdrant_client()
|
||||||
|
|||||||
Reference in New Issue
Block a user