feat: duration_ms-logging, bulk-semaphore und erweitertes README

- Pipeline-Stages (download/extract/embed/qdrant) loggen jetzt duration_ms
- bulk-import dispatcht mit Semaphore(4) statt unbounded → Backpressure
- README dokumentiert Webhook-Payload-Schema mit curl-Beispiel
- README enthält Recovery-Runbook (dim-mismatch, crash-recovery, single-file reindex)
This commit is contained in:
2026-05-04 22:54:58 +02:00
parent 7fe2d853ec
commit ca9ff55587
3 changed files with 92 additions and 3 deletions

View File

@@ -1,4 +1,5 @@
import logging
import time
from datetime import datetime, timezone
from functools import lru_cache
from pathlib import PurePosixPath
@@ -49,6 +50,7 @@ async def process_file(file_path: str, event_type: EventType) -> None:
)
return
t0 = time.perf_counter()
try:
data = await download_file(
settings.nextcloud_webdav_url,
@@ -59,7 +61,13 @@ async def process_file(file_path: str, event_type: EventType) -> None:
except Exception as exc:
logger.exception("download failed", extra={"event": "download_failed", "file": file_path, "error": str(exc)})
return
download_ms = int((time.perf_counter() - t0) * 1000)
logger.info(
"download ok",
extra={"event": "download", "status": "ok", "file": file_path, "duration_ms": download_ms, "bytes": len(data)},
)
t0 = time.perf_counter()
try:
pages = extract(data, extension, filename=PurePosixPath(file_path).name)
except UnsupportedFileType:
@@ -68,6 +76,11 @@ async def process_file(file_path: str, event_type: EventType) -> None:
except Exception as exc:
logger.exception("extract failed", extra={"event": "extract_failed", "file": file_path, "error": str(exc)})
return
extract_ms = int((time.perf_counter() - t0) * 1000)
logger.info(
"extract ok",
extra={"event": "extract", "status": "ok", "file": file_path, "duration_ms": extract_ms, "pages": len(pages)},
)
chunks: list[tuple[str, int, int]] = [] # (text, page, chunk_index)
chunk_index = 0
@@ -87,11 +100,17 @@ async def process_file(file_path: str, event_type: EventType) -> None:
delete_by_path(_qdrant_client(), settings.qdrant_collection, file_path)
return
t0 = time.perf_counter()
try:
vectors = await embed_texts([c[0] for c in chunks], model=settings.ollama_embed_model)
except Exception as exc:
logger.exception("embed failed", extra={"event": "embed_failed", "file": file_path, "error": str(exc)})
return
embed_ms = int((time.perf_counter() - t0) * 1000)
logger.info(
"embed ok",
extra={"event": "embed", "status": "ok", "file": file_path, "duration_ms": embed_ms, "chunks": len(vectors)},
)
if len(vectors) != len(chunks):
logger.error(
@@ -121,11 +140,19 @@ async def process_file(file_path: str, event_type: EventType) -> None:
for vec, (text, page_num, idx) in zip(vectors, chunks)
]
t0 = time.perf_counter()
qdrant = _qdrant_client()
delete_by_path(qdrant, settings.qdrant_collection, file_path)
upsert_chunks(qdrant, settings.qdrant_collection, points)
qdrant_ms = int((time.perf_counter() - t0) * 1000)
logger.info(
"ingested",
extra={"event": "ingest_done", "file": file_path, "chunks": len(points)},
extra={
"event": "ingest_done",
"status": "ok",
"file": file_path,
"chunks": len(points),
"duration_ms": qdrant_ms,
},
)