64 lines
2.1 KiB
Python
64 lines
2.1 KiB
Python
import uuid
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from qdrant_client import QdrantClient
|
|
from qdrant_client.http import models as qm
|
|
|
|
|
|
@dataclass(frozen=True)
class ChunkPoint:
    """Immutable pairing of one embedding vector with its payload.

    ``upsert_chunks`` converts each instance into a Qdrant point, passing
    ``vector`` and ``payload`` through unchanged.
    """

    # Embedding values stored as the point's vector.
    vector: list[float]
    # Metadata stored next to the vector as the point's payload.
    payload: dict[str, Any]
|
|
|
|
|
|
def ensure_collection(client: QdrantClient, name: str, vector_size: int) -> None:
    """Create the collection if missing. Crash if it exists with wrong dim.

    A new collection is created with a single cosine-distance vector of
    ``vector_size`` dimensions and keyword payload indexes on ``file_path``,
    ``semester`` and ``fach``.

    Note: payload indexes are created only on initial collection creation;
    they are not reconciled on subsequent runs.

    Args:
        client: Qdrant client used for all requests.
        name: Name of the collection to create or verify.
        vector_size: Dimension of the vectors the caller intends to store.

    Raises:
        RuntimeError: If the collection already exists and either its vector
            dimension differs from ``vector_size`` or it has no single
            unnamed vector config whose dimension could be checked.
    """
    if not client.collection_exists(name):
        client.create_collection(
            collection_name=name,
            vectors_config=qm.VectorParams(size=vector_size, distance=qm.Distance.COSINE),
        )
        for field in ("file_path", "semester", "fach"):
            client.create_payload_index(
                collection_name=name,
                field_name=field,
                field_schema=qm.PayloadSchemaType.KEYWORD,
            )
        return

    info = client.get_collection(name)
    vectors = info.config.params.vectors
    # Only a single unnamed vector config carries ``size`` directly. A
    # named-vector mapping (or a missing config) has no such attribute, so
    # fail with an explicit message instead of an opaque AttributeError.
    existing = getattr(vectors, "size", None)
    if existing is None:
        raise RuntimeError(
            f"qdrant collection '{name}' has no single unnamed vector config "
            f"(found {type(vectors).__name__}); cannot check its dimension "
            f"against model={vector_size}. "
            "Drop the collection manually and run a bulk import."
        )
    if existing != vector_size:
        raise RuntimeError(
            f"qdrant collection '{name}' dimension mismatch: "
            f"existing={existing}, model={vector_size}. "
            "Drop the collection manually and run a bulk import."
        )
|
|
|
|
|
|
def upsert_chunks(client: QdrantClient, name: str, chunks: list[ChunkPoint]) -> None:
    """Insert chunks with fresh UUID ids.

    Caller is responsible for deduplication: call ``delete_by_path`` for the
    file before re-ingesting, otherwise duplicates accumulate.

    Args:
        client: Qdrant client used for the upsert request.
        name: Name of the target collection.
        chunks: Vectors and payloads to store; may be empty.
    """
    # Nothing to write: skip the request instead of sending an empty batch.
    if not chunks:
        return

    # Every point gets a new random id, so re-ingesting the same chunk adds a
    # second point rather than overwriting the first.
    points = [
        qm.PointStruct(id=str(uuid.uuid4()), vector=c.vector, payload=c.payload)
        for c in chunks
    ]
    client.upsert(collection_name=name, points=points)
|
|
|
|
|
|
def delete_by_path(client: QdrantClient, name: str, file_path: str) -> None:
    """Delete the points in collection *name* whose ``file_path`` matches.

    The deletion is expressed as a filter on the ``file_path`` payload key
    matching the given *file_path* value.
    """
    path_condition = qm.FieldCondition(
        key="file_path",
        match=qm.MatchValue(value=file_path),
    )
    path_filter = qm.Filter(must=[path_condition])
    client.delete(
        collection_name=name,
        points_selector=qm.FilterSelector(filter=path_filter),
    )
|