feat: MCP-Server für RAG-Retrieval + Webhook-Härtung

app/mcp_server.py: FastMCP (mcp SDK), streamable-http auf /mcp, statischer Bearer-Token (constant-time ASGI-Middleware), Fail-Fast ohne RAG_MCP_TOKEN. Tools rag_search (mit semester/fach/typ-Filter) + get_file_chunks. Läuft aus demselben Image wie der Ingestor und nutzt den Embed-Pfad wieder → Vektoren sind garantiert kompatibel zum Ingest (der offizielle qdrant-MCP-Server kann nur fastembed → Dimension-/Schema-Mismatch). app/qdrant_store.py: search_chunks (query_points + optionaler Payload-Filter) und get_chunks_by_path (scroll, nach chunk_index sortiert). app/bulk.py: Amplification-Guard — /bulk-import lehnt mit 409 ab, solange ein vorheriger Bulk noch BackgroundTasks abarbeitet. docker-compose.coolify.yml: rag-mcp-Service (nicht public, externes metamcp-net statt Stack-Coupling) + Traefik-Rate-Limit-Middleware am ingestor. tests/conftest.py: Settings-env_file in Tests neutralisieren (Dev-.env darf die Suite nicht kontaminieren). 68 passed, ruff clean. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 22:08:37 +02:00
parent a6a2175f8b
commit 9643011e64
12 changed files with 935 additions and 8 deletions
--- a/app/bulk.py
+++ b/app/bulk.py
@@ -24,10 +24,21 @@ router = APIRouter()
 BULK_CONCURRENCY = 4
 _bulk_semaphore = asyncio.Semaphore(BULK_CONCURRENCY)

+# Amplification guard: one /bulk-import dispatches one BackgroundTask per
+# matching file with no upper bound. Repeated calls (a misfiring Nextcloud
+# flow, or an attacker holding the secret) would pile unbounded tasks. We
+# track outstanding dispatched work and reject a new bulk while any is
+# still draining — bulk runs are rare, so serialising them is acceptable.
+_inflight = 0
+

 async def _process_with_semaphore(file_path: str, event_type: EventType) -> None:
-    async with _bulk_semaphore:
-        await process_file(file_path, event_type)
+    """Process one bulk-dispatched file and release its in-flight slot.
+
+    Args:
+        file_path: Path of the file to ingest, as dispatched by /bulk-import.
+        event_type: Event type forwarded unchanged to ``process_file``.
+
+    The ``_inflight`` counter incremented at dispatch time is always
+    decremented here, whether processing succeeds or fails.
+    """
+    global _inflight
+    try:
+        async with _bulk_semaphore:
+            await process_file(file_path, event_type)
+    except Exception:
+        # Starlette runs background tasks one after another and stops at the
+        # first task that raises. The skipped tasks would never reach their
+        # ``finally`` below, so ``_inflight`` would stay > 0 and every later
+        # /bulk-import would be rejected with 409 until the process restarts.
+        # Contain the failure to this file and keep the queue draining.
+        logger.exception(
+            "bulk file processing failed",
+            extra={"event": "bulk_file_failed", "path": file_path},
+        )
+    finally:
+        _inflight -= 1


 class BulkRequest(BaseModel):
@@ -83,9 +94,20 @@ async def bulk_import(
    background: BackgroundTasks,
    x_webhook_secret: str | None = Header(default=None),
 ):
+    global _inflight
    settings = get_settings()
    verify_secret(x_webhook_secret, settings.webhook_secret)

+    if _inflight > 0:
+        logger.warning(
+            "bulk rejected: import already in progress",
+            extra={"event": "bulk_rejected", "path": body.path, "inflight": _inflight},
+        )
+        raise HTTPException(
+            status_code=status.HTTP_409_CONFLICT,
+            detail="a bulk import is already in progress",
+        )
+
    try:
        files = await list_files_recursive(
            settings.nextcloud_webdav_url,
@@ -105,6 +127,7 @@ async def bulk_import(
        ext = PurePosixPath(f).suffix.lstrip(".").lower()
        if ext not in SUPPORTED_TYPES:
            continue
+        _inflight += 1
        background.add_task(_process_with_semaphore, f, EventType.CREATED)
        dispatched += 1

--- a/app/config.py
+++ b/app/config.py
@@ -23,6 +23,12 @@ class Settings(BaseSettings):
    chunk_overlap_words: int = 50
    log_level: str = "INFO"

+    # MCP server (app.mcp_server). Optional so the ingestor — which shares
+    # this Settings model — is unaffected. The MCP server itself refuses to
+    # start when rag_mcp_token is empty.
+    rag_mcp_token: str = ""
+    rag_mcp_port: int = 9009
+

@lru_cache(maxsize=1)
 def get_settings() -> Settings:
--- a/app/mcp_server.py
+++ b/app/mcp_server.py
@@ -0,0 +1,147 @@
+"""MCP server exposing the THB-Studium RAG corpus.
+
+Runs from the *same* image as the ingestor and reuses its embedding path
+(`app.ingest.embedder`), so query vectors are produced by the exact model
+used at ingest time — the only way Qdrant search returns meaningful hits.
+
+Transport: streamable-http on ``/mcp``. A static bearer token gates every
+request; the token is the second control layer behind network isolation
+(the service is only reachable by MetaMCP over a dedicated bridge).
+"""
+
+import hmac
+import logging
+import sys
+from functools import lru_cache
+
+import uvicorn
+from mcp.server.fastmcp import FastMCP
+from qdrant_client import QdrantClient
+from starlette.types import Receive, Scope, Send
+
+from app.config import get_settings
+from app.ingest.embedder import embed_texts
+from app.logging_setup import setup_logging
+from app.qdrant_store import get_chunks_by_path, search_chunks
+
+logger = logging.getLogger(__name__)
+
+mcp = FastMCP("rag-thb")
+
+
+@lru_cache(maxsize=1)
+def _qdrant() -> QdrantClient:
+    """Return the shared Qdrant client, building it on the first call.
+
+    ``lru_cache(maxsize=1)`` memoises this zero-argument function, so all
+    tool invocations reuse one client pointed at the configured
+    ``qdrant_url``.
+    """
+    qdrant_url = get_settings().qdrant_url
+    client = QdrantClient(url=qdrant_url)
+    return client
+
+
+# Hard ceiling for ``rag_search(limit=...)``. The tool is driven by LLM
+# clients, so an absurd value must not turn one call into a huge Qdrant
+# read and an oversized response.
+_MAX_SEARCH_LIMIT = 100
+
+
+# NOTE: the docstring below is intentionally left in German and unchanged.
+# FastMCP publishes a tool's docstring as the tool description shown to MCP
+# clients, so it is runtime text rather than a plain comment.
+@mcp.tool()
+async def rag_search(
+    query: str,
+    limit: int = 5,
+    semester: str | None = None,
+    fach: str | None = None,
+    typ: str | None = None,
+) -> list[dict]:
+    """Semantische Suche im THB-Studium-Wissen (Vorlesungen, Übungen, Notizen).
+
+    Args:
+        query: Natürlichsprachige Suchanfrage.
+        limit: Maximale Trefferzahl (Default 5).
+        semester: Optionaler Filter, z.B. "2.Semester".
+        fach: Optionaler Filter, z.B. "Databases".
+        typ: Optionaler Filter, z.B. "Vorlesungen" oder "Uebungen".
+
+    Returns:
+        Treffer mit text und Quell-Metadaten (file_path, semester, fach,
+        typ, page, chunk_index) plus Similarity-score, absteigend sortiert.
+    """
+    # A blank query carries nothing to embed; answer with "no hits" instead
+    # of spending an embedding round-trip on it.
+    if not query or not query.strip():
+        return []
+
+    # Clamp into [1, _MAX_SEARCH_LIMIT]: zero or negative values are not a
+    # meaningful result count, and the upper bound caps response size.
+    bounded_limit = max(1, min(limit, _MAX_SEARCH_LIMIT))
+
+    settings = get_settings()
+    # Embed with the same model setting the ingestor uses so that query and
+    # stored vectors are comparable.
+    vectors = await embed_texts([query], model=settings.ollama_embed_model)
+    # NOTE(review): search_chunks is a synchronous function, so this call
+    # appears to block the event loop for the duration of the Qdrant
+    # request — confirm whether that is acceptable for the expected load.
+    return search_chunks(
+        _qdrant(),
+        settings.qdrant_collection,
+        vectors[0],
+        limit=bounded_limit,
+        semester=semester,
+        fach=fach,
+        typ=typ,
+    )
+
+
+# NOTE: the docstring is kept verbatim (German) because FastMCP exposes a
+# tool's docstring to MCP clients as the tool description.
+@mcp.tool()
+async def get_file_chunks(file_path: str) -> list[dict]:
+    """Alle Chunks eines Dokuments in Reihenfolge laden.
+
+    Nützlich, um nach einem rag_search-Treffer das vollständige Dokument
+    zu rekonstruieren.
+
+    Args:
+        file_path: Exakter Nextcloud-Pfad wie in rag_search-Treffern, z.B.
+            "Documents/THB/2.Semester/Databases/Uebungen/01/Loesung.pdf".
+
+    Returns:
+        Chunks mit chunk_index, page und text, nach chunk_index sortiert.
+    """
+    # Resolve the collection first, then the shared client, and delegate the
+    # lookup and ordering to the store helper.
+    collection = get_settings().qdrant_collection
+    client = _qdrant()
+    return get_chunks_by_path(client, collection, file_path)
+
+
+class BearerAuthMiddleware:
+    """Pure-ASGI gate: constant-time check of ``Authorization: Bearer <token>``.
+
+    Non-HTTP scopes (lifespan, websocket) pass straight through so the
+    StreamableHTTP session manager's lifespan still runs.
+
+    The header is compared as raw bytes. Decoding it first would let an
+    unauthenticated client trigger an unhandled exception: ``bytes.decode``
+    raises on invalid UTF-8, and ``hmac.compare_digest`` raises ``TypeError``
+    for ``str`` arguments containing non-ASCII characters. Comparing bytes
+    turns every malformed header into an ordinary 401.
+    """
+
+    def __init__(self, app, token: str) -> None:
+        """Wrap ``app`` and remember the exact header value that grants access.
+
+        Args:
+            app: The downstream ASGI application.
+            token: The shared secret; the accepted header is ``Bearer <token>``.
+        """
+        self._app = app
+        self._expected = f"Bearer {token}"
+        # Encoded once so each request is a plain bytes-to-bytes comparison.
+        self._expected_raw = self._expected.encode("utf-8")
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
+        """Forward authorised HTTP requests; answer everything else with 401."""
+        if scope["type"] != "http":
+            await self._app(scope, receive, send)
+            return
+
+        # ASGI delivers headers as (name, value) byte pairs with lower-cased
+        # names; a missing header is treated as an empty credential.
+        headers = dict(scope.get("headers") or [])
+        provided = headers.get(b"authorization", b"")
+        if not hmac.compare_digest(provided, self._expected_raw):
+            await send(
+                {
+                    "type": "http.response.start",
+                    "status": 401,
+                    "headers": [
+                        (b"content-type", b"text/plain"),
+                        # Tell the client which auth scheme is expected.
+                        (b"www-authenticate", b"Bearer"),
+                    ],
+                }
+            )
+            await send({"type": "http.response.body", "body": b"unauthorized"})
+            return
+
+        await self._app(scope, receive, send)
+
+
+def build_app():
+    """Build the bearer-protected ASGI app; exit the process without a token.
+
+    Reads the MCP token and port from settings. With an empty token the
+    function logs an error and calls ``sys.exit(1)``. Otherwise it sets the
+    FastMCP host/port settings and wraps the streamable-http app in
+    ``BearerAuthMiddleware``.
+    """
+    settings = get_settings()
+    token = settings.rag_mcp_token
+    if not token:
+        # Fail fast: never serve the corpus without the token gate.
+        logger.error(
+            "refusing to start: RAG_MCP_TOKEN is empty",
+            extra={"event": "mcp_startup_abort"},
+        )
+        sys.exit(1)
+
+    mcp.settings.host = "0.0.0.0"
+    mcp.settings.port = settings.rag_mcp_port
+    inner_app = mcp.streamable_http_app()
+    return BearerAuthMiddleware(inner_app, token)
+
+
+def main() -> None:
+    """Entry point: configure logging, build the gated app, serve via uvicorn."""
+    settings = get_settings()
+    setup_logging(settings.log_level)
+    # build_app() exits the process itself when no token is configured.
+    asgi_app = build_app()
+    port = settings.rag_mcp_port
+    startup_fields = {"event": "mcp_startup", "port": port}
+    logger.info("mcp server starting", extra=startup_fields)
+    uvicorn.run(asgi_app, host="0.0.0.0", port=port)
+
+
+if __name__ == "__main__":
+    main()
--- a/app/qdrant_store.py
+++ b/app/qdrant_store.py
@@ -61,3 +61,84 @@ def delete_by_path(client: QdrantClient, name: str, file_path: str) -> None:
        )
    )
    client.delete(collection_name=name, points_selector=selector)
+
+
+_RESULT_FIELDS = (
+    "text",
+    "file_path",
+    "file_name",
+    "semester",
+    "fach",
+    "typ",
+    "page",
+    "chunk_index",
+)
+
+
+def _payload_filter(
+    semester: str | None, fach: str | None, typ: str | None
+) -> qm.Filter | None:
+    """Translate the optional metadata constraints into a Qdrant filter.
+
+    Each truthy argument becomes an exact-match condition on the payload key
+    of the same name; ``None`` and empty strings are skipped. Returns
+    ``None`` when no constraint remains, otherwise a filter requiring all of
+    them (``must``).
+    """
+    constraints = {"semester": semester, "fach": fach, "typ": typ}
+    must = []
+    for payload_key, wanted in constraints.items():
+        if not wanted:
+            continue
+        must.append(
+            qm.FieldCondition(key=payload_key, match=qm.MatchValue(value=wanted))
+        )
+    if not must:
+        return None
+    return qm.Filter(must=must)
+
+
+def search_chunks(
+    client: QdrantClient,
+    name: str,
+    vector: list[float],
+    limit: int,
+    semester: str | None = None,
+    fach: str | None = None,
+    typ: str | None = None,
+) -> list[dict[str, Any]]:
+    """Run a vector query against collection ``name`` and flatten the hits.
+
+    Args:
+        client: Qdrant client used for the query.
+        name: Collection name.
+        vector: Query embedding; it must come from the same model that was
+            used at ingest time, otherwise the scores are meaningless.
+        limit: Maximum number of hits requested from Qdrant.
+        semester, fach, typ: Optional exact-match payload constraints.
+
+    Returns:
+        One dict per hit holding every key of ``_RESULT_FIELDS`` (``None``
+        when the payload lacks it) plus the similarity ``score``.
+    """
+
+    def _as_row(hit) -> dict[str, Any]:
+        # A hit without payload still yields a row, with all fields None.
+        stored = hit.payload or {}
+        return {
+            **{field: stored.get(field) for field in _RESULT_FIELDS},
+            "score": hit.score,
+        }
+
+    metadata_filter = _payload_filter(semester, fach, typ)
+    result = client.query_points(
+        collection_name=name,
+        query=vector,
+        limit=limit,
+        query_filter=metadata_filter,
+        with_payload=True,
+    )
+    return [_as_row(hit) for hit in result.points]
+
+
+def get_chunks_by_path(
+    client: QdrantClient, name: str, file_path: str
+) -> list[dict[str, Any]]:
+    """Return every chunk of one document, ordered by ``chunk_index``.
+
+    Args:
+        client: Qdrant client used for the scroll requests.
+        name: Collection name.
+        file_path: Exact ``file_path`` payload value identifying the document.
+
+    Returns:
+        Dicts with ``chunk_index``, ``page`` and ``text`` for every stored
+        point of the document that has a payload. Rows whose ``chunk_index``
+        is missing sort as if it were 0.
+    """
+    path_filter = qm.Filter(
+        must=[qm.FieldCondition(key="file_path", match=qm.MatchValue(value=file_path))]
+    )
+    rows: list[dict[str, Any]] = []
+    next_offset = None
+    while True:
+        # scroll() returns one page plus the offset of the next page. A
+        # single call would silently drop everything beyond the page size,
+        # so keep following the offset until Qdrant reports the end.
+        points, next_offset = client.scroll(
+            collection_name=name,
+            scroll_filter=path_filter,
+            limit=10_000,
+            offset=next_offset,
+            with_payload=True,
+            with_vectors=False,
+        )
+        rows.extend(
+            {
+                "chunk_index": p.payload.get("chunk_index"),
+                "page": p.payload.get("page"),
+                "text": p.payload.get("text"),
+            }
+            for p in points
+            if p.payload is not None
+        )
+        # An empty page is treated as the end too, so an unexpected offset
+        # on an empty result cannot loop forever.
+        if next_offset is None or not points:
+            break
+    rows.sort(key=lambda r: r["chunk_index"] if r["chunk_index"] is not None else 0)
+    return rows