feat: MCP-Server für RAG-Retrieval + Webhook-Härtung
All checks were successful
CI / ci (push) Successful in 49s
Release / release (push) Successful in 1m2s

app/mcp_server.py: FastMCP (mcp SDK), streamable-http auf /mcp, statischer
Bearer-Token (constant-time ASGI-Middleware), Fail-Fast ohne RAG_MCP_TOKEN.
Tools rag_search (mit semester/fach/typ-Filter) + get_file_chunks. Läuft aus
demselben Image wie der Ingestor und reused den Embed-Pfad → Vektoren sind
garantiert kompatibel zum Ingest (der offizielle qdrant-MCP-Server kann nur
fastembed → Dimension-/Schema-Mismatch).

app/qdrant_store.py: search_chunks (query_points + optionaler Payload-Filter)
und get_chunks_by_path (scroll, nach chunk_index sortiert).

app/bulk.py: Amplification-Guard — /bulk-import lehnt mit 409 ab solange ein
vorheriger Bulk noch BackgroundTasks abarbeitet.

docker-compose.coolify.yml: rag-mcp-Service (nicht public, externes
metamcp-net statt Stack-Coupling) + Traefik-Rate-Limit-Middleware am ingestor.

tests/conftest.py: Settings-env_file in Tests neutralisieren (Dev-.env darf
die Suite nicht kontaminieren). 68 passed, ruff clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-18 22:08:37 +02:00
parent a6a2175f8b
commit 9643011e64
12 changed files with 935 additions and 8 deletions

View File

@@ -24,10 +24,21 @@ router = APIRouter()
BULK_CONCURRENCY = 4
_bulk_semaphore = asyncio.Semaphore(BULK_CONCURRENCY)
# Amplification guard: one /bulk-import dispatches one BackgroundTask per
# matching file with no upper bound. Repeated calls (a misfiring Nextcloud
# flow, or an attacker holding the secret) would pile unbounded tasks. We
# track outstanding dispatched work and reject a new bulk while any is
# still draining — bulk runs are rare, so serialising them is acceptable.
_inflight = 0
async def _process_with_semaphore(file_path: str, event_type: EventType) -> None:
async with _bulk_semaphore:
await process_file(file_path, event_type)
global _inflight
try:
async with _bulk_semaphore:
await process_file(file_path, event_type)
finally:
_inflight -= 1
class BulkRequest(BaseModel):
@@ -83,9 +94,20 @@ async def bulk_import(
background: BackgroundTasks,
x_webhook_secret: str | None = Header(default=None),
):
global _inflight
settings = get_settings()
verify_secret(x_webhook_secret, settings.webhook_secret)
if _inflight > 0:
logger.warning(
"bulk rejected: import already in progress",
extra={"event": "bulk_rejected", "path": body.path, "inflight": _inflight},
)
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail="a bulk import is already in progress",
)
try:
files = await list_files_recursive(
settings.nextcloud_webdav_url,
@@ -105,6 +127,7 @@ async def bulk_import(
ext = PurePosixPath(f).suffix.lstrip(".").lower()
if ext not in SUPPORTED_TYPES:
continue
_inflight += 1
background.add_task(_process_with_semaphore, f, EventType.CREATED)
dispatched += 1

View File

@@ -23,6 +23,12 @@ class Settings(BaseSettings):
chunk_overlap_words: int = 50
log_level: str = "INFO"
# MCP server (app.mcp_server). Optional so the ingestor — which shares
# this Settings model — is unaffected. The MCP server itself refuses to
# start when rag_mcp_token is empty.
rag_mcp_token: str = ""
rag_mcp_port: int = 9009
@lru_cache(maxsize=1)
def get_settings() -> Settings:

147
app/mcp_server.py Normal file
View File

@@ -0,0 +1,147 @@
"""MCP server exposing the THB-Studium RAG corpus.
Runs from the *same* image as the ingestor and reuses its embedding path
(`app.ingest.embedder`), so query vectors are produced by the exact model
used at ingest time — the only way Qdrant search returns meaningful hits.
Transport: streamable-http on ``/mcp``. A static bearer token gates every
request; the token is the second control layer behind network isolation
(the service is only reachable by MetaMCP over a dedicated bridge).
"""
import hmac
import logging
import sys
from functools import lru_cache
import uvicorn
from mcp.server.fastmcp import FastMCP
from qdrant_client import QdrantClient
from starlette.types import Receive, Scope, Send
from app.config import get_settings
from app.ingest.embedder import embed_texts
from app.logging_setup import setup_logging
from app.qdrant_store import get_chunks_by_path, search_chunks
logger = logging.getLogger(__name__)
mcp = FastMCP("rag-thb")
@lru_cache(maxsize=1)
def _qdrant() -> QdrantClient:
    """Return the shared Qdrant client, building it on the first call.

    The function takes no arguments, so ``lru_cache(maxsize=1)`` memoises a
    single client pointed at the ``qdrant_url`` from the application settings.
    """
    qdrant_url = get_settings().qdrant_url
    return QdrantClient(url=qdrant_url)
@mcp.tool()
async def rag_search(
    query: str,
    limit: int = 5,
    semester: str | None = None,
    fach: str | None = None,
    typ: str | None = None,
) -> list[dict]:
    """Semantische Suche im THB-Studium-Wissen (Vorlesungen, Übungen, Notizen).

    Args:
        query: Natürlichsprachige Suchanfrage.
        limit: Maximale Trefferzahl (Default 5).
        semester: Optionaler Filter, z.B. "2.Semester".
        fach: Optionaler Filter, z.B. "Databases".
        typ: Optionaler Filter, z.B. "Vorlesungen" oder "Uebungen".

    Returns:
        Treffer mit text und Quell-Metadaten (file_path, semester, fach,
        typ, page, chunk_index) plus Similarity-score, absteigend sortiert.
    """
    # NOTE(review): the docstring above is deliberately left in German and
    # unchanged — the tool decorator presumably publishes it to MCP clients as
    # the tool description, which would make it runtime text. Confirm before
    # rewording or translating it.

    # Function-scope import keeps this block self-contained.
    import asyncio

    settings = get_settings()

    # Embed the query with the configured embedding model; the result is a
    # list with one vector per input text, so vectors[0] belongs to `query`.
    vectors = await embed_texts([query], model=settings.ollama_embed_model)

    # search_chunks is a plain synchronous function driving the synchronous
    # QdrantClient. Calling it directly inside this coroutine would block the
    # event loop (and with it every other in-flight MCP request) for the whole
    # network round-trip, so it is pushed onto a worker thread instead.
    return await asyncio.to_thread(
        search_chunks,
        _qdrant(),
        settings.qdrant_collection,
        vectors[0],
        limit=limit,
        semester=semester,
        fach=fach,
        typ=typ,
    )
@mcp.tool()
async def get_file_chunks(file_path: str) -> list[dict]:
    """Alle Chunks eines Dokuments in Reihenfolge laden.

    Nützlich, um nach einem rag_search-Treffer das vollständige Dokument
    zu rekonstruieren.

    Args:
        file_path: Exakter Nextcloud-Pfad wie in rag_search-Treffern, z.B.
            "Documents/THB/2.Semester/Databases/Uebungen/01/Loesung.pdf".

    Returns:
        Chunks mit chunk_index, page und text, nach chunk_index sortiert.
    """
    # NOTE(review): the docstring above is deliberately left in German and
    # unchanged — the tool decorator presumably publishes it to MCP clients as
    # the tool description, which would make it runtime text. Confirm before
    # rewording or translating it.

    # Function-scope import keeps this block self-contained.
    import asyncio

    settings = get_settings()

    # get_chunks_by_path performs synchronous Qdrant I/O; run it on a worker
    # thread so this coroutine does not stall the event loop while waiting.
    return await asyncio.to_thread(
        get_chunks_by_path,
        _qdrant(),
        settings.qdrant_collection,
        file_path,
    )
class BearerAuthMiddleware:
    """Pure-ASGI gate: constant-time check of ``Authorization: Bearer <token>``.

    Non-HTTP scopes (lifespan, websocket) pass straight through so the
    StreamableHTTP session manager's lifespan still runs.

    HTTP requests whose ``Authorization`` header does not match exactly are
    answered with a plain-text ``401 unauthorized`` and never reach the
    wrapped app.
    """

    def __init__(self, app, token: str) -> None:
        """Wrap ``app`` so that HTTP requests must present ``token``.

        Args:
            app: The downstream ASGI application.
            token: The static bearer token clients have to send.
        """
        self._app = app
        # Stored as bytes so the comparison works on the raw header value.
        # Decoding attacker-controlled bytes first can raise
        # UnicodeDecodeError, and hmac.compare_digest rejects non-ASCII str
        # arguments with TypeError — either would turn a bad credential into
        # an unhandled exception instead of a clean 401.
        self._expected = f"Bearer {token}".encode()

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """Forward the request if authorised, otherwise reply with 401."""
        if scope["type"] != "http":
            await self._app(scope, receive, send)
            return

        # ASGI delivers headers as (name, value) byte pairs with lower-cased
        # names; a missing header is treated as an empty credential.
        headers = dict(scope.get("headers") or [])
        provided = headers.get(b"authorization", b"")

        # bytes-vs-bytes compare_digest accepts arbitrary byte values and
        # keeps the comparison constant-time.
        if not hmac.compare_digest(provided, self._expected):
            await send(
                {
                    "type": "http.response.start",
                    "status": 401,
                    "headers": [(b"content-type", b"text/plain")],
                }
            )
            await send({"type": "http.response.body", "body": b"unauthorized"})
            return

        await self._app(scope, receive, send)
def build_app():
    """Build the bearer-token-protected ASGI app.

    Reads the settings, points the FastMCP instance at ``0.0.0.0`` and the
    configured port, and wraps its streamable-HTTP app in
    ``BearerAuthMiddleware``.

    When ``rag_mcp_token`` is empty, an error is logged and the process exits
    with status 1 instead of serving without a token.
    """
    settings = get_settings()
    token = settings.rag_mcp_token

    if token:
        mcp.settings.host = "0.0.0.0"
        mcp.settings.port = settings.rag_mcp_port
        return BearerAuthMiddleware(mcp.streamable_http_app(), token)

    # No token configured: fail fast rather than start unprotected.
    logger.error(
        "refusing to start: RAG_MCP_TOKEN is empty",
        extra={"event": "mcp_startup_abort"},
    )
    sys.exit(1)
def main() -> None:
    """Entry point: set up logging, build the gated app and serve it.

    Logging is configured before ``build_app()`` runs so that its abort
    message is emitted through the configured logging setup.
    """
    settings = get_settings()
    setup_logging(settings.log_level)

    app = build_app()
    port = settings.rag_mcp_port

    startup_fields = {"event": "mcp_startup", "port": port}
    logger.info("mcp server starting", extra=startup_fields)

    uvicorn.run(app, host="0.0.0.0", port=port)
if __name__ == "__main__":
main()

View File

@@ -61,3 +61,84 @@ def delete_by_path(client: QdrantClient, name: str, file_path: str) -> None:
)
)
client.delete(collection_name=name, points_selector=selector)
# Payload keys copied into every search_chunks result row. A key missing from
# a point's payload is reported as None rather than omitted.
_RESULT_FIELDS = (
    "text",
    "file_path",
    "file_name",
    "semester",
    "fach",
    "typ",
    "page",
    "chunk_index",
)
def _payload_filter(
    semester: str | None, fach: str | None, typ: str | None
) -> qm.Filter | None:
    """Turn the optional metadata constraints into a Qdrant filter.

    Each truthy argument becomes an exact-match condition on the payload key
    of the same name; ``None`` and empty strings are skipped. All conditions
    are combined under ``must``.

    Returns:
        The combined filter, or ``None`` when no constraint was supplied.
    """
    constraints = {"semester": semester, "fach": fach, "typ": typ}

    must = []
    for field_name, wanted in constraints.items():
        if not wanted:
            continue
        must.append(
            qm.FieldCondition(key=field_name, match=qm.MatchValue(value=wanted))
        )

    if not must:
        return None
    return qm.Filter(must=must)
def search_chunks(
    client: QdrantClient,
    name: str,
    vector: list[float],
    limit: int,
    semester: str | None = None,
    fach: str | None = None,
    typ: str | None = None,
) -> list[dict[str, Any]]:
    """Run a vector query against a collection, optionally filtered by metadata.

    The query vector has to come from the same embedding model that was used
    at ingest time; otherwise the similarity scores carry no meaning.

    Args:
        client: Qdrant client to query through.
        name: Collection name.
        vector: Query embedding.
        limit: Maximum number of hits to request.
        semester: Optional exact-match constraint on the ``semester`` payload.
        fach: Optional exact-match constraint on the ``fach`` payload.
        typ: Optional exact-match constraint on the ``typ`` payload.

    Returns:
        One dict per hit, in the order Qdrant returned them, holding every
        key of ``_RESULT_FIELDS`` (``None`` when absent from the payload)
        plus the hit's ``score``.
    """
    hits = client.query_points(
        collection_name=name,
        query=vector,
        limit=limit,
        query_filter=_payload_filter(semester, fach, typ),
        with_payload=True,
    ).points

    results: list[dict[str, Any]] = []
    for hit in hits:
        # A point stored without payload still yields a row of None values.
        stored = hit.payload or {}
        results.append(
            {**{key: stored.get(key) for key in _RESULT_FIELDS}, "score": hit.score}
        )
    return results
def get_chunks_by_path(
    client: QdrantClient, name: str, file_path: str
) -> list[dict[str, Any]]:
    """Return every chunk of one document, ordered by ``chunk_index``.

    Scrolls through all points whose ``file_path`` payload equals
    ``file_path`` exactly, following the next-page offset until the result
    set is exhausted, so documents larger than a single page are not
    silently truncated.

    Args:
        client: Qdrant client to read through.
        name: Collection name.
        file_path: Exact value of the ``file_path`` payload to match.

    Returns:
        Dicts with ``chunk_index``, ``page`` and ``text``, sorted by
        ``chunk_index``. Points without any payload are skipped; a missing
        ``chunk_index`` sorts as 0.
    """
    path_filter = qm.Filter(
        must=[qm.FieldCondition(key="file_path", match=qm.MatchValue(value=file_path))]
    )

    rows: list[dict[str, Any]] = []
    offset = None
    while True:
        # scroll returns (points, next_page_offset); the offset is None once
        # the last page has been delivered.
        points, offset = client.scroll(
            collection_name=name,
            scroll_filter=path_filter,
            limit=10_000,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        rows.extend(
            {
                "chunk_index": p.payload.get("chunk_index"),
                "page": p.payload.get("page"),
                "text": p.payload.get("text"),
            }
            for p in points
            if p.payload is not None
        )
        # Also stop on an empty page so an unexpected non-None offset cannot
        # keep the loop spinning.
        if offset is None or not points:
            break

    rows.sort(key=lambda r: r["chunk_index"] if r["chunk_index"] is not None else 0)
    return rows