Files
rag-ingestor/app/bulk.py
Jean-Luc Makiola 9643011e64
All checks were successful
CI / ci (push) Successful in 49s
Release / release (push) Successful in 1m2s
feat: MCP-Server für RAG-Retrieval + Webhook-Härtung
app/mcp_server.py: FastMCP (mcp SDK), streamable-http auf /mcp, statischer
Bearer-Token (constant-time ASGI-Middleware), Fail-Fast ohne RAG_MCP_TOKEN.
Tools rag_search (mit semester/fach/typ-Filter) + get_file_chunks. Läuft aus
demselben Image wie der Ingestor und reused den Embed-Pfad → Vektoren sind
garantiert kompatibel zum Ingest (der offizielle qdrant-MCP-Server kann nur
fastembed → Dimension-/Schema-Mismatch).

app/qdrant_store.py: search_chunks (query_points + optionaler Payload-Filter)
und get_chunks_by_path (scroll, nach chunk_index sortiert).

app/bulk.py: Amplification-Guard — /bulk-import lehnt mit 409 ab solange ein
vorheriger Bulk noch BackgroundTasks abarbeitet.

docker-compose.coolify.yml: rag-mcp-Service (nicht public, externes
metamcp-net statt Stack-Coupling) + Traefik-Rate-Limit-Middleware am ingestor.

tests/conftest.py: Settings-env_file in Tests neutralisieren (Dev-.env darf
die Suite nicht kontaminieren). 68 passed, ruff clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 22:08:37 +02:00

139 lines
4.7 KiB
Python

import asyncio
import logging
import xml.etree.ElementTree as ET
from pathlib import PurePosixPath
from urllib.parse import unquote
import httpx
from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, status
from pydantic import BaseModel, Field
from app.config import get_settings
from app.ingest.extractors import SUPPORTED_TYPES
from app.ingest.pipeline import process_file
from app.webhook.auth import verify_secret
from app.webhook.models import EventType
logger = logging.getLogger(__name__)
router = APIRouter()
# Limits parallel pipeline execution during a bulk import so we don't
# saturate Ollama/WebDAV with thousands of concurrent requests.
BULK_CONCURRENCY = 4
_bulk_semaphore = asyncio.Semaphore(BULK_CONCURRENCY)
# Amplification guard: one /bulk-import dispatches one BackgroundTask per
# matching file with no upper bound. Repeated calls (a misfiring Nextcloud
# flow, or an attacker holding the secret) would pile unbounded tasks. We
# track outstanding dispatched work and reject a new bulk while any is
# still draining — bulk runs are rare, so serialising them is acceptable.
_inflight = 0
async def _process_with_semaphore(file_path: str, event_type: EventType) -> None:
global _inflight
try:
async with _bulk_semaphore:
await process_file(file_path, event_type)
finally:
_inflight -= 1
class BulkRequest(BaseModel):
path: str = Field(min_length=1)
PROPFIND_BODY = """<?xml version="1.0"?>
<d:propfind xmlns:d="DAV:"><d:prop><d:resourcetype/></d:prop></d:propfind>
"""
DAV_NS = {"d": "DAV:"}
async def list_files_recursive(base_url: str, user: str, password: str, path: str) -> list[str]:
"""PROPFIND with Depth: infinity. Returns relative file paths (no folders)."""
base = base_url.rstrip("/")
rel = path.strip("/")
url = f"{base}/{rel}"
async with httpx.AsyncClient(auth=(user, password), timeout=120.0) as client:
response = await client.request(
"PROPFIND",
url,
headers={"Depth": "infinity", "Content-Type": "application/xml"},
content=PROPFIND_BODY,
)
if response.status_code not in (200, 207):
raise RuntimeError(f"PROPFIND failed: status={response.status_code}")
root = ET.fromstring(response.text)
base_path_segment = PurePosixPath(httpx.URL(base).path).as_posix() # "/remote.php/dav/files/u"
out: list[str] = []
for resp in root.findall("d:response", DAV_NS):
href = resp.findtext("d:href", default="", namespaces=DAV_NS)
decoded = unquote(href)
# Strip the WebDAV base prefix → leaves "Documents/.../file.pdf"
if decoded.startswith(base_path_segment):
decoded = decoded[len(base_path_segment):]
decoded = decoded.lstrip("/")
if not decoded or decoded.endswith("/"):
continue
# Skip directory entries (those have <d:collection/> resourcetype)
rt = resp.find("d:propstat/d:prop/d:resourcetype/d:collection", DAV_NS)
if rt is not None:
continue
out.append(decoded)
return out
@router.post("/bulk-import", status_code=status.HTTP_202_ACCEPTED)
async def bulk_import(
body: BulkRequest,
background: BackgroundTasks,
x_webhook_secret: str | None = Header(default=None),
):
global _inflight
settings = get_settings()
verify_secret(x_webhook_secret, settings.webhook_secret)
if _inflight > 0:
logger.warning(
"bulk rejected: import already in progress",
extra={"event": "bulk_rejected", "path": body.path, "inflight": _inflight},
)
raise HTTPException(
status_code=status.HTTP_409_CONFLICT,
detail="a bulk import is already in progress",
)
try:
files = await list_files_recursive(
settings.nextcloud_webdav_url,
settings.nextcloud_user,
settings.nextcloud_app_password,
body.path,
)
except (RuntimeError, httpx.HTTPError, ET.ParseError) as exc:
logger.exception(
"bulk listing failed",
extra={"event": "bulk_listing_failed", "path": body.path, "error": str(exc)},
)
raise HTTPException(status_code=502, detail="webdav listing failed") from exc
dispatched = 0
for f in files:
ext = PurePosixPath(f).suffix.lstrip(".").lower()
if ext not in SUPPORTED_TYPES:
continue
_inflight += 1
background.add_task(_process_with_semaphore, f, EventType.CREATED)
dispatched += 1
logger.info(
"bulk dispatch",
extra={"event": "bulk_dispatch", "path": body.path, "dispatched": dispatched, "total_listed": len(files)},
)
return {"status": "accepted", "dispatched": dispatched}