"""Bulk-import webhook: list a WebDAV folder and queue every supported file."""

import asyncio
import logging
import xml.etree.ElementTree as ET
from pathlib import PurePosixPath
from urllib.parse import unquote

import httpx
from fastapi import APIRouter, BackgroundTasks, Header, HTTPException, status
from pydantic import BaseModel, Field

from app.config import get_settings
from app.ingest.extractors import SUPPORTED_TYPES
from app.ingest.pipeline import process_file
from app.webhook.auth import verify_secret
from app.webhook.models import EventType

logger = logging.getLogger(__name__)
router = APIRouter()

# Limits parallel pipeline execution during a bulk import so we don't
# saturate Ollama/WebDAV with thousands of concurrent requests.
BULK_CONCURRENCY = 4
_bulk_semaphore = asyncio.Semaphore(BULK_CONCURRENCY)

# Amplification guard: one /bulk-import dispatches one BackgroundTask per
# matching file with no upper bound. Repeated calls (a misfiring Nextcloud
# flow, or an attacker holding the secret) would pile unbounded tasks. We
# track outstanding dispatched work and reject a new bulk while any is
# still draining — bulk runs are rare, so serialising them is acceptable.
# The counter also includes one "reservation" held by a request while it is
# still listing files, so a second request cannot slip past the check.
_inflight = 0


async def _process_with_semaphore(file_path: str, event_type: EventType) -> None:
    """Run ``process_file`` for one file under the bulk semaphore.

    Always releases this file's slot in ``_inflight`` when done. Failures are
    logged instead of re-raised: Starlette's ``BackgroundTasks`` runs queued
    tasks one after another and stops at the first one that raises, which
    would leave every remaining file unprocessed and ``_inflight`` stuck
    above zero (rejecting all later bulk imports with 409).
    """
    global _inflight
    try:
        async with _bulk_semaphore:
            await process_file(file_path, event_type)
    except Exception:
        # Task boundary: record the failure and let the rest of the batch run.
        logger.exception(
            "bulk file processing failed",
            extra={"event": "bulk_file_failed", "path": file_path},
        )
    finally:
        _inflight -= 1


class BulkRequest(BaseModel):
    """Request body for ``POST /bulk-import``."""

    # Folder to import, joined onto the configured WebDAV base URL.
    path: str = Field(min_length=1)


# A non-empty PROPFIND body must be well-formed XML. We only need
# ``resourcetype`` to tell collections (folders) apart from files.
PROPFIND_BODY = (
    '<?xml version="1.0" encoding="utf-8"?>'
    '<d:propfind xmlns:d="DAV:">'
    "<d:prop><d:resourcetype/></d:prop>"
    "</d:propfind>"
)

DAV_NS = {"d": "DAV:"}


async def list_files_recursive(base_url: str, user: str, password: str, path: str) -> list[str]:
    """PROPFIND with Depth: infinity. Returns relative file paths (no folders).

    Args:
        base_url: WebDAV base URL; its path component is stripped from each
            returned href.
        user: Username for HTTP basic auth.
        password: Password (app password) for HTTP basic auth.
        path: Folder below ``base_url`` to list; surrounding slashes are ignored.

    Returns:
        URL-decoded paths relative to ``base_url`` for every non-collection
        entry in the multistatus response.

    Raises:
        RuntimeError: The server answered with a status other than 200/207.
        httpx.HTTPError: Transport-level failure.
        xml.etree.ElementTree.ParseError: The response body is not valid XML.
    """
    base = base_url.rstrip("/")
    rel = path.strip("/")
    url = f"{base}/{rel}"

    async with httpx.AsyncClient(auth=(user, password), timeout=120.0) as client:
        response = await client.request(
            "PROPFIND",
            url,
            headers={"Depth": "infinity", "Content-Type": "application/xml"},
            content=PROPFIND_BODY,
        )

    if response.status_code not in (200, 207):
        raise RuntimeError(f"PROPFIND failed: status={response.status_code}")

    root = ET.fromstring(response.text)
    # e.g. "/remote.php/dav/files/u"
    base_path_segment = PurePosixPath(httpx.URL(base).path).as_posix()

    out: list[str] = []
    for resp in root.findall("d:response", DAV_NS):
        href = resp.findtext("d:href", default="", namespaces=DAV_NS)
        decoded = unquote(href)
        # Strip the WebDAV base prefix → leaves "Documents/.../file.pdf"
        if decoded.startswith(base_path_segment):
            decoded = decoded[len(base_path_segment):]
        decoded = decoded.lstrip("/")
        if not decoded or decoded.endswith("/"):
            continue
        # Skip directory entries: their resourcetype contains <d:collection/>.
        is_collection = resp.find("d:propstat/d:prop/d:resourcetype/d:collection", DAV_NS)
        if is_collection is not None:
            continue
        out.append(decoded)
    return out


@router.post("/bulk-import", status_code=status.HTTP_202_ACCEPTED)
async def bulk_import(
    body: BulkRequest,
    background: BackgroundTasks,
    x_webhook_secret: str | None = Header(default=None),
):
    """List ``body.path`` over WebDAV and queue one background task per supported file.

    The request is checked with ``verify_secret`` before anything else.

    Returns:
        ``{"status": "accepted", "dispatched": <number of queued files>}``.

    Raises:
        HTTPException: 409 while a previous bulk import is still listing or
            draining; 502 when the WebDAV listing fails.
    """
    global _inflight
    settings = get_settings()
    verify_secret(x_webhook_secret, settings.webhook_secret)

    if _inflight > 0:
        logger.warning(
            "bulk rejected: import already in progress",
            extra={"event": "bulk_rejected", "path": body.path, "inflight": _inflight},
        )
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail="a bulk import is already in progress",
        )

    # Reserve the guard *before* the first await. The listing below can take
    # a long time, and without a reservation every request arriving in that
    # window would also pass the check above and dispatch its own batch.
    _inflight += 1
    try:
        try:
            files = await list_files_recursive(
                settings.nextcloud_webdav_url,
                settings.nextcloud_user,
                settings.nextcloud_app_password,
                body.path,
            )
        except (RuntimeError, httpx.HTTPError, ET.ParseError) as exc:
            logger.exception(
                "bulk listing failed",
                extra={"event": "bulk_listing_failed", "path": body.path, "error": str(exc)},
            )
            raise HTTPException(status_code=502, detail="webdav listing failed") from exc

        dispatched = 0
        for file_path in files:
            ext = PurePosixPath(file_path).suffix.lstrip(".").lower()
            if ext not in SUPPORTED_TYPES:
                continue
            # Counted here, released by _process_with_semaphore when it finishes.
            _inflight += 1
            background.add_task(_process_with_semaphore, file_path, EventType.CREATED)
            dispatched += 1
    finally:
        # Drop the reservation; the per-file counts (if any) keep the guard held.
        _inflight -= 1

    logger.info(
        "bulk dispatch",
        extra={
            "event": "bulk_dispatch",
            "path": body.path,
            "dispatched": dispatched,
            "total_listed": len(files),
        },
    )
    return {"status": "accepted", "dispatched": dispatched}