Compare commits
2 Commits
v0.1.0
...
8c7dc7ab81
| Author | SHA1 | Date | |
|---|---|---|---|
| 8c7dc7ab81 | |||
| cce17f3517 |
@@ -59,6 +59,15 @@ Image bauen und in Coolify neben Qdrant + Ollama deployen:
|
|||||||
docker build -f docker/Dockerfile -t rag-ingestor .
|
docker build -f docker/Dockerfile -t rag-ingestor .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Ollama-Ressourcenlimits
|
||||||
|
|
||||||
|
Embedding-Inferenz ist CPU-only und skaliert per Default auf alle verfügbaren Cores. Für Produktion daher Ollama hart limitieren, damit der Host nicht von Ingest-Spikes blockiert wird:
|
||||||
|
|
||||||
|
- `cpus: "2.0"` (Container-Cap)
|
||||||
|
- `OLLAMA_NUM_PARALLEL=1` (serialisiert Embedding-Requests intern)
|
||||||
|
|
||||||
|
Beide Werte sind in `docker-compose.yml` (lokale Entwicklung) und in `docker-compose.coolify.yml` (Produktion) gesetzt; bei abweichenden Coolify-Konfigurationen sollten sie entsprechend mitgepflegt werden. Folge: konstante ~2 CPU statt Peaks bis 8 CPU, dafür längere Bulk-Laufzeiten.
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
96
docker-compose.coolify.yml
Normal file
96
docker-compose.coolify.yml
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
# Coolify production stack — self-contained: qdrant + ollama + ingestor.
|
||||||
|
#
|
||||||
|
# Services kommunizieren nur über das compose-interne Netz. Nichts wird
|
||||||
|
# publiziert außer dem Ingestor: in der Coolify-UI eine Domain auf den
|
||||||
|
# Service "ingestor" / Port 8000 mappen, damit Nextcloud den Webhook
|
||||||
|
# erreicht. qdrant + ollama bleiben bewusst nicht öffentlich.
|
||||||
|
#
|
||||||
|
# Voraussetzungen in Coolify:
|
||||||
|
# - Pull-Credentials für gitea.jeanlucmakiola.de Registry hinterlegen
|
||||||
|
# - Folgende Env-Vars setzen (Secrets wo markiert):
|
||||||
|
# NEXTCLOUD_WEBDAV_URL
|
||||||
|
# NEXTCLOUD_USER
|
||||||
|
# NEXTCLOUD_APP_PASSWORD (Secret)
|
||||||
|
# WEBHOOK_SECRET (Secret)
|
||||||
|
# QDRANT_COLLECTION (optional, Default rag_thb_studium)
|
||||||
|
# INGEST_ROOT (optional, Default Documents/THB)
|
||||||
|
# LOG_LEVEL (optional, Default INFO)
|
||||||
|
|
||||||
|
services:
|
||||||
|
qdrant:
|
||||||
|
image: qdrant/qdrant:latest
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- qdrant_data:/qdrant/storage
|
||||||
|
|
||||||
|
ollama:
|
||||||
|
image: ollama/ollama:latest
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- ollama_data:/root/.ollama
|
||||||
|
# Konstante ~2 Cores statt Peaks über alle Host-Cores. Bewusster
|
||||||
|
# Trade-off: langsamerer Ingest, dafür vorhersagbare Last.
|
||||||
|
cpus: "2.0"
|
||||||
|
environment:
|
||||||
|
OLLAMA_NUM_PARALLEL: "1"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "ollama", "list"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 12
|
||||||
|
|
||||||
|
# One-shot: zieht das Embed-Modell in das ollama-Volume und beendet sich.
|
||||||
|
# Idempotent — bei Redeploy mit warmem Volume nur Digest-Verifikation.
|
||||||
|
# Verhindert den Startup-Crash des Ingestors bei fehlendem Modell.
|
||||||
|
ollama-pull:
|
||||||
|
image: ollama/ollama:latest
|
||||||
|
restart: "no"
|
||||||
|
depends_on:
|
||||||
|
ollama:
|
||||||
|
condition: service_healthy
|
||||||
|
environment:
|
||||||
|
OLLAMA_HOST: "http://ollama:11434"
|
||||||
|
command: ["pull", "qwen3-embedding:0.6b"]
|
||||||
|
|
||||||
|
ingestor:
|
||||||
|
image: gitea.jeanlucmakiola.de/makiolaj/rag-ingestor:latest
|
||||||
|
restart: unless-stopped
|
||||||
|
pull_policy: always
|
||||||
|
depends_on:
|
||||||
|
qdrant:
|
||||||
|
condition: service_started
|
||||||
|
ollama:
|
||||||
|
condition: service_healthy
|
||||||
|
ollama-pull:
|
||||||
|
condition: service_completed_successfully
|
||||||
|
environment:
|
||||||
|
# Coolify-Magie: macht den Ingestor öffentlich über den Coolify-Proxy
|
||||||
|
# (inkl. Let's-Encrypt-TLS). Coolify generiert eine Domain; in der UI
|
||||||
|
# auf die echte (z.B. ingest.jeanlucmakiola.de) überschreiben. Nextcloud
|
||||||
|
# ruft nur diese öffentliche URL auf — kein Coolify-Netz-Zugriff nötig.
|
||||||
|
SERVICE_FQDN_INGESTOR_8000: /
|
||||||
|
NEXTCLOUD_WEBDAV_URL: ${NEXTCLOUD_WEBDAV_URL}
|
||||||
|
NEXTCLOUD_USER: ${NEXTCLOUD_USER}
|
||||||
|
NEXTCLOUD_APP_PASSWORD: ${NEXTCLOUD_APP_PASSWORD}
|
||||||
|
OLLAMA_URL: http://ollama:11434
|
||||||
|
OLLAMA_EMBED_MODEL: qwen3-embedding:0.6b
|
||||||
|
QDRANT_URL: http://qdrant:6333
|
||||||
|
QDRANT_COLLECTION: ${QDRANT_COLLECTION:-rag_thb_studium}
|
||||||
|
WEBHOOK_SECRET: ${WEBHOOK_SECRET}
|
||||||
|
INGEST_ROOT: ${INGEST_ROOT:-Documents/THB}
|
||||||
|
LOG_LEVEL: ${LOG_LEVEL:-INFO}
|
||||||
|
expose:
|
||||||
|
- "8000"
|
||||||
|
healthcheck:
|
||||||
|
test:
|
||||||
|
- CMD
|
||||||
|
- python
|
||||||
|
- -c
|
||||||
|
- "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8000/health').status==200 else 1)"
|
||||||
|
interval: 15s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
qdrant_data:
|
||||||
|
ollama_data:
|
||||||
@@ -27,6 +27,12 @@ services:
|
|||||||
- "11434:11434"
|
- "11434:11434"
|
||||||
volumes:
|
volumes:
|
||||||
- ollama_data:/root/.ollama
|
- ollama_data:/root/.ollama
|
||||||
|
# Cap CPU so embedding peaks don't starve the host. Mirror these
|
||||||
|
# limits in the production Coolify config — Ollama otherwise scales
|
||||||
|
# inference threads to all available cores.
|
||||||
|
cpus: "2.0"
|
||||||
|
environment:
|
||||||
|
OLLAMA_NUM_PARALLEL: "1"
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
qdrant_data:
|
qdrant_data:
|
||||||
|
|||||||
Reference in New Issue
Block a user