Embedding-Inferenz ist CPU-only und skaliert sonst auf alle Cores. cpus: "2.0" + OLLAMA_NUM_PARALLEL=1 halten die Last konstant bei ~2 statt Peaks bis 8 Cores. Bewusster Trade-off: ~5x langsamere Bulk-Laufzeit, dafuer predictable Host-Last (selten laufender Workload). README dokumentiert, dass Coolify dieselben Limits spiegeln muss. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
40 lines
926 B
YAML
40 lines
926 B
YAML
# Local development only.
# Production deployment goes via Coolify using docker/Dockerfile alone;
# the compose file here is for booting up qdrant + ollama next to the
# ingestor on a developer machine.
services:
  # Application container, built from the Dockerfile in this repository.
  ingestor:
    build:
      context: .
      dockerfile: docker/Dockerfile
    # Environment variables for the container are loaded from .env.
    env_file: .env
    ports:
      - "8000:8000"
    # Short-form depends_on only orders startup; it does not wait for the
    # dependencies to report healthy.
    depends_on:
      - qdrant
      - ollama

  qdrant:
    # NOTE(review): `latest` is a floating tag; consider pinning a version
    # so local runs are reproducible.
    image: qdrant/qdrant:latest
    ports:
      - "6333:6333"
    volumes:
      # Named volume (declared at the bottom of this file) mounted at
      # /qdrant/storage so data survives container re-creation.
      - qdrant_data:/qdrant/storage

  ollama:
    # NOTE(review): `latest` is a floating tag; consider pinning a version
    # so local runs are reproducible.
    image: ollama/ollama:latest
    ports:
      - "11434:11434"
    volumes:
      # Named volume (declared at the bottom of this file) mounted at
      # /root/.ollama so data survives container re-creation.
      - ollama_data:/root/.ollama
    # Cap CPU so embedding peaks don't starve the host. Mirror these
    # limits in the production Coolify config — Ollama otherwise scales
    # inference threads to all available cores.
    # Deliberate trade-off: bulk runs take roughly 5x longer in exchange
    # for a predictable host load of about 2 cores.
    cpus: "2.0"
    environment:
      # Quoted so the value reaches the container as a string.
      OLLAMA_NUM_PARALLEL: "1"

# Named volumes referenced by the qdrant and ollama services above.
volumes:
  qdrant_data:
  ollama_data: