-
Notifications
You must be signed in to change notification settings - Fork 51
Expand file tree
/
Copy pathdocker-compose.yaml
More file actions
149 lines (141 loc) · 4.19 KB
/
Copy pathdocker-compose.yaml
File metadata and controls
149 lines (141 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Additional Compose files merged into this project.
include:
  - vdb/milvus.yaml
  - extern/infinity.yaml
# Shared definition of the OpenRAG application container.
# It is merged with `<<: *openrag_template` into the `openrag` (GPU) and
# `openrag-cpu` services. YAML merge keys are shallow: a service that declares
# one of these keys itself replaces the whole value, it does not extend it.
x-openrag: &openrag_template
  # image: ghcr.io/linagora/openrag:dev-latest
  image: linagoraai/openrag:latest
  build:
    context: .
    dockerfile: Dockerfile
  volumes:
    - ${DATA_VOLUME:-./data}:/app/data
    # Model weights for RAG
    - ${MODEL_WEIGHTS_VOLUME:-~/.cache/huggingface}:/app/model_weights
    # - ./openrag:/app/openrag  # For dev mode
    # Shared environment variables. The default mirrors `env_file` below so
    # that an unset SHARED_ENV mounts the project's .env file instead of
    # expanding to "/" (which would bind-mount the host root directory).
    - ${SHARED_ENV:-./.env}:/ray_mount/.env
    # - ./logs:/app/logs
  ports:
    # NOTE(review): `APP_iPORT` presumably names the in-container port;
    # confirm the mixed-case spelling matches the variable used in .env.
    - "${APP_PORT:-8080}:${APP_iPORT:-8080}"
    # Localhost only: Ray dashboard/Jobs API is unauthenticated.
    # Disable when in cluster mode.
    - "127.0.0.1:${RAY_DASHBOARD_PORT:-8265}:8265"
  networks:
    default:
      aliases:
        - openrag
  env_file:
    - ${SHARED_ENV:-.env}
  shm_size: 10.24gb
# Shared definition of the vLLM embedding server.
# It is merged with `<<: *vllm_template` into the `vllm-gpu` and `vllm-cpu`
# services. YAML merge keys are shallow: a service that declares its own
# `environment` or `command` replaces the value defined here entirely.
x-vllm: &vllm_template
  networks:
    default:
      aliases:
        - vllm
  restart: on-failure
  environment:
    # No value given: Compose passes the variable through from the host.
    - HUGGING_FACE_HUB_TOKEN
  ipc: "host"
  volumes:
    # Put ./vllm_cache as VLLM_CACHE if you want to have the weights in the
    # vllm_cache folder of your project.
    - ${VLLM_CACHE:-/root/.cache/huggingface}:/root/.cache/huggingface
  # gpu_memory_utilization, max-num-seqs and max-model-len can be tuned
  # depending on your GPU memory.
  # Optional extra flag (disabled): --max-num-seqs 1
  # These notes are kept above `command` on purpose: a `#` line inside the
  # folded scalar is not a comment and would be passed to vLLM as arguments.
  command: >
    --model ${EMBEDDER_MODEL_NAME:-jinaai/jina-embeddings-v3}
    --trust-remote-code
    --task embed
    --gpu_memory_utilization 0.3
    --max-model-len ${MAX_MODEL_LEN:-8192}
  healthcheck:
    test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
    interval: 20s
    timeout: 5s
    retries: 4
    start_period: 90s
  # ports:
  #   - ${VLLM_PORT:-8000}:8000
# Service definitions. Two mutually exclusive stacks are declared:
#   * default (GPU): `openrag` + `vllm-gpu`
#   * `cpu` profile: `openrag-cpu` + `vllm-cpu`
# `indexer-ui` and `rdb` carry no profile and therefore always start.
services:
  # OpenRAG Indexer UI
  indexer-ui:
    image: linagoraai/indexer-ui:latest
    # build:
    #   context: ./extern/indexer-ui
    #   dockerfile: Dockerfile
    environment:
      - API_BASE_URL=${API_BASE_URL:-http://localhost:${APP_PORT:-8080}}
      - INCLUDE_CREDENTIALS=${INCLUDE_CREDENTIALS:-false}
      - DEFAULT_LANGUAGE=${DEFAULT_LANGUAGE:-}
    ports:
      - "${INDEXERUI_PORT:-3042}:3000"
    restart: unless-stopped

  # GPU - default
  openrag:
    <<: *openrag_template
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    profiles:
      # Empty string gives default behavior (but does not run when cpu requested)
      - ''
    depends_on:
      rdb:
        condition: service_started
      # `milvus` is not declared in this file; presumably it comes from the
      # included vdb/milvus.yaml -- confirm.
      milvus:
        condition: service_healthy
      vllm-gpu:
        condition: service_healthy

  # No GPU
  openrag-cpu:
    <<: *openrag_template
    deploy: {}
    profiles:
      - 'cpu'
    depends_on:
      rdb:
        condition: service_started
      milvus:
        condition: service_healthy
      vllm-cpu:
        condition: service_healthy

  # Relational database (PostgreSQL)
  rdb:
    image: postgres:15
    environment:
      # `:?` makes Compose abort with this message when the variable is unset.
      - POSTGRES_PASSWORD=${POSTGRES_PASSWORD:?Set POSTGRES_PASSWORD in your .env}
      - POSTGRES_USER=${POSTGRES_USER:-root}
    volumes:
      - ${DB_VOLUME:-./db}:/var/lib/postgresql/data

  # vLLM embedding server, GPU build (default stack)
  vllm-gpu:
    <<: *vllm_template
    image: vllm/vllm-openai:v0.9.2
    runtime: nvidia
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    profiles:
      # Empty string gives default behavior (but does not run when cpu requested)
      - ''

  # vLLM embedding server, CPU build (`cpu` profile)
  vllm-cpu:
    <<: *vllm_template
    build:
      context: extern/vllm
      dockerfile: Dockerfile.cpu
      target: vllm-openai
    image: openrag-vllm-openai-cpu
    deploy: {}
    # This list REPLACES the template's `environment` (merge keys are shallow),
    # so the Hugging Face token pass-through has to be declared again here.
    environment:
      - HUGGING_FACE_HUB_TOKEN
      # Default value isn't sufficient for full context length
      - VLLM_CPU_KVCACHE_SPACE=8
    # dtype is required for aarch64 (https://github.com/vllm-project/vllm/issues/11327)
    # and improves speed on amd64.
    # max-num-batched-tokens is required for aarch64 because chunked prefill
    # isn't supported by the V1 vllm backend for aarch64 yet. On aarch64
    # max-num-batched-tokens must equal max-model-len for now (without chunked
    # prefill). For details see https://github.com/vllm-project/vllm/issues/21179
    # Optional extra flag (disabled): --max-num-batched-tokens 32768
    # These notes are kept above `command` on purpose: a `#` line inside the
    # folded scalar is not a comment and would be passed to vLLM as arguments.
    # NOTE(review): unlike the template command, this override has no
    # `--task embed`; confirm that is intentional.
    command: >
      --model ${EMBEDDER_MODEL_NAME:-jinaai/jina-embeddings-v3}
      --trust-remote-code
      --dtype float32
      --max-model-len ${MAX_MODEL_LEN:-8192}
    profiles:
      - 'cpu'