Skip to content

Commit a895020

Browse files
committed
feat: make vllm-metal backend installation opt-in
Signed-off-by: Dorin Geman <dorin.geman@docker.com>
1 parent aa6b09e commit a895020

File tree

13 files changed

+234
-25
lines changed

13 files changed

+234
-25
lines changed

cmd/cli/commands/install-runner.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/docker/model-runner/pkg/inference/backends/diffusers"
1818
"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
1919
"github.com/docker/model-runner/pkg/inference/backends/vllm"
20+
"github.com/docker/model-runner/pkg/inference/backends/vllmmetal"
2021
"github.com/spf13/cobra"
2122
)
2223

@@ -28,7 +29,7 @@ const (
2829
// installation will try to reach the model runner while waiting for it to
2930
// be ready.
3031
installWaitRetryInterval = 500 * time.Millisecond
31-
backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "). Default: " + llamacpp.Name
32+
backendUsage = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "|" + vllmmetal.Name + "). Default: " + llamacpp.Name
3233
)
3334

3435
// waitForStandaloneRunnerAfterInstall waits for a standalone model runner
@@ -237,6 +238,17 @@ type runnerOptions struct {
237238

238239
// runInstallOrStart is shared logic for install-runner and start-runner commands
239240
func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error {
241+
// vllm-metal is installed on-demand via the running model runner,
242+
// not as a standalone container. This applies to all engine kinds.
243+
if opts.backend == vllmmetal.Name {
244+
cmd.Println("Installing vllm-metal backend...")
245+
if err := desktopClient.InstallBackend(vllmmetal.Name); err != nil {
246+
return fmt.Errorf("failed to install vllm-metal backend: %w", err)
247+
}
248+
cmd.Println("vllm-metal backend installed successfully")
249+
return nil
250+
}
251+
240252
var vllmOnWSL bool
241253
// Ensure that we're running in a supported model runner context.
242254
engineKind := modelRunner.EngineKind()
@@ -324,7 +336,7 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error
324336
}
325337

326338
// Validate backend selection
327-
validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name}
339+
validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name, vllmmetal.Name}
328340
if opts.backend != "" {
329341
isValid := false
330342
for _, valid := range validBackends {

cmd/cli/desktop/desktop.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -782,6 +782,30 @@ func (c *Client) ShowConfigs(modelFilter string) ([]scheduling.ModelConfigEntry,
782782
return configs, nil
783783
}
784784

785+
// InstallBackend triggers on-demand installation of a deferred backend
786+
func (c *Client) InstallBackend(backend string) error {
787+
installPath := inference.InferencePrefix + "/install-backend"
788+
jsonData, err := json.Marshal(struct {
789+
Backend string `json:"backend"`
790+
}{Backend: backend})
791+
if err != nil {
792+
return fmt.Errorf("error marshaling request: %w", err)
793+
}
794+
795+
resp, err := c.doRequest(http.MethodPost, installPath, bytes.NewReader(jsonData))
796+
if err != nil {
797+
return c.handleQueryError(err, installPath)
798+
}
799+
defer resp.Body.Close()
800+
801+
if resp.StatusCode != http.StatusOK {
802+
body, _ := io.ReadAll(resp.Body)
803+
return fmt.Errorf("install backend failed with status %s: %s", resp.Status, string(body))
804+
}
805+
806+
return nil
807+
}
808+
785809
func (c *Client) ConfigureBackend(request scheduling.ConfigureRequest) error {
786810
configureBackendPath := inference.InferencePrefix + "/_configure"
787811
jsonData, err := json.Marshal(request)

cmd/cli/docs/reference/docker_model_install-runner.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ plink: docker_model.yaml
88
options:
99
- option: backend
1010
value_type: string
11-
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
11+
description: |
12+
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
1213
deprecated: false
1314
hidden: false
1415
experimental: false

cmd/cli/docs/reference/docker_model_reinstall-runner.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ plink: docker_model.yaml
88
options:
99
- option: backend
1010
value_type: string
11-
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
11+
description: |
12+
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
1213
deprecated: false
1314
hidden: false
1415
experimental: false

cmd/cli/docs/reference/docker_model_start-runner.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ plink: docker_model.yaml
1010
options:
1111
- option: backend
1212
value_type: string
13-
description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
13+
description: |
14+
Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
1415
deprecated: false
1516
hidden: false
1617
experimental: false

cmd/cli/docs/reference/model_install-runner.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Install Docker Model Runner (Docker Engine only)
77

88
| Name | Type | Default | Description |
99
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
10-
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
10+
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
1111
| `--debug` | `bool` | | Enable debug logging |
1212
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
1313
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |

cmd/cli/docs/reference/model_reinstall-runner.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Reinstall Docker Model Runner (Docker Engine only)
77

88
| Name | Type | Default | Description |
99
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
10-
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
10+
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
1111
| `--debug` | `bool` | | Enable debug logging |
1212
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
1313
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |

cmd/cli/docs/reference/model_start-runner.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ Start Docker Model Runner (Docker Engine only)
77

88
| Name | Type | Default | Description |
99
|:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
10-
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp |
10+
| `--backend` | `string` | | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp |
1111
| `--debug` | `bool` | | Enable debug logging |
1212
| `--do-not-track` | `bool` | | Do not track models usage in Docker Model Runner |
1313
| `--gpu` | `string` | `auto` | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann) |

main.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,12 @@ func main() {
208208
backends[vllmmetal.Name] = vllmMetalBackend
209209
}
210210

211+
// Backends whose installation is deferred until explicitly requested.
212+
var deferredBackends []string
213+
if vllmMetalBackend != nil {
214+
deferredBackends = append(deferredBackends, vllmmetal.Name)
215+
}
216+
211217
scheduler := scheduling.NewScheduler(
212218
log,
213219
backends,
@@ -220,6 +226,7 @@ func main() {
220226
"",
221227
false,
222228
),
229+
deferredBackends,
223230
)
224231

225232
// Create the HTTP handler for the scheduler

pkg/inference/scheduling/http_handler.go

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ func (h *HTTPHandler) routeHandlers() map[string]http.HandlerFunc {
9797
m["GET "+inference.InferencePrefix+"/v1/models"] = h.handleModels
9898
m["GET "+inference.InferencePrefix+"/v1/models/{name...}"] = h.handleModels
9999

100+
m["POST "+inference.InferencePrefix+"/install-backend"] = h.InstallBackend
100101
m["GET "+inference.InferencePrefix+"/status"] = h.GetBackendStatus
101102
m["GET "+inference.InferencePrefix+"/ps"] = h.GetRunningBackends
102103
m["GET "+inference.InferencePrefix+"/df"] = h.GetDiskUsage
@@ -201,6 +202,8 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
201202
// don't allow any requests to be scheduled for a backend until it has
202203
// completed installation.
203204
if err := h.scheduler.installer.wait(r.Context(), backend.Name()); err != nil {
205+
h.scheduler.log.Warnln("ALOHA", err)
206+
204207
if errors.Is(err, ErrBackendNotFound) {
205208
http.Error(w, err.Error(), http.StatusNotFound)
206209
} else if errors.Is(err, errInstallerNotStarted) {
@@ -211,6 +214,8 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
211214
// shutting down (since that will also cancel the request context).
212215
// Either way, provide a response, even if it's ignored.
213216
http.Error(w, "service unavailable", http.StatusServiceUnavailable)
217+
} else if errors.Is(err, errBackendNotInstalled) {
218+
http.Error(w, fmt.Sprintf("backend %q is not installed; run: docker model install-runner --backend %s", backend.Name(), backend.Name()), http.StatusPreconditionFailed)
214219
} else if errors.Is(err, vllm.ErrorNotFound) {
215220
http.Error(w, err.Error(), http.StatusPreconditionFailed)
216221
} else {
@@ -336,6 +341,38 @@ func (h *HTTPHandler) Unload(w http.ResponseWriter, r *http.Request) {
336341
}
337342
}
338343

344+
// installBackendRequest is the JSON body for the install-backend endpoint.
345+
type installBackendRequest struct {
346+
Backend string `json:"backend"`
347+
}
348+
349+
// InstallBackend handles POST <inference-prefix>/install-backend requests.
350+
// It triggers on-demand installation of a deferred backend.
351+
func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) {
352+
body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
353+
if err != nil {
354+
http.Error(w, "failed to read request body", http.StatusInternalServerError)
355+
return
356+
}
357+
358+
var req installBackendRequest
359+
if err := json.Unmarshal(body, &req); err != nil || req.Backend == "" {
360+
http.Error(w, "invalid request: backend is required", http.StatusBadRequest)
361+
return
362+
}
363+
364+
if err := h.scheduler.InstallBackend(r.Context(), req.Backend); err != nil {
365+
if errors.Is(err, ErrBackendNotFound) {
366+
http.Error(w, err.Error(), http.StatusNotFound)
367+
} else {
368+
http.Error(w, fmt.Sprintf("backend installation failed: %v", err), http.StatusInternalServerError)
369+
}
370+
return
371+
}
372+
373+
w.WriteHeader(http.StatusOK)
374+
}
375+
339376
// Configure handles POST <inference-prefix>/{backend}/_configure requests.
340377
func (h *HTTPHandler) Configure(w http.ResponseWriter, r *http.Request) {
341378
// Determine the requested backend and ensure that it's valid.

0 commit comments

Comments
 (0)