feat: make vllm-metal backend installation opt-in

doringeman · doringeman · commit a8950207452b · 2026-02-10T19:43:43.000+02:00
Signed-off-by: Dorin Geman <dorin.geman@docker.com>
diff --git a/cmd/cli/commands/install-runner.go b/cmd/cli/commands/install-runner.go
@@ -17,6 +17,7 @@ import (
 	"github.com/docker/model-runner/pkg/inference/backends/diffusers"
 	"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
 	"github.com/docker/model-runner/pkg/inference/backends/vllm"
+	"github.com/docker/model-runner/pkg/inference/backends/vllmmetal"
 	"github.com/spf13/cobra"
 )
 
@@ -28,7 +29,7 @@ const (
 	// installation will try to reach the model runner while waiting for it to
 	// be ready.
 	installWaitRetryInterval = 500 * time.Millisecond
-	backendUsage             = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "). Default: " + llamacpp.Name
+	backendUsage             = "Specify backend (" + llamacpp.Name + "|" + vllm.Name + "|" + diffusers.Name + "|" + vllmmetal.Name + "). Default: " + llamacpp.Name
 )
 
 // waitForStandaloneRunnerAfterInstall waits for a standalone model runner
@@ -237,6 +238,17 @@ type runnerOptions struct {
 
 // runInstallOrStart is shared logic for install-runner and start-runner commands
 func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error {
+	// vllm-metal is installed on-demand via the running model runner,
+	// not as a standalone container. This applies to all engine kinds.
+	if opts.backend == vllmmetal.Name {
+		cmd.Println("Installing vllm-metal backend...")
+		if err := desktopClient.InstallBackend(vllmmetal.Name); err != nil {
+			return fmt.Errorf("failed to install vllm-metal backend: %w", err)
+		}
+		cmd.Println("vllm-metal backend installed successfully")
+		return nil
+	}
+
 	var vllmOnWSL bool
 	// Ensure that we're running in a supported model runner context.
 	engineKind := modelRunner.EngineKind()
@@ -324,7 +336,7 @@ func runInstallOrStart(cmd *cobra.Command, opts runnerOptions, debug bool) error
 	}
 
 	// Validate backend selection
-	validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name}
+	validBackends := []string{llamacpp.Name, vllm.Name, diffusers.Name, vllmmetal.Name}
 	if opts.backend != "" {
 		isValid := false
 		for _, valid := range validBackends {
diff --git a/cmd/cli/desktop/desktop.go b/cmd/cli/desktop/desktop.go
@@ -782,6 +782,30 @@ func (c *Client) ShowConfigs(modelFilter string) ([]scheduling.ModelConfigEntry,
 	return configs, nil
 }
 
+// InstallBackend sends a POST to the install-backend endpoint, naming the
+// backend to install in a JSON body of the form {"backend": "<name>"}.
+// It returns nil only when the response status is 200 OK; any other status
+// is turned into an error that includes the status line and response body.
+func (c *Client) InstallBackend(backend string) error {
+	endpoint := inference.InferencePrefix + "/install-backend"
+
+	payload, err := json.Marshal(struct {
+		Backend string `json:"backend"`
+	}{Backend: backend})
+	if err != nil {
+		return fmt.Errorf("error marshaling request: %w", err)
+	}
+
+	resp, err := c.doRequest(http.MethodPost, endpoint, bytes.NewReader(payload))
+	if err != nil {
+		return c.handleQueryError(err, endpoint)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == http.StatusOK {
+		return nil
+	}
+
+	// The read error is deliberately ignored: the body only enriches the
+	// error message, and the status alone is enough to report failure.
+	detail, _ := io.ReadAll(resp.Body)
+	return fmt.Errorf("install backend failed with status %s: %s", resp.Status, string(detail))
+}
+
 func (c *Client) ConfigureBackend(request scheduling.ConfigureRequest) error {
 	configureBackendPath := inference.InferencePrefix + "/_configure"
 	jsonData, err := json.Marshal(request)
diff --git a/cmd/cli/docs/reference/docker_model_install-runner.yaml b/cmd/cli/docs/reference/docker_model_install-runner.yaml
@@ -8,7 +8,8 @@ plink: docker_model.yaml
 options:
     - option: backend
       value_type: string
-      description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
+      description: |
+        Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
       deprecated: false
       hidden: false
       experimental: false
diff --git a/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml b/cmd/cli/docs/reference/docker_model_reinstall-runner.yaml
@@ -8,7 +8,8 @@ plink: docker_model.yaml
 options:
     - option: backend
       value_type: string
-      description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
+      description: |
+        Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
       deprecated: false
       hidden: false
       experimental: false
diff --git a/cmd/cli/docs/reference/docker_model_start-runner.yaml b/cmd/cli/docs/reference/docker_model_start-runner.yaml
@@ -10,7 +10,8 @@ plink: docker_model.yaml
 options:
     - option: backend
       value_type: string
-      description: 'Specify backend (llama.cpp|vllm|diffusers). Default: llama.cpp'
+      description: |
+        Specify backend (llama.cpp|vllm|diffusers|vllm-metal). Default: llama.cpp
       deprecated: false
       hidden: false
       experimental: false
diff --git a/cmd/cli/docs/reference/model_install-runner.md b/cmd/cli/docs/reference/model_install-runner.md
@@ -7,7 +7,7 @@ Install Docker Model Runner (Docker Engine only)
 
 | Name             | Type     | Default     | Description                                                                                            |
 |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
-| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp                                       |
+| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp                           |
 | `--debug`        | `bool`   |             | Enable debug logging                                                                                   |
 | `--do-not-track` | `bool`   |             | Do not track models usage in Docker Model Runner                                                       |
 | `--gpu`          | `string` | `auto`      | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann)                                               |
diff --git a/cmd/cli/docs/reference/model_reinstall-runner.md b/cmd/cli/docs/reference/model_reinstall-runner.md
@@ -7,7 +7,7 @@ Reinstall Docker Model Runner (Docker Engine only)
 
 | Name             | Type     | Default     | Description                                                                                            |
 |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
-| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp                                       |
+| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp                           |
 | `--debug`        | `bool`   |             | Enable debug logging                                                                                   |
 | `--do-not-track` | `bool`   |             | Do not track models usage in Docker Model Runner                                                       |
 | `--gpu`          | `string` | `auto`      | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann)                                               |
diff --git a/cmd/cli/docs/reference/model_start-runner.md b/cmd/cli/docs/reference/model_start-runner.md
@@ -7,7 +7,7 @@ Start Docker Model Runner (Docker Engine only)
 
 | Name             | Type     | Default     | Description                                                                                            |
 |:-----------------|:---------|:------------|:-------------------------------------------------------------------------------------------------------|
-| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers). Default: llama.cpp                                       |
+| `--backend`      | `string` |             | Specify backend (llama.cpp\|vllm\|diffusers\|vllm-metal). Default: llama.cpp                           |
 | `--debug`        | `bool`   |             | Enable debug logging                                                                                   |
 | `--do-not-track` | `bool`   |             | Do not track models usage in Docker Model Runner                                                       |
 | `--gpu`          | `string` | `auto`      | Specify GPU support (none\|auto\|cuda\|rocm\|musa\|cann)                                               |
diff --git a/main.go b/main.go
@@ -208,6 +208,12 @@ func main() {
 		backends[vllmmetal.Name] = vllmMetalBackend
 	}
 
+	// Backends whose installation is deferred until explicitly requested.
+	var deferredBackends []string
+	if vllmMetalBackend != nil {
+		deferredBackends = append(deferredBackends, vllmmetal.Name)
+	}
+
 	scheduler := scheduling.NewScheduler(
 		log,
 		backends,
@@ -220,6 +226,7 @@ func main() {
 			"",
 			false,
 		),
+		deferredBackends,
 	)
 
 	// Create the HTTP handler for the scheduler
diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go
@@ -97,6 +97,7 @@ func (h *HTTPHandler) routeHandlers() map[string]http.HandlerFunc {
 	m["GET "+inference.InferencePrefix+"/v1/models"] = h.handleModels
 	m["GET "+inference.InferencePrefix+"/v1/models/{name...}"] = h.handleModels
 
+	m["POST "+inference.InferencePrefix+"/install-backend"] = h.InstallBackend
 	m["GET "+inference.InferencePrefix+"/status"] = h.GetBackendStatus
 	m["GET "+inference.InferencePrefix+"/ps"] = h.GetRunningBackends
 	m["GET "+inference.InferencePrefix+"/df"] = h.GetDiskUsage
@@ -201,6 +202,8 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
 	// don't allow any requests to be scheduled for a backend until it has
 	// completed installation.
 	if err := h.scheduler.installer.wait(r.Context(), backend.Name()); err != nil {
+		h.scheduler.log.Warnln("ALOHA", err)
+
 		if errors.Is(err, ErrBackendNotFound) {
 			http.Error(w, err.Error(), http.StatusNotFound)
 		} else if errors.Is(err, errInstallerNotStarted) {
@@ -211,6 +214,8 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque
 			// shutting down (since that will also cancel the request context).
 			// Either way, provide a response, even if it's ignored.
 			http.Error(w, "service unavailable", http.StatusServiceUnavailable)
+		} else if errors.Is(err, errBackendNotInstalled) {
+			http.Error(w, fmt.Sprintf("backend %q is not installed; run: docker model install-runner --backend %s", backend.Name(), backend.Name()), http.StatusPreconditionFailed)
 		} else if errors.Is(err, vllm.ErrorNotFound) {
 			http.Error(w, err.Error(), http.StatusPreconditionFailed)
 		} else {
@@ -336,6 +341,38 @@ func (h *HTTPHandler) Unload(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
+// installBackendRequest is the JSON body for the install-backend endpoint.
+// It is decoded from objects of the form {"backend": "<name>"}.
+type installBackendRequest struct {
+	// Backend is the name of the backend to install; the handler rejects
+	// requests where it is empty.
+	Backend string `json:"backend"`
+}
+
+// InstallBackend handles POST <inference-prefix>/install-backend requests.
+// It triggers on-demand installation of a deferred backend.
+//
+// The request body is a JSON object of the form {"backend": "<name>"}.
+// Responses:
+//   - 200 if h.scheduler.InstallBackend returns nil;
+//   - 400 if the body is not valid JSON or names no backend;
+//   - 404 if the scheduler reports ErrBackendNotFound;
+//   - 413 if the body exceeds maximumOpenAIInferenceRequestSize;
+//   - 500 if the body cannot be read for another reason or installation
+//     fails with any other error.
+func (h *HTTPHandler) InstallBackend(w http.ResponseWriter, r *http.Request) {
+	body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, maximumOpenAIInferenceRequestSize))
+	if err != nil {
+		// An oversized body is a client error; report it as such rather
+		// than as an internal server failure.
+		var tooLarge *http.MaxBytesError
+		if errors.As(err, &tooLarge) {
+			http.Error(w, "request body too large", http.StatusRequestEntityTooLarge)
+		} else {
+			http.Error(w, "failed to read request body", http.StatusInternalServerError)
+		}
+		return
+	}
+
+	var req installBackendRequest
+	if err := json.Unmarshal(body, &req); err != nil || req.Backend == "" {
+		http.Error(w, "invalid request: backend is required", http.StatusBadRequest)
+		return
+	}
+
+	// Installation runs under the request context, so it is abandoned if
+	// the client goes away.
+	if err := h.scheduler.InstallBackend(r.Context(), req.Backend); err != nil {
+		if errors.Is(err, ErrBackendNotFound) {
+			http.Error(w, err.Error(), http.StatusNotFound)
+		} else {
+			http.Error(w, fmt.Sprintf("backend installation failed: %v", err), http.StatusInternalServerError)
+		}
+		return
+	}
+
+	w.WriteHeader(http.StatusOK)
+}
+
 // Configure handles POST <inference-prefix>/{backend}/_configure requests.
 func (h *HTTPHandler) Configure(w http.ResponseWriter, r *http.Request) {
 	// Determine the requested backend and ensure that it's valid.
diff --git a/pkg/inference/scheduling/installer.go b/pkg/inference/scheduling/installer.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"net/http"
+	"sync"
 	"sync/atomic"
 
 	"github.com/docker/model-runner/pkg/inference"
@@ -17,6 +18,9 @@ var (
 	// errInstallerShuttingDown indicates that the installer's run loop has been
 	// terminated and the installer is shutting down.
 	errInstallerShuttingDown = errors.New("backend installer shutting down")
+	// errBackendNotInstalled indicates that a deferred backend has not been
+	// installed. Callers should install it via installBackend before use.
+	errBackendNotInstalled = errors.New("backend not installed")
 )
 
 // installStatus tracks the installation status of a backend.
@@ -44,14 +48,28 @@ type installer struct {
 	started atomic.Bool
 	// statuses maps backend names to their installation statuses.
 	statuses map[string]*installStatus
+	// deferredBackends tracks backends whose installation is deferred until
+	// explicitly requested via installBackend.
+	deferredBackends map[string]bool
+	// mu protects on-demand installation via installBackend.
+	mu sync.Mutex
 }
 
-// newInstaller creates a new backend installer.
+// newInstaller creates a new backend installer. Backends listed in
+// deferredBackends are skipped during the automatic run loop and must be
+// installed on-demand via installBackend.
 func newInstaller(
 	log logging.Logger,
 	backends map[string]inference.Backend,
 	httpClient *http.Client,
+	deferredBackends []string,
 ) *installer {
+	// Build the deferred set.
+	deferred := make(map[string]bool, len(deferredBackends))
+	for _, name := range deferredBackends {
+		deferred[name] = true
+	}
+
 	// Create status trackers.
 	statuses := make(map[string]*installStatus, len(backends))
 	for name := range backends {
@@ -63,10 +81,11 @@ func newInstaller(
 
 	// Create the installer.
 	return &installer{
-		log:        log,
-		backends:   backends,
-		httpClient: httpClient,
-		statuses:   statuses,
+		log:              log,
+		backends:         backends,
+		httpClient:       httpClient,
+		statuses:         statuses,
+		deferredBackends: deferred,
 	}
 }
 
@@ -84,6 +103,22 @@ func (i *installer) run(ctx context.Context) {
 	// ubiquitous backend and mlx as a relatively lightweight backend (on macOS
 	// only), this granularity is probably less of a concern.
 	for name, backend := range i.backends {
+		// For deferred backends, check if they are already installed on disk
+		// from a previous session. Only call Install() (which verifies the
+		// existing installation) when files are present, to avoid triggering
+		// a download.
+		if i.deferredBackends[name] {
+			status := i.statuses[name]
+			if diskUsage, err := backend.GetDiskUsage(); err == nil && diskUsage > 0 {
+				if err := backend.Install(ctx, i.httpClient); err == nil {
+					close(status.installed)
+				}
+			}
+			// If not on disk, leave channels open so wait() returns
+			// errBackendNotInstalled.
+			continue
+		}
+
 		status := i.statuses[name]
 
 		var installedClosed bool
@@ -114,13 +149,29 @@ func (i *installer) run(ctx context.Context) {
 }
 
 // wait waits for installation of the specified backend to complete or fail.
+// For deferred backends that have never been installed, it returns
+// errBackendNotInstalled immediately instead of blocking.
 func (i *installer) wait(ctx context.Context, backend string) error {
 	// Grab the backend status.
 	status, ok := i.statuses[backend]
 	if !ok {
 		return ErrBackendNotFound
 	}
 
+	// For deferred backends, check whether installation has completed without
+	// blocking. This doesn't depend on the installer being started, since
+	// deferred backends are installed on-demand, not by the run loop.
+	if i.deferredBackends[backend] {
+		select {
+		case <-status.installed:
+			return nil
+		case <-status.failed:
+			return status.err
+		default:
+			return errBackendNotInstalled
+		}
+	}
+
 	// If the installer hasn't started, then don't poll for readiness, because
 	// it may never come. If it has started, then even if it's cancelled we can
 	// be sure that we'll at least see failure for all backend installations.
@@ -138,3 +189,60 @@ func (i *installer) wait(ctx context.Context, backend string) error {
 		return status.err
 	}
 }
+
+// installBackend triggers on-demand installation of the named backend.
+// It is idempotent: if the backend is already installed, it returns nil.
+//
+// Calls are serialized by i.mu, so at most one on-demand installation runs
+// at a time. It returns ErrBackendNotFound for an unknown name, and otherwise
+// the error from the backend's Install method, if any. A failure caused by
+// cancellation or expiry of ctx is returned but not recorded, so the backend
+// stays in the "not yet installed" state rather than the "failed" state.
+//
+// NOTE(review): i.statuses is replaced below while holding i.mu, but wait and
+// isInstalled read it without taking i.mu — confirm whether those readers
+// need the lock as well.
+func (i *installer) installBackend(ctx context.Context, name string) error {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	backend, ok := i.backends[name]
+	if !ok {
+		return ErrBackendNotFound
+	}
+
+	status := i.statuses[name]
+
+	// Already installed — nothing to do.
+	select {
+	case <-status.installed:
+		return nil
+	default:
+	}
+
+	// If previously failed, reset status for retry. Closed channels cannot
+	// be reopened, so a fresh status replaces the old one.
+	select {
+	case <-status.failed:
+		status = &installStatus{
+			installed: make(chan struct{}),
+			failed:    make(chan struct{}),
+		}
+		i.statuses[name] = status
+	default:
+	}
+
+	// Perform installation.
+	if err := backend.Install(ctx, i.httpClient); err != nil {
+		// A cancelled or expired context says nothing about whether the
+		// backend can be installed. Leave the status pending in that case
+		// so that, for deferred backends, wait keeps reporting
+		// errBackendNotInstalled instead of a stale cancellation error.
+		if ctx.Err() != nil {
+			return err
+		}
+		status.err = err
+		close(status.failed)
+		return err
+	}
+
+	close(status.installed)
+	return nil
+}
+
+// isInstalled reports whether the named backend's installed channel has been
+// closed. It never blocks, and returns false for a name with no status entry.
+func (i *installer) isInstalled(name string) bool {
+	if status, known := i.statuses[name]; known {
+		select {
+		case <-status.installed:
+			return true
+		default:
+			// Not closed yet; fall through to the negative result.
+		}
+	}
+	return false
+}
diff --git a/pkg/inference/scheduling/scheduler.go b/pkg/inference/scheduling/scheduler.go
diff --git a/pkg/inference/scheduling/scheduler_test.go b/pkg/inference/scheduling/scheduler_test.go