kubernetes-sigs · mkoushni · Mar 9, 2026 · Mar 9, 2026 · Mar 11, 2026 · Mar 16, 2026
diff --git a/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml b/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml
@@ -59,31 +59,13 @@ spec:
               # to give upstream gateways a chance to take us out of rotation. The time we wait
               # is dependent on the time it takes for all upstreams to completely remove us from
               # rotation. Older or simpler load balancers might take upwards of 30s, but we expect
-              # our deployment to run behind a modern gateway like Envoy which is designed to 
+              # our deployment to run behind a modern gateway like Envoy which is designed to
               # probe for readiness aggressively.
-              sleep:
-                # Upstream gateway probers for health should be set on a low period, such as 5s,
-                # and the shorter we can tighten that bound the faster that we release
-                # accelerators during controlled shutdowns. However, we should expect variance,
-                # as load balancers may have internal delays, and we don't want to drop requests
-                # normally, so we're often aiming to set this value to a p99 propagation latency
-                # of readiness -> load balancer taking backend out of rotation, not the average.
-                # 
-                # This value is generally stable and must often be experimentally determined on
-                # for a given load balancer and health check period. We set the value here to
-                # the highest value we observe on a supported load balancer, and we recommend
-                # tuning this value down and verifying no requests are dropped.
-                #
-                # If this value is updated, be sure to update terminationGracePeriodSeconds.
-                #
-                seconds: 30
-              #
-              # IMPORTANT: preStop.sleep is beta as of Kubernetes 1.30 - for older versions
-              # replace with this exec action.
-              #exec:
-              #  command:
-              #  - /usr/bin/sleep
-              #  - "30"
+              # Sleep via exec rather than preStop.sleep (beta as of Kubernetes 1.30) to support older clusters.
+              exec:
+                command:
+                  - /usr/bin/sleep
+                  - "30"
           livenessProbe:
             httpGet:
               path: /health
@@ -154,7 +136,7 @@ spec:
       initContainers:
         - name: lora-adapter-syncer
           tty: true
-          stdin: true 
+          stdin: true
           image: registry.k8s.io/gateway-api-inference-extension/lora-syncer:v1.2.1
           restartPolicy: Always
           imagePullPolicy: IfNotPresent

diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
@@ -60,31 +60,13 @@ spec:
               # to give upstream gateways a chance to take us out of rotation. The time we wait
               # is dependent on the time it takes for all upstreams to completely remove us from
               # rotation. Older or simpler load balancers might take upwards of 30s, but we expect
-              # our deployment to run behind a modern gateway like Envoy which is designed to 
+              # our deployment to run behind a modern gateway like Envoy which is designed to
               # probe for readiness aggressively.
-              sleep:
-                # Upstream gateway probers for health should be set on a low period, such as 5s,
-                # and the shorter we can tighten that bound the faster that we release
-                # accelerators during controlled shutdowns. However, we should expect variance,
-                # as load balancers may have internal delays, and we don't want to drop requests
-                # normally, so we're often aiming to set this value to a p99 propagation latency
-                # of readiness -> load balancer taking backend out of rotation, not the average.
-                # 
-                # This value is generally stable and must often be experimentally determined on
-                # for a given load balancer and health check period. We set the value here to
-                # the highest value we observe on a supported load balancer, and we recommend
-                # tuning this value down and verifying no requests are dropped.
-                #
-                # If this value is updated, be sure to update terminationGracePeriodSeconds.
-                #
-                seconds: 30
-              #
-              # IMPORTANT: preStop.sleep is beta as of Kubernetes 1.30 - for older versions
-              # replace with this exec action.
-              #exec:
-              #  command:
-              #  - /usr/bin/sleep
-              #  - "30"
+              # Sleep via exec rather than preStop.sleep (beta as of Kubernetes 1.30) to support older clusters.
+              exec:
+                command:
+                  - /usr/bin/sleep
+                  - "30"
           livenessProbe:
             httpGet:
               path: /health
@@ -155,7 +137,7 @@ spec:
       initContainers:
         - name: lora-adapter-syncer
           tty: true
-          stdin: true 
+          stdin: true
           image: registry.k8s.io/gateway-api-inference-extension/lora-syncer:v1.2.1
           restartPolicy: Always
           imagePullPolicy: IfNotPresent