monitoring: add buildkit autoscaling alerts

huydhn · huydhn · commit 36060743e5fb · 2026-06-11T14:09:43.000-07:00
Adds alerts for KEDA scaler/ScaledObject errors (fallback risk) and for HAProxy queue backlog (pool not scaling fast enough). Uses metrics from the KEDA ServiceMonitor and the existing buildkit-haproxy scrape, whose metric keep-list is extended to include haproxy_backend_current_queue. ghstack-source-id: 3400f35 Pull-Request: #727
diff --git a/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml b/osdc/modules/monitoring/kubernetes/alerts/buildkit-autoscaling-alerts.yaml
@@ -0,0 +1,51 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # alert rules for BuildKit autoscaling: KEDA errors and HAProxy queue backlog
+metadata:
+  name: buildkit-autoscaling-alerts
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/part-of: osdc-monitoring  # NOTE(review): presumably what the Prometheus ruleSelector matches — confirm
+spec:
+  groups:
+    - name: buildkit-autoscaling  # single rule group holding the three alerts in this file
+      rules:
+        # Fires when KEDA reports scaler errors, i.e. it can't read the scale metric. If that persists past the
+        # ScaledObject's failureThreshold, KEDA drops to the fixed fallback pool instead of scaling.
+        - alert: BuildkitKedaScalerErrors  # NOTE(review): expr has no label matcher, so every ScaledObject exporting this metric is covered, not only BuildKit ones — confirm intended
+          expr: |
+            sum by (scaledObject) (increase(keda_scaler_detail_errors_total[15m])) > 0
+          for: 10m  # NOTE(review): one isolated error stays inside the 15m window, so it alone can satisfy this 10m hold — confirm that sensitivity is intended
+          labels:  # NOTE(review): presumably consumed by alert routing — confirm against the Alertmanager config
+            severity: warning
+            team: pytorch-dev-infra
+            priority: P3
+          annotations:
+            summary: "KEDA can't read the scale metric for {{ $labels.scaledObject }}"
+            description: "KEDA scaler errors for {{ $labels.scaledObject }} over the last 15m; sustained errors trip the fallback to the fixed BuildKit pool."
+
+        - alert: BuildkitKedaScaledObjectErrors  # ScaledObject error counter grew in the last 15m, evaluated per scaledObject label; NOTE(review): no label matcher, so non-BuildKit ScaledObjects also match — confirm intended
+          expr: |
+            sum by (scaledObject) (increase(keda_scaled_object_errors_total[15m])) > 0
+          for: 10m  # condition must hold continuously for 10m before the alert fires
+          labels:  # NOTE(review): presumably consumed by alert routing — confirm against the Alertmanager config
+            severity: warning
+            team: pytorch-dev-infra
+            priority: P3
+          annotations:
+            summary: "KEDA ScaledObject {{ $labels.scaledObject }} reconcile errors"
+            description: "KEDA failed to reconcile ScaledObject {{ $labels.scaledObject }} in the last 15m; autoscaling for that arch may be stale."
+
+        # Fires on a real backlog the pool can't keep up with, evaluated separately for each matched
+        # HAProxy backend (bk_amd64, bk_arm64). The >20 threshold (not >0) avoids firing on normal burst
+        # churn, where small batches keep the queue briefly non-zero but still drain within minutes as pods scale up.
+        - alert: BuildkitQueueBacklog
+          expr: |
+            haproxy_backend_current_queue{proxy=~"bk_amd64|bk_arm64"} > 20
+          for: 15m  # queue depth must stay above 20 on every evaluation for 15m; a dip to 20 or below resets the timer
+          labels:  # NOTE(review): presumably consumed by alert routing — confirm against the Alertmanager config
+            severity: warning
+            team: pytorch-dev-infra
+            priority: P3
+          annotations:
+            summary: "BuildKit {{ $labels.proxy }} backlog: >20 builds queued for 15m"
+            description: "More than 20 builds have been waiting in the {{ $labels.proxy }} queue for 15m — beyond normal burst churn; the pool isn't scaling up fast enough (or is at max)."
diff --git a/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml b/osdc/modules/monitoring/kubernetes/alerts/kustomization.yaml
@@ -3,6 +3,7 @@ kind: Kustomization
 
 resources:
   - arc-alerts.yaml
+  - buildkit-autoscaling-alerts.yaml
   - infrastructure-alerts.yaml
   - gpu-alerts.yaml
   - node-compactor-alerts.yaml
diff --git a/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml b/osdc/modules/monitoring/kubernetes/monitors/servicemonitors/buildkit-haproxy.yaml
@@ -20,4 +20,4 @@ spec:
         # Keep only operationally important HAProxy metrics
         - action: keep
           sourceLabels: [__name__]
-          regex: "haproxy_server_status|haproxy_server_current_sessions|haproxy_server_connection_errors_total|haproxy_backend_current_sessions"
+          regex: "haproxy_server_status|haproxy_server_current_sessions|haproxy_server_connection_errors_total|haproxy_backend_current_sessions|haproxy_backend_current_queue"