pytorch · huydhn · Jun 10, 2026 · Jun 10, 2026 · Jun 11, 2026 · Jun 11, 2026
@@ -0,0 +1,68 @@
+# PrometheusRule for the BuildKit autoscaling path: two alerts on KEDA error
+# counters and one on the HAProxy backend queue for the BuildKit pools.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: buildkit-autoscaling-alerts
+  namespace: monitoring
+  labels:
+    app.kubernetes.io/part-of: osdc-monitoring
+spec:
+  groups:
+    - name: buildkit-autoscaling
+      rules:
+        # Scaler-level failures: KEDA could not read the scale metric. If that
+        # outlasts the ScaledObject's failureThreshold, KEDA stops scaling and
+        # falls back to the fixed pool.
+        # NOTE(review): the 15m increase() window is longer than `for: 10m`, so
+        # one isolated error can keep the expression true long enough to fire —
+        # confirm that is intended.
+        - alert: BuildkitKedaScalerErrors
+          expr: |
+            sum by (scaledObject) (increase(keda_scaler_errors_total[15m])) > 0
+          for: 10m
+          labels:
+            priority: P3
+            severity: warning
+            team: pytorch-dev-infra
+          annotations:
+            summary: "KEDA can't read the scale metric for {{ $labels.scaledObject }}"
+            description: >-
+              KEDA scaler errors for {{ $labels.scaledObject }} over the last 15m;
+              sustained errors trip the fallback to the fixed BuildKit pool.
+
+        # Reconcile-level failures: KEDA could not reconcile the ScaledObject,
+        # so autoscaling for that arch may be stale.
+        # NOTE(review): confirm this metric name matches what the KEDA exporter
+        # in this cluster emits; KEDA's native Prometheus endpoint documents the
+        # counter as keda_scaled_object_errors_total.
+        - alert: BuildkitKedaScaledObjectErrors
+          expr: |
+            sum by (scaledObject) (increase(keda_scaledobject_errors_total[15m])) > 0
+          for: 10m
+          labels:
+            priority: P3
+            severity: warning
+            team: pytorch-dev-infra
+          annotations:
+            summary: "KEDA ScaledObject {{ $labels.scaledObject }} reconcile errors"
+            description: >-
+              KEDA failed to reconcile ScaledObject {{ $labels.scaledObject }} in the
+              last 15m; autoscaling for that arch may be stale.
+
+        # Queue backlog: the HAProxy backend queue for a BuildKit backend has
+        # been non-empty for 15m, i.e. builds are stuck waiting for a pod
+        # because the pool isn't scaling up fast enough.
+        - alert: BuildkitQueueBacklog
+          expr: |
+            haproxy_backend_current_queue{proxy=~"bk_amd64|bk_arm64"} > 0
+          for: 15m
+          labels:
+            priority: P3
+            severity: warning
+            team: pytorch-dev-infra
+          annotations:
+            summary: "BuildKit {{ $labels.proxy }} has builds queued for 15m"
+            description: >-
+              Builds have been waiting in the {{ $labels.proxy }} queue for 15m — the
+              pool isn't scaling up fast enough (or is at max) to meet demand.
@@ -3,6 +3,7 @@ kind: Kustomization
 
 resources:
   - arc-alerts.yaml
+  - buildkit-autoscaling-alerts.yaml
   - infrastructure-alerts.yaml
   - gpu-alerts.yaml
   - node-compactor-alerts.yaml