gpuaudit · maksimov · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/cmd/gpuaudit/main.go b/cmd/gpuaudit/main.go
@@ -13,6 +13,9 @@ import (
 
 	"github.com/spf13/cobra"
 
+	awsconfig "github.com/aws/aws-sdk-go-v2/config"
+	"github.com/aws/aws-sdk-go-v2/service/cloudwatch"
+
 	"github.com/gpuaudit/cli/internal/analysis"
 	"github.com/gpuaudit/cli/internal/diff"
 	"github.com/gpuaudit/cli/internal/models"
@@ -50,6 +53,8 @@ var (
 	scanSkipCosts     bool
 	scanKubeconfig    string
 	scanKubeContext   string
+	scanPromURL       string
+	scanPromEndpoint  string
 	scanExcludeTags   []string
 	scanMinUptimeDays int
 	scanTargets       []string
@@ -88,6 +93,8 @@ func init() {
 	scanCmd.Flags().BoolVar(&scanSkipCosts, "skip-costs", false, "Skip Cost Explorer data enrichment")
 	scanCmd.Flags().StringVar(&scanKubeconfig, "kubeconfig", "", "Path to kubeconfig file (default: ~/.kube/config)")
 	scanCmd.Flags().StringVar(&scanKubeContext, "kube-context", "", "Kubernetes context to use (default: current context)")
+	scanCmd.Flags().StringVar(&scanPromURL, "prom-url", "", "Prometheus URL for GPU metrics (e.g., https://prometheus.corp.example.com)")
+	scanCmd.Flags().StringVar(&scanPromEndpoint, "prom-endpoint", "", "In-cluster Prometheus service as namespace/service:port (e.g., monitoring/prometheus:9090)")
 	scanCmd.Flags().StringSliceVar(&scanExcludeTags, "exclude-tag", nil, "Exclude instances matching tag (key=value, repeatable)")
 	scanCmd.Flags().IntVar(&scanMinUptimeDays, "min-uptime-days", 0, "Only flag instances running for at least this many days")
 	scanCmd.Flags().StringSliceVar(&scanTargets, "targets", nil, "Account IDs to scan (comma-separated)")
@@ -107,6 +114,10 @@ func init() {
 }
 
 func runScan(cmd *cobra.Command, args []string) error {
+	if scanPromURL != "" && scanPromEndpoint != "" {
+		return fmt.Errorf("--prom-url and --prom-endpoint are mutually exclusive")
+	}
+
 	ctx := context.Background()
 
 	if (len(scanTargets) > 0 || scanOrg) && scanRole == "" {
@@ -128,8 +139,10 @@ func runScan(cmd *cobra.Command, args []string) error {
 	opts.OrgScan = scanOrg
 	opts.SkipSelf = scanSkipSelf
 
+	awsAvailable := true
 	result, err := awsprovider.Scan(ctx, opts)
 	if err != nil {
+		awsAvailable = false
 		if scanSkipK8s {
 			return fmt.Errorf("scan failed: %w", err)
 		}
@@ -141,13 +154,18 @@ func runScan(cmd *cobra.Command, args []string) error {
 	// Kubernetes API scan
 	if !scanSkipK8s {
 		k8sOpts := k8sprovider.ScanOptions{
-			Kubeconfig: scanKubeconfig,
-			Context:    scanKubeContext,
+			Kubeconfig:   scanKubeconfig,
+			Context:      scanKubeContext,
+			PromURL:      scanPromURL,
+			PromEndpoint: scanPromEndpoint,
 		}
 		k8sInstances, err := k8sprovider.Scan(ctx, k8sOpts)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "  warning: Kubernetes scan failed: %v\n", err)
 		} else if len(k8sInstances) > 0 {
+			if !scanSkipMetrics {
+				enrichK8sGPUMetrics(ctx, k8sInstances, k8sOpts, opts, awsAvailable)
+			}
 			analysis.AnalyzeAll(k8sInstances)
 			result.Instances = append(result.Instances, k8sInstances...)
 			result.Summary = awsprovider.BuildSummary(result.Instances)
@@ -383,3 +401,60 @@ func parseExcludeTags(raw []string) map[string]string {
 	}
 	return tags
 }
+
+func enrichK8sGPUMetrics(ctx context.Context, instances []models.GPUInstance, k8sOpts k8sprovider.ScanOptions, awsOpts awsprovider.ScanOptions, awsAvailable bool) {
+	// Source 1: CloudWatch Container Insights (skip if AWS creds unavailable)
+	if awsAvailable && len(instances) > 0 && instances[0].ClusterName != "" {
+		cfgOpts := []func(*awsconfig.LoadOptions) error{}
+		if awsOpts.Profile != "" {
+			cfgOpts = append(cfgOpts, awsconfig.WithSharedConfigProfile(awsOpts.Profile))
+		}
+		cfg, err := awsconfig.LoadDefaultConfig(ctx, cfgOpts...)
+		if err == nil {
+			region := instances[0].Region
+			if region == "" {
+				region = "us-east-1"
+			}
+			cfg.Region = region
+			cwClient := cloudwatch.NewFromConfig(cfg)
+			fmt.Fprintf(os.Stderr, "  Enriching K8s GPU metrics via CloudWatch Container Insights...\n")
+			awsprovider.EnrichK8sGPUMetrics(ctx, cwClient, instances, instances[0].ClusterName, awsprovider.DefaultMetricWindow)
+		}
+	}
+
+	// Source 2: DCGM exporter scrape
+	remaining := 0
+	for _, inst := range instances {
+		if inst.Source == models.SourceK8sNode && inst.AvgGPUUtilization == nil {
+			remaining++
+		}
+	}
+	if remaining > 0 {
+		client, _, err := k8sprovider.BuildClientPublic(k8sOpts.Kubeconfig, k8sOpts.Context)
+		if err == nil {
+			k8sprovider.EnrichDCGMMetrics(ctx, client, instances)
+		}
+	}
+
+	// Source 3: Prometheus query
+	remaining = 0
+	for _, inst := range instances {
+		if inst.Source == models.SourceK8sNode && inst.AvgGPUUtilization == nil {
+			remaining++
+		}
+	}
+	if remaining > 0 && (k8sOpts.PromURL != "" || k8sOpts.PromEndpoint != "") {
+		var client k8sprovider.K8sClient
+		if k8sOpts.PromEndpoint != "" {
+			c, _, err := k8sprovider.BuildClientPublic(k8sOpts.Kubeconfig, k8sOpts.Context)
+			if err == nil {
+				client = c
+			}
+		}
+		promOpts := k8sprovider.PrometheusOptions{
+			URL:      k8sOpts.PromURL,
+			Endpoint: k8sOpts.PromEndpoint,
+		}
+		k8sprovider.EnrichPrometheusMetrics(ctx, client, instances, promOpts)
+	}
+}
diff --git a/docs/specs/2026-04-19-k8s-gpu-metrics-design.md b/docs/specs/2026-04-19-k8s-gpu-metrics-design.md
@@ -0,0 +1,149 @@
+# K8s GPU Metrics Collection
+
+## Goal
+
+Collect GPU utilization metrics for Kubernetes GPU nodes discovered by gpuaudit, using a per-node fallback chain of three sources: CloudWatch Container Insights, DCGM exporter scrape, and Prometheus query. Enable utilization-based waste detection for K8s GPU nodes (currently limited to allocation-based detection only).
+
+## Architecture
+
+Three metrics sources, tried in priority order **per node** (stop at the first source that returns data for a given node):
+
+1. **CloudWatch Container Insights** — AWS API call, no in-cluster access needed beyond what we already have.
+2. **DCGM exporter scrape** — probe port 9400 on dcgm-exporter pods via K8s API proxy.
+3. **Prometheus query** — query a user-configured Prometheus endpoint for historical GPU metrics.
+
+All three populate the same existing fields: `GPUInstance.AvgGPUUtilization` and `GPUInstance.AvgGPUMemUtilization`.
+
+## Data Flow
+
+```
+1. AWS scan → ScanResult (EC2, SageMaker, EKS)
+2. K8s scan → []GPUInstance (nodes + allocation)
+3. Enrich K8s GPU metrics (fallback chain):
+   a. CloudWatch Container Insights (if AWS creds available, !skipMetrics)
+   b. DCGM scrape via K8s API proxy (for nodes still missing metrics)
+   c. Prometheus query (for remaining nodes, if --prom-url or --prom-endpoint set)
+4. AnalyzeAll on K8s instances
+5. Merge into result
+```
+
+Steps 3a through 3c each skip nodes that already have `AvgGPUUtilization` populated by a prior step.
+
+## Source 1: CloudWatch Container Insights
+
+Requires the CloudWatch Observability EKS add-on to be installed in the cluster. If not installed, the query returns empty (not an error) and we fall through.
+
+**Metrics queried:**
+- `node_gpu_utilization` (Average) — maps to `AvgGPUUtilization`
+- `node_gpu_memory_utilization` (Average) — maps to `AvgGPUMemUtilization`
+
+**Namespace:** `ContainerInsights`
+
+**Dimensions:** `ClusterName` + `InstanceId`
+
+**Implementation:** New function `EnrichK8sGPUMetrics(ctx, client CloudWatchClient, instances []GPUInstance, clusterName string, window MetricWindow)` in `internal/providers/aws/cloudwatch.go`, following the same pattern as `EnrichEC2Metrics` and `EnrichSageMakerMetrics`.
+
+**Prerequisites per node:** The node must have an EC2 instance ID (extracted from `providerID`). Non-AWS nodes are skipped for this source.
+
+**Wiring:** Called from `main.go` after the K8s scan returns instances, passing the CloudWatch client from the AWS config. Only called when AWS credentials are available and `!skipMetrics`.
+
+## Source 2: DCGM Exporter Scrape
+
+Auto-detected, no user configuration needed.
+
+**Discovery:** List pods across all namespaces matching labels `app=nvidia-dcgm-exporter` or `app.kubernetes.io/name=dcgm-exporter`. If no pods found, log `"DCGM exporter not detected, skipping"` and fall through to Prometheus.
+
+**Scraping:** For each GPU node still missing metrics, find the dcgm-exporter pod on that node (match by `pod.Spec.NodeName`), then scrape `/metrics` on port 9400 via the K8s API proxy (`ProxyGet`).
+
+**Metrics parsed:**
+- `DCGM_FI_DEV_GPU_UTIL` — maps to `AvgGPUUtilization`
+- `DCGM_FI_DEV_MEM_COPY_UTIL` — maps to `AvgGPUMemUtilization`
+
+These are point-in-time values, not historical averages. The analysis rule's confidence (0.85 vs 0.9) accounts for this lower fidelity.
+
+**Prometheus text format parsing:** Use `prometheus/common/expfmt` to parse the scrape response.
+
+**K8s client extension:** Add `ProxyGet(ctx, namespace, podName, port, path string) ([]byte, error)` to the `K8sClient` interface. Wraps `clientset.CoreV1().Pods(ns).ProxyGet()`.
+
+**Stderr output:**
+```
+  Probing DCGM exporter on GPU nodes...
+  DCGM: got GPU metrics for 3 of 5 remaining nodes
+```
+
+## Source 3: Prometheus Query
+
+Only attempted when `--prom-url` or `--prom-endpoint` is provided. No auto-discovery.
+
+**CLI flags:**
+- `--prom-url` — full URL to a Prometheus-compatible API (e.g., `https://prometheus.corp.example.com`, AMP endpoint, Grafana Cloud). Hit directly via HTTP.
+- `--prom-endpoint` — in-cluster service as `namespace/service:port` (e.g., `monitoring/prometheus:9090`). Proxied through the K8s API server.
+
+These flags are mutually exclusive. Error if both are set.
+
+**Query:** Batch all remaining nodes into one PromQL query:
+```
+avg_over_time(DCGM_FI_DEV_GPU_UTIL{node=~"node1|node2|..."}[7d])
+```
+And similarly for `DCGM_FI_DEV_MEM_COPY_UTIL`.
+
+**API:** HTTP GET to `/api/v1/query`, parse the standard Prometheus JSON response. No client library — plain `net/http` for direct URLs, K8s API proxy for in-cluster endpoints.
+
+**Stderr output:**
+```
+  Querying Prometheus at monitoring/prometheus:9090...
+  Prometheus: got GPU metrics for 2 of 3 remaining nodes
+```
+
+## Analysis Rule
+
+New rule `ruleK8sLowGPUUtil` in `internal/analysis/rules.go`:
+
+- **Source filter:** `SourceK8sNode` only
+- **Guard:** `AvgGPUUtilization != nil` (skip nodes where no metrics were collected)
+- **Threshold:** average GPU utilization < 10%
+- **Signal type:** `low_utilization`
+- **Severity:** Critical
+- **Confidence:** 0.85
+- **Recommendation:** "GPU utilization averaging X%. Consider bin-packing more workloads, downsizing, or removing from the node pool."
+- **Savings estimate:** `MonthlyCost * 0.8` (same rough estimate as SageMaker equivalent)
+
+**Interplay with `ruleK8sUnallocatedGPU`:** Both rules can fire on the same node. Unallocated detects zero pod scheduling (allocation-based). Low-util detects pods that are scheduled but barely using the GPU (utilization-based). Different problems, different fixes.
+
+## File Changes
+
+- **Modify:** `internal/providers/aws/cloudwatch.go` — add `EnrichK8sGPUMetrics()`
+- **Create:** `internal/providers/k8s/metrics.go` — DCGM scraping, Prometheus querying, fallback orchestration
+- **Create:** `internal/providers/k8s/metrics_test.go` — tests for DCGM and Prometheus paths
+- **Modify:** `internal/providers/k8s/discover.go` — extend `K8sClient` interface with `ProxyGet` (DCGM pod discovery uses existing `ListPods` with label selector)
+- **Modify:** `internal/providers/k8s/scanner.go` — wire metrics enrichment into the K8s scan, accept new options
+- **Modify:** `internal/analysis/rules.go` — add `ruleK8sLowGPUUtil`
+- **Modify:** `internal/analysis/rules_test.go` — tests for the new rule
+- **Modify:** `cmd/gpuaudit/main.go` — add `--prom-url` and `--prom-endpoint` flags, wire CloudWatch enrichment for K8s instances
+
+## Error Handling
+
+- **CloudWatch returns empty:** Not an error. Container Insights add-on probably not installed. Fall through to DCGM.
+- **No EC2 instance ID on a node:** Skip CW enrichment for that node (non-AWS or providerID not set).
+- **No dcgm-exporter pods found:** Log on stderr, fall through to Prometheus.
+- **DCGM scrape fails for a node:** Warn on stderr, continue with other nodes. Don't fail the scan.
+- **Prometheus endpoint unreachable:** Warn on stderr, continue without metrics for remaining nodes.
+- **Both `--prom-url` and `--prom-endpoint` set:** Return an error at flag validation time.
+
+## New Dependencies
+
+- `prometheus/common/expfmt` — for parsing Prometheus text format from DCGM exporter scrapes. Small, well-established library.
+
+## IAM Policy
+
+No new IAM permissions required. `EnrichK8sGPUMetrics` uses the existing `cloudwatch:GetMetricData` permission already in the IAM policy output.
+
+## RBAC
+
+The K8s API proxy calls (`ProxyGet` to pods) require the `pods/proxy` resource permission. For DCGM scraping:
+```
+- apiGroups: [""]
+  resources: ["pods/proxy"]
+  verbs: ["get"]
+```
+This should be documented and added to any RBAC guide.