98 changes: 98 additions & 0 deletions pkg/controllers/provisioning/nodeclaim_limits_regression_test.go
@@ -0,0 +1,98 @@
package provisioning_test

import (
"context"
"testing"
"time"

"github.com/awslabs/operatorpkg/status"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/record"
clock "k8s.io/utils/clock/testing"
"sigs.k8s.io/controller-runtime/pkg/client"
fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/cloudprovider/fake"
"sigs.k8s.io/karpenter/pkg/controllers/provisioning"
"sigs.k8s.io/karpenter/pkg/controllers/state"
"sigs.k8s.io/karpenter/pkg/events"
"sigs.k8s.io/karpenter/pkg/operator/options"
"sigs.k8s.io/karpenter/pkg/test"
)

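// TestPendingNodeClaimPreventsOverProvisioning verifies that a NodeClaim that has been
// created but not yet launched still counts against the NodePool's 2-CPU limit, so a
// second scheduling round for an equivalent pod must not create another NodeClaim.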
func TestPendingNodeClaimPreventsOverProvisioning(t *testing.T) {
t.Parallel()

ctx := options.ToContext(context.Background(), test.Options())
kubeClient := fakeclient.NewClientBuilder().
WithScheme(scheme.Scheme).
WithIndex(&corev1.Pod{}, "spec.nodeName", func(obj client.Object) []string {
return []string{obj.(*corev1.Pod).Spec.NodeName}
}).
Build()

cloudProvider := fake.NewCloudProvider()
fakeClock := clock.NewFakeClock(time.Now())
cluster := state.NewCluster(fakeClock, kubeClient, cloudProvider)
prov := provisioning.NewProvisioner(kubeClient, events.NewRecorder(&record.FakeRecorder{}), cloudProvider, cluster, fakeClock)

nodePool := test.NodePool(v1.NodePool{
Spec: v1.NodePoolSpec{
Limits: v1.Limits(corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("2"),
}),
},
})
nodePool.StatusConditions().SetTrue(status.ConditionReady)
if err := kubeClient.Create(ctx, nodePool); err != nil {
t.Fatalf("creating nodepool, %v", err)
}

pod1 := test.UnschedulablePod(test.PodOptions{
ResourceRequirements: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1.75"),
},
},
})
if err := kubeClient.Create(ctx, pod1); err != nil {
t.Fatalf("creating first pod, %v", err)
}

firstResults, err := prov.Schedule(ctx)
if err != nil {
t.Fatalf("scheduling first pod, %v", err)
}
if got := len(firstResults.NewNodeClaims); got != 1 {
t.Fatalf("expected first scheduling round to create 1 nodeclaim, got %d", got)
}
if _, err = prov.Create(ctx, firstResults.NewNodeClaims[0]); err != nil {
t.Fatalf("creating first nodeclaim, %v", err)
}

if err := kubeClient.Delete(ctx, pod1); err != nil {
t.Fatalf("deleting first pod, %v", err)
}

pod2 := test.UnschedulablePod(test.PodOptions{
ResourceRequirements: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1.75"),
},
},
})
if err := kubeClient.Create(ctx, pod2); err != nil {
t.Fatalf("creating second pod, %v", err)
}

secondResults, err := prov.Schedule(ctx)
if err != nil {
t.Fatalf("scheduling second pod, %v", err)
}
if got := len(secondResults.NewNodeClaims); got != 0 {
t.Fatalf("expected second scheduling round to create 0 nodeclaims after an unlaunched nodeclaim consumed the nodepool limit, got %d", got)
}
}
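A minimal standalone sketch, not Karpenter's actual limit accounting, of the invariant the regression test above asserts: once the capacity of the still-unlaunched NodeClaim is tracked in cluster state, the 2-CPU NodePool limit leaves no headroom for a second node, so the second scheduling round must produce zero NodeClaims. The helper name and the concrete quantities below are illustrative.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// wouldExceedLimit reports whether launching one more node on top of the
// capacity already tracked for in-flight NodeClaims would break the limit.
func wouldExceedLimit(limit, tracked, candidate resource.Quantity) bool {
	total := tracked.DeepCopy()
	total.Add(candidate)
	return total.Cmp(limit) > 0
}

func main() {
	limit := resource.MustParse("2")     // NodePool CPU limit from the test
	tracked := resource.MustParse("2")   // illustrative capacity of the pending, unlaunched NodeClaim
	candidate := resource.MustParse("2") // illustrative capacity of a second node for the 1.75-CPU pod
	fmt.Println(wouldExceedLimit(limit, tracked, candidate)) // true: the second round creates nothing
}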
31 changes: 21 additions & 10 deletions pkg/controllers/provisioning/provisioner.go
@@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
@@ -20,16 +20,11 @@ import (
"context"
"errors"
"fmt"
"math"
"strings"
"time"

"github.com/awslabs/operatorpkg/option"
"github.com/awslabs/operatorpkg/reconciler"
"github.com/awslabs/operatorpkg/serrors"
"github.com/awslabs/operatorpkg/singleton"
"github.com/awslabs/operatorpkg/status"

"github.com/samber/lo"
"go.uber.org/multierr"
appsv1 "k8s.io/api/apps/v1"
@@ -38,25 +33,27 @@
"k8s.io/client-go/util/workqueue"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
"math"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/manager"

"sigs.k8s.io/karpenter/pkg/operator/options"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
"sigs.k8s.io/karpenter/pkg/cloudprovider"
scheduler "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling"
"sigs.k8s.io/karpenter/pkg/controllers/state"
"sigs.k8s.io/karpenter/pkg/events"
"sigs.k8s.io/karpenter/pkg/metrics"
"sigs.k8s.io/karpenter/pkg/operator/injection"
"sigs.k8s.io/karpenter/pkg/operator/options"
"sigs.k8s.io/karpenter/pkg/scheduling"
"sigs.k8s.io/karpenter/pkg/utils/daemonset"
nodeutils "sigs.k8s.io/karpenter/pkg/utils/node"
nodepoolutils "sigs.k8s.io/karpenter/pkg/utils/nodepool"
"sigs.k8s.io/karpenter/pkg/utils/pretty"
"sigs.k8s.io/karpenter/pkg/utils/resources"
"strings"
"time"
)

// LaunchOptions are the set of options that can be used to trigger certain
@@ -456,7 +453,7 @@ func (p *Provisioner) Create(ctx context.Context, n *scheduler.NodeClaim, opts .
// requeue. This can race with controller-runtime's internal cache as it watches events on the cluster
// to then trigger cluster state updates. Triggering it manually ensures that Karpenter waits for the
// internal cache to sync before moving onto another disruption loop.
p.cluster.UpdateNodeClaim(nodeClaim)
p.cluster.UpdateNodeClaim(nodeClaimForState(nodeClaim, n.InstanceTypeOptions))
if option.Resolve(opts...).RecordPodNomination {
for _, pod := range n.Pods {
p.recorder.Publish(scheduler.NominatePodEvent(pod, nil, nodeClaim))
@@ -600,3 +597,17 @@ func validateNodeSelectorTerm(ctx context.Context, term corev1.NodeSelectorTerm)
}
return errs
}
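
// nodeClaimForState returns the NodeClaim to record in cluster state. If the claim has
// not launched yet (no ProviderID), a copy is stamped with a provisional ProviderID and
// with the element-wise maximum capacity and allocatable across the remaining instance
// type options, so NodePool limit accounting sees the in-flight claim.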
func nodeClaimForState(nodeClaim *v1.NodeClaim, instanceTypes []*cloudprovider.InstanceType) *v1.NodeClaim {
if nodeClaim.Status.ProviderID != "" || len(instanceTypes) == 0 {
return nodeClaim
}
tracked := nodeClaim.DeepCopy()
tracked.Status.ProviderID = tracked.Name
tracked.Status.Capacity = resources.MaxResources(lo.Map(instanceTypes, func(instanceType *cloudprovider.InstanceType, _ int) corev1.ResourceList {
return instanceType.Capacity
})...)
tracked.Status.Allocatable = resources.MaxResources(lo.Map(instanceTypes, func(instanceType *cloudprovider.InstanceType, _ int) corev1.ResourceList {
return instanceType.Allocatable()
})...)
return tracked
}
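The helper above stamps an unlaunched NodeClaim with a provisional ProviderID and with the element-wise maximum of capacity and allocatable across its remaining instance-type options, so cluster state can count the in-flight claim against NodePool limits before the cloud provider has launched anything. Below is a rough standalone sketch of that element-wise maximum, under the assumption that the hypothetical maxResourceLists mirrors what resources.MaxResources does in the call above.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// maxResourceLists keeps, for every resource name seen in any input list, the
// largest quantity. Recording this "worst case" capacity on the pending
// NodeClaim is what lets limit checks see it before launch.
func maxResourceLists(lists ...corev1.ResourceList) corev1.ResourceList {
	out := corev1.ResourceList{}
	for _, list := range lists {
		for name, quantity := range list {
			if current, ok := out[name]; !ok || quantity.Cmp(current) > 0 {
				out[name] = quantity.DeepCopy()
			}
		}
	}
	return out
}

func main() {
	small := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("2"),
		corev1.ResourceMemory: resource.MustParse("4Gi"),
	}
	large := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("4"),
		corev1.ResourceMemory: resource.MustParse("2Gi"),
	}
	combined := maxResourceLists(small, large)
	cpu, memory := combined[corev1.ResourceCPU], combined[corev1.ResourceMemory]
	fmt.Println(cpu.String(), memory.String()) // 4 4Gi: the per-resource worst case across both lists
}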