diff --git a/pkg/providers/v1/aws.go b/pkg/providers/v1/aws.go index 6b6c7084fe..cdc6a3425e 100644 --- a/pkg/providers/v1/aws.go +++ b/pkg/providers/v1/aws.go @@ -390,7 +390,8 @@ type Cloud struct { nodeInformer informercorev1.NodeInformer // Extract the function out to make it easier to test - nodeInformerHasSynced cache.InformerSynced + nodeInformerHasSynced cache.InformerSynced + nodeEventualConsistencyGracePeriod time.Duration eventBroadcaster record.EventBroadcaster eventRecorder record.EventRecorder @@ -600,13 +601,14 @@ func newAWSCloud2(cfg config.CloudConfig, awsServices Services, provider config. } awsCloud := &Cloud{ - ec2: ec2, - elb: elb, - elbv2: elbv2, - metadata: metadata, - kms: kms, - cfg: &cfg, - region: regionName, + ec2: ec2, + elb: elb, + elbv2: elbv2, + metadata: metadata, + kms: kms, + cfg: &cfg, + region: regionName, + nodeEventualConsistencyGracePeriod: cfg.Global.NodeEventualConsistencyGracePeriod, } awsCloud.instanceCache.cloud = awsCloud awsCloud.zoneCache.cloud = awsCloud @@ -887,10 +889,9 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin instances, err := c.ec2.DescribeInstances(request) if err != nil { - // if err is InstanceNotFound, return false with no error - if IsAWSErrorInstanceNotFound(err) { - return false, nil - } + // we may receive InvalidInstanceId.NotFound for some time after launch + // due to eventual-consistency. 
return this error so the caller can decide + // how to handle this return false, err } if len(instances) == 0 { diff --git a/pkg/providers/v1/config/config.go b/pkg/providers/v1/config/config.go index ef6e371115..7a3764c0c4 100644 --- a/pkg/providers/v1/config/config.go +++ b/pkg/providers/v1/config/config.go @@ -2,8 +2,10 @@ package config import ( "fmt" - "github.com/aws/aws-sdk-go/aws/request" "strings" + "time" + + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/endpoints" @@ -62,6 +64,11 @@ type CloudConfig struct { // NodeIPFamilies determines which IP addresses are added to node objects and their ordering. NodeIPFamilies []string + + // NodeEventualConsistencyGracePeriod is used to account for propagation delays in the EC2 API. + // An instance may not appear in `ec2:DescribeInstances` output for a period of time after launch. + // The cloud-node-lifecycle-controller must not delete the Node prematurely in this case. + NodeEventualConsistencyGracePeriod time.Duration } // [ServiceOverride "1"] // Service = s3 diff --git a/pkg/providers/v1/instances_v2.go b/pkg/providers/v1/instances_v2.go index f7a08a2d16..284f506511 100644 --- a/pkg/providers/v1/instances_v2.go +++ b/pkg/providers/v1/instances_v2.go @@ -22,7 +22,9 @@ package aws import ( "context" + "fmt" "strconv" + "time" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" @@ -50,7 +52,23 @@ func (c *Cloud) InstanceExists(ctx context.Context, node *v1.Node) (bool, error) return false, err } - return c.InstanceExistsByProviderID(ctx, providerID) + exists, err := c.InstanceExistsByProviderID(ctx, providerID) + if err != nil { + if IsAWSErrorInstanceNotFound(err) { + if time.Since(node.CreationTimestamp.Time) < c.nodeEventualConsistencyGracePeriod { + // recently-launched EC2 instances may not appear in `ec2:DescribeInstances` + // we return an error if we're within the eventual-consistency grace period + // e.g.
to cause the cloud-node-lifecycle-controller to ignore this node + return false, fmt.Errorf("node is within eventual-consistency grace period (%v): %v", c.nodeEventualConsistencyGracePeriod, err) + } + // if the grace period has elapsed, assume the instance was not found because it was terminated a while ago. + // instances remain in the API with "terminated" status for "a short while": + // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/TroubleshootingInstancesShuttingDown.html#terminated-instance-still-displaying + return false, nil + } + + } + return exists, nil } // InstanceShutdown returns true if the instance is shutdown according to the cloud provider.