Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Assume instance exists within eventual-consistency grace period #1024

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 28 additions & 13 deletions pkg/providers/v1/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ type Cloud struct {

nodeInformer informercorev1.NodeInformer
// Extract the function out to make it easier to test
nodeInformerHasSynced cache.InformerSynced
nodeInformerHasSynced cache.InformerSynced
nodeEventualConsistencyGracePeriod time.Duration

eventBroadcaster record.EventBroadcaster
eventRecorder record.EventRecorder
Expand Down Expand Up @@ -610,13 +611,14 @@ func newAWSCloud2(cfg config.CloudConfig, awsServices Services, provider config.
}

awsCloud := &Cloud{
ec2: ec2,
elb: elb,
elbv2: elbv2,
metadata: metadata,
kms: kms,
cfg: &cfg,
region: regionName,
ec2: ec2,
elb: elb,
elbv2: elbv2,
metadata: metadata,
kms: kms,
cfg: &cfg,
region: regionName,
nodeEventualConsistencyGracePeriod: cfg.Global.NodeEventualConsistencyGracePeriod,
}
awsCloud.instanceCache.cloud = awsCloud
awsCloud.zoneCache.cloud = awsCloud
Expand Down Expand Up @@ -879,9 +881,26 @@ func (c *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string
return addresses, nil
}

// InstanceExistsByProviderID returns true if the instance with the given provider id still exists.
// InstanceExistsByProviderID implements Instances.InstanceExistsByProviderID (v1)
// returns true if the instance with the given provider id still exists.
// If false is returned with no error, the instance will be immediately deleted by the cloud controller manager.
func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error) {
exists, err := c.instanceExistsByProviderID(ctx, providerID)
if err != nil {
if IsAWSErrorInstanceNotFound(err) {
// we have to assume this means the instance was terminated a while ago
return false, nil
}
return false, err
}
return exists, nil
}

// instanceExistsByProviderID returns true if the instance with the given provider id still exists.
// If error InvalidInstanceId.NotFound is returned, the caller should decide how to handle this. It could mean:
// a. the instance was launched recently and isn't found in the API due to eventual-consistency
// b. the instance was terminated a while ago, and no longer appears in the API with "terminated" status
func (c *Cloud) instanceExistsByProviderID(_ context.Context, providerID string) (bool, error) {
instanceID, err := KubernetesInstanceID(providerID).MapToAWSInstanceID()
if err != nil {
return false, err
Expand All @@ -897,10 +916,6 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin

instances, err := c.ec2.DescribeInstances(request)
if err != nil {
// if err is InstanceNotFound, return false with no error
if IsAWSErrorInstanceNotFound(err) {
return false, nil
}
return false, err
}
if len(instances) == 0 {
Expand Down
9 changes: 8 additions & 1 deletion pkg/providers/v1/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ package config

import (
"fmt"
"github.com/aws/aws-sdk-go/aws/request"
"strings"
"time"

"github.com/aws/aws-sdk-go/aws/request"

"github.com/aws/aws-sdk-go/aws/endpoints"

Expand Down Expand Up @@ -62,6 +64,11 @@ type CloudConfig struct {

// NodeIPFamilies determines which IP addresses are added to node objects and their ordering.
NodeIPFamilies []string

// NodeEventualConsistencyGracePeriod is used to account for propogation delays in the EC2 API.
// An instance may not appear in `ec2:DescribeInstances` output for a period of time after launch.
// The cloud-node-lifecycle-controller must not delete the Node prematurely in this case.
NodeEventualConsistencyGracePeriod time.Duration
}
// [ServiceOverride "1"]
// Service = s3
Expand Down
17 changes: 16 additions & 1 deletion pkg/providers/v1/instances_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ package aws

import (
"context"
"fmt"
"strconv"
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -51,7 +53,20 @@ func (c *Cloud) InstanceExists(ctx context.Context, node *v1.Node) (bool, error)
return false, err
}

return c.InstanceExistsByProviderID(ctx, providerID)
exists, err := c.instanceExistsByProviderID(ctx, providerID)
if err != nil {
if IsAWSErrorInstanceNotFound(err) {
if time.Since(node.CreationTimestamp.Time) < c.nodeEventualConsistencyGracePeriod {
// recently-launched EC2 instances may not appear in `ec2:DescribeInstances`
// we return an error if we're within the eventual-consistency grace period
// e.g. to cause the cloud-node-lifecycle-controller to ignore this node
return false, fmt.Errorf("node is within eventual-consistency grace period (%v): %v", c.nodeEventualConsistencyGracePeriod, err)
}
return false, nil
}

}
return exists, nil
}

// InstanceShutdown returns true if the instance is shutdown according to the cloud provider.
Expand Down
Loading