Skip to content

Commit

Permalink
Drop Node events when EC2 instance does not exist and node is not new
Browse files Browse the repository at this point in the history
  • Loading branch information
cartermckinnon committed Nov 27, 2023
1 parent 08ac6f0 commit 1ccb35f
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 3 deletions.
19 changes: 19 additions & 0 deletions pkg/controllers/tagging/tagging_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ const (

// The label for depicting total number of errors a work item encounter and fail
errorsAfterRetriesExhaustedWorkItemErrorMetric = "errors_after_retries_exhausted"

// The period of time after Node creation to retry tagging due to eventual consistency of the CreateTags API.
newNodeEventualConsistencyGracePeriod = time.Minute * 5
)

// Controller is the controller implementation for tagging cluster resources.
Expand Down Expand Up @@ -292,6 +295,18 @@ func (tc *Controller) tagEc2Instance(node *v1.Node) error {
err := tc.cloud.TagResource(string(instanceID), tc.tags)

if err != nil {
if awsv1.IsAWSErrorInstanceNotFound(err) {
// This can happen for two reasons.
// 1. The CreateTags API is eventually consistent. In rare cases, a newly-created instance may not be taggable for a short period.
// We will re-queue the event and retry.
if isNodeWithinEventualConsistencyGracePeriod(node) {
return fmt.Errorf("EC2 instance %s for node %s does not exist, but node is within eventual consistency grace period", instanceID, node.GetName())
}
// 2. The event in our workQueue is stale, and the instance no longer exists.
// Tagging will never succeed, and the event should not be re-queued.
klog.Infof("Skip tagging since EC2 instance %s for node %s does not exist", instanceID, node.GetName())
return nil
}
klog.Errorf("Error in tagging EC2 instance %s for node %s, error: %v", instanceID, node.GetName(), err)
return err
}
Expand Down Expand Up @@ -380,3 +395,7 @@ func (tc *Controller) getChecksumOfTags() string {
sort.Strings(tags)
return fmt.Sprintf("%x", md5.Sum([]byte(strings.Join(tags, ","))))
}

func isNodeWithinEventualConsistencyGracePeriod(node *v1.Node) bool {
return time.Since(node.CreationTimestamp.Time) < newNodeEventualConsistencyGracePeriod
}
5 changes: 3 additions & 2 deletions pkg/providers/v1/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -1707,7 +1707,7 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin
instances, err := c.ec2.DescribeInstances(request)
if err != nil {
// if err is InstanceNotFound, return false with no error
if isAWSErrorInstanceNotFound(err) {
if IsAWSErrorInstanceNotFound(err) {
return false, nil
}
return false, err
Expand Down Expand Up @@ -1946,7 +1946,8 @@ func (c *Cloud) GetZoneByNodeName(ctx context.Context, nodeName types.NodeName)

}

func isAWSErrorInstanceNotFound(err error) bool {
// IsAWSErrorInstanceNotFound returns true if the specified error is an awserr.Error with the code `InvalidInstanceId.NotFound`.
func IsAWSErrorInstanceNotFound(err error) bool {
if err == nil {
return false
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/v1/tags.go
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ func (c *Cloud) UntagResource(resourceID string, tags map[string]string) error {
if err != nil {
// An instance not found should not fail the untagging workflow as it
// would for tagging, since the target state is already reached.
if isAWSErrorInstanceNotFound(err) {
if IsAWSErrorInstanceNotFound(err) {
klog.Infof("Couldn't find resource when trying to untag it hence skipping it, %v", err)
return nil
}
Expand Down

0 comments on commit 1ccb35f

Please sign in to comment.