diff --git a/.changes/unreleased/Fixed-20241105-192850.yaml b/.changes/unreleased/Fixed-20241105-192850.yaml
new file mode 100644
index 0000000..088c6bf
--- /dev/null
+++ b/.changes/unreleased/Fixed-20241105-192850.yaml
@@ -0,0 +1,3 @@
+kind: Fixed
+body: ydbops now properly continues the restart loop even if listing nodes during maintenance check fails with "retry exceeded" error
+time: 2024-11-05T19:28:50.308019908+01:00
diff --git a/pkg/rolling/rolling.go b/pkg/rolling/rolling.go
index 15db1b1..a6a5463 100644
--- a/pkg/rolling/rolling.go
+++ b/pkg/rolling/rolling.go
@@ -2,6 +2,7 @@ package rolling
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"math"
 	"strings"
@@ -220,6 +221,7 @@ func (r *Rolling) cmsWaitingLoop(task cms.MaintenanceTask, totalNodes int) error
 		task, err = r.cms.RefreshMaintenanceTask(taskID)
 		if err != nil {
 			r.logger.Warnf("Failed to refresh maintenance task: %+v", err)
+			continue
 		}
 
 		// NOTE: compatibility check will not fire if rolling restart just
@@ -235,7 +237,10 @@ func (r *Rolling) cmsWaitingLoop(task cms.MaintenanceTask, totalNodes int) error
 		// issues once more. We better exit quickly.
 		if !r.opts.SuppressCompatibilityCheck {
 			incompatible := r.tryDetectCompatibilityIssues()
-			if incompatible != nil {
+
+			// if error is retryExceeded, just keep trying - maybe you have been asking CMS
+			// from a node that has just been restarted, and it's okay.
+			if incompatible != nil && !errors.Is(incompatible, &utils.RetryExceededError{}) {
 				return incompatible
 			}
 		}
diff --git a/pkg/utils/retries.go b/pkg/utils/retries.go
index fd9b6ea..b7fe14e 100644
--- a/pkg/utils/retries.go
+++ b/pkg/utils/retries.go
@@ -11,6 +11,35 @@ import (
 	"google.golang.org/grpc/status"
 )
 
+// RetryExceededError is returned by WrapWithRetries once the number of
+// retries is exceeded. It wraps the error of the last attempt.
+type RetryExceededError struct {
+	msg string
+	err error
+}
+
+// Error returns the message in the same format the previous fmt.Errorf-based
+// error used: "<msg>. Last error: <err>". A nil wrapped error is tolerated so
+// that formatting the error never panics.
+func (e *RetryExceededError) Error() string {
+	if e.err == nil {
+		return e.msg
+	}
+	return e.msg + ". Last error: " + e.err.Error()
+}
+
+// Unwrap exposes the last attempt's error to errors.Is and errors.As.
+func (e *RetryExceededError) Unwrap() error {
+	return e.err
+}
+
+// Is reports whether targetErr is a *RetryExceededError, ignoring its fields,
+// so errors.Is(err, &RetryExceededError{}) matches any retry-exceeded error.
+func (e *RetryExceededError) Is(targetErr error) bool {
+	_, ok := targetErr.(*RetryExceededError)
+	return ok
+}
+
 func backoffTimeAfter(attempt int) time.Duration {
 	return time.Second * time.Duration(int(math.Pow(2, float64(attempt))))
 }
@@ -45,5 +74,8 @@ func WrapWithRetries(
 		}
 	}
 
-	return nil, fmt.Errorf("number of retries exceeded: %v. Last error: %w", maxAttempts, lastError)
+	return nil, &RetryExceededError{
+		msg: fmt.Sprintf("number of retries exceeded: %v", maxAttempts),
+		err: lastError,
+	}
 }