From 40616077ee5233c7c62553128c5f04b049acf6b9 Mon Sep 17 00:00:00 2001 From: Ricardo Maraschini Date: Wed, 13 Nov 2024 14:32:26 +0100 Subject: [PATCH] bug: account for etcd leader changes error when validating whether a session is valid or not, we were failing if the etcd leader changed. this case happens quite often when joining the second node to an embedded cluster installation. this commit adds a retry mechanism for this specific scenario. we retry 8 times, the backoff time should span up to ~5 seconds. --- pkg/handlers/middleware.go | 1 + pkg/handlers/session.go | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/handlers/middleware.go b/pkg/handlers/middleware.go index ea328f8a89..0525e1febb 100644 --- a/pkg/handlers/middleware.go +++ b/pkg/handlers/middleware.go @@ -99,6 +99,7 @@ func RequireValidSessionQuietMiddleware(kotsStore store.Store) mux.MiddlewareFun return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { sess, err := requireValidSession(kotsStore, w, r) if err != nil { + logger.Errorf("failed validating session: %s", err) return } diff --git a/pkg/handlers/session.go b/pkg/handlers/session.go index 2179f996ed..a9413244d3 100644 --- a/pkg/handlers/session.go +++ b/pkg/handlers/session.go @@ -17,6 +17,8 @@ import ( "github.com/replicatedhq/kots/pkg/util" kuberneteserrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" ) type authorization struct { @@ -92,12 +94,28 @@ func requireValidSession(kotsStore store.Store, w http.ResponseWriter, r *http.R return nil, err } - passwordUpdatedAt, err := kotsStore.GetPasswordUpdatedAt() - if err != nil { + // XXX: we have noticed that when joining a second controller to an + // embedded cluster installation the etcd leader usually changes. 
+ // GetPasswordUpdatedAt() function reads a secret from the cluster + // and if it attempts to do so while the leader is changing we receive + // an error back. here we retry for this specific etcd error. + var passwordUpdatedAt *time.Time + if err = retry.OnError( + // this amounts to a maximum of ~5 seconds. + wait.Backoff{Steps: 8, Duration: 40 * time.Millisecond, Factor: 2}, + func(err error) bool { + return strings.Contains(err.Error(), "leader changed") + }, + func() (err error) { + passwordUpdatedAt, err = kotsStore.GetPasswordUpdatedAt() + return + }, + ); err != nil { response := types.ErrorResponse{Error: util.StrPointer("failed to validate session with current password")} JSON(w, http.StatusUnauthorized, response) return nil, err } + if passwordUpdatedAt != nil && passwordUpdatedAt.After(sess.IssuedAt) { if err := kotsStore.DeleteSession(sess.ID); err != nil { logger.Error(errors.Wrapf(err, "password was updated after session created. failed to delete invalid session %s", sess.ID))