From 914dcf4bbf040389519ed9bf8bff7e5cc2009be2 Mon Sep 17 00:00:00 2001 From: Ricardo Maraschini Date: Wed, 13 Nov 2024 14:32:26 +0100 Subject: [PATCH] bug: account for etcd leader changes error when validating whether a session is valid or not we were failing if the etcd leader changed. this case happens quite often when joining the second node to an embedded cluster installation. this commit adds a retry mechanism for this specific scenario. we make up to 8 attempts, and the backoff time between them should span up to ~5 seconds in total. --- pkg/handlers/middleware.go | 1 + pkg/handlers/session.go | 26 ++++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/pkg/handlers/middleware.go b/pkg/handlers/middleware.go index ea328f8a89..0525e1febb 100644 --- a/pkg/handlers/middleware.go +++ b/pkg/handlers/middleware.go @@ -99,6 +99,7 @@ func RequireValidSessionQuietMiddleware(kotsStore store.Store) mux.MiddlewareFun return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { sess, err := requireValidSession(kotsStore, w, r) if err != nil { + logger.Errorf("failed validating session: %s", err) return } diff --git a/pkg/handlers/session.go b/pkg/handlers/session.go index 2179f996ed..e2e199359a 100644 --- a/pkg/handlers/session.go +++ b/pkg/handlers/session.go @@ -3,6 +3,7 @@ package handlers import ( "context" "encoding/base64" + "fmt" "net/http" "strings" "time" @@ -17,6 +18,8 @@ import ( "github.com/replicatedhq/kots/pkg/util" kuberneteserrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" ) type authorization struct { @@ -92,12 +95,31 @@ func requireValidSession(kotsStore store.Store, w http.ResponseWriter, r *http.R return nil, err } - passwordUpdatedAt, err := kotsStore.GetPasswordUpdatedAt() - if err != nil { + // XXX: we have noticed that when joining a second controller to an + // embedded cluster installation the etcd leader usually changes.
+ // GetPasswordUpdatedAt() function reads a secret from the cluster + // and if it attempts to do so while the leader is changing we receive + // an error back. here we retry for this specific etcd error. + var passwordUpdatedAt *time.Time + var i int + if err = retry.OnError( + // this amounts to a maximum of ~5 seconds. + wait.Backoff{Steps: 8, Duration: 40 * time.Millisecond, Factor: 2}, + func(err error) bool { + return strings.Contains(err.Error(), "leader changed") + }, + func() (err error) { + i++ + passwordUpdatedAt, err = kotsStore.GetPasswordUpdatedAt() + fmt.Println("retry nr ", i, ": ", err) + return + }, + ); err != nil { response := types.ErrorResponse{Error: util.StrPointer("failed to validate session with current password")} JSON(w, http.StatusUnauthorized, response) return nil, err } + if passwordUpdatedAt != nil && passwordUpdatedAt.After(sess.IssuedAt) { if err := kotsStore.DeleteSession(sess.ID); err != nil { logger.Error(errors.Wrapf(err, "password was updated after session created. failed to delete invalid session %s", sess.ID))