Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework retry and error classification #349

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions proxy/fails/basic_classifiers.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ import (
"strings"
)

var IdempotentRequestEOFError = errors.New("EOF (via idempotent request)")

var IncompleteRequestError = errors.New("incomplete request")

var AttemptedTLSWithNonTLSBackend = ClassifierFunc(func(err error) bool {
return errors.As(err, &tls.RecordHeaderError{})
})
Expand Down Expand Up @@ -78,11 +74,3 @@ var UntrustedCert = ClassifierFunc(func(err error) bool {
return false
}
})

var IdempotentRequestEOF = ClassifierFunc(func(err error) bool {
return errors.Is(err, IdempotentRequestEOFError)
})

var IncompleteRequest = ClassifierFunc(func(err error) bool {
return errors.Is(err, IncompleteRequestError)
})
39 changes: 34 additions & 5 deletions proxy/fails/classifier_group.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ type ClassifierGroup []Classifier
//
// Otherwise, there’s risk of a mutating non-idempotent request (e.g. send
// payment) being silently retried without the client knowing.
//
// IMPORTANT: to truly determine whether a request is retry-able the function
// round_tripper.isRetrieable must be used. It includes additional checks that
// allow requests to be retried more often than it is allowed by the
// classifiers.
var RetriableClassifiers = ClassifierGroup{
Dial,
AttemptedTLSWithNonTLSBackend,
Expand All @@ -19,18 +24,42 @@ var RetriableClassifiers = ClassifierGroup{
RemoteHandshakeTimeout,
UntrustedCert,
ExpiredOrNotYetValidCertFailure,
IdempotentRequestEOF,
IncompleteRequest,
}

// FailableClassifiers match all errors that should result in the endpoint
// being marked as failed and taken out of the available pool. These endpoints
// will be cleaned up automatically by the route-pruning in case they have
// become stale, therefore there is no need to prune those endpoints
// proactively.
var FailableClassifiers = ClassifierGroup{
RetriableClassifiers,
Dial,
AttemptedTLSWithNonTLSBackend,
HostnameMismatch,
RemoteFailedCertCheck,
RemoteHandshakeFailure,
RemoteHandshakeTimeout,
UntrustedCert,
ExpiredOrNotYetValidCertFailure,
ConnectionResetOnRead,
}

var PrunableClassifiers = RetriableClassifiers
// PrunableClassifiers match all errors that should result in the endpoint
// being pruned. This applies only if the connection to the backend is using
// TLS since the route-integrity prevents routes from being pruned
// automatically if they are configured with TLS.
var PrunableClassifiers = ClassifierGroup{
Dial,
AttemptedTLSWithNonTLSBackend,
HostnameMismatch,
RemoteFailedCertCheck,
RemoteHandshakeFailure,
RemoteHandshakeTimeout,
UntrustedCert,
ExpiredOrNotYetValidCertFailure,
}

// Classify returns true on errors that are retryable
// Classify returns true on errors that match the at least one Classifier from
// the ClassifierGroup it is called on.
func (cg ClassifierGroup) Classify(err error) bool {
for _, classifier := range cg {
if classifier.Classify(err) {
Expand Down
16 changes: 1 addition & 15 deletions proxy/fails/classifier_group_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"crypto/tls"
"crypto/x509"
"errors"
"fmt"
"net"

"code.cloudfoundry.org/gorouter/proxy/fails"
Expand Down Expand Up @@ -33,30 +32,19 @@ var _ = Describe("ClassifierGroup", func() {
rc := fails.RetriableClassifiers

Expect(rc.Classify(&net.OpError{Op: "dial"})).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, &net.OpError{Op: "dial"}))).To(BeTrue())
Expect(rc.Classify(&net.OpError{Op: "remote error", Err: errors.New("tls: bad certificate")})).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, &net.OpError{Op: "remote error", Err: errors.New("tls: bad certificate")}))).To(BeTrue())
Expect(rc.Classify(&net.OpError{Op: "remote error", Err: errors.New("tls: handshake failure")})).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, &net.OpError{Op: "remote error", Err: errors.New("tls: handshake failure")}))).To(BeTrue())
Expect(rc.Classify(errors.New("net/http: TLS handshake timeout"))).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, errors.New("net/http: TLS handshake timeout")))).To(BeTrue())
Expect(rc.Classify(tls.RecordHeaderError{})).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, tls.RecordHeaderError{}))).To(BeTrue())
Expect(rc.Classify(x509.HostnameError{})).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, x509.HostnameError{}))).To(BeTrue())
Expect(rc.Classify(x509.UnknownAuthorityError{})).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, x509.UnknownAuthorityError{}))).To(BeTrue())
Expect(rc.Classify(x509.CertificateInvalidError{Reason: x509.Expired})).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, x509.CertificateInvalidError{Reason: x509.Expired}))).To(BeTrue())
Expect(rc.Classify(errors.New("i'm a potato"))).To(BeFalse())
Expect(rc.Classify(fails.IdempotentRequestEOFError)).To(BeTrue())
Expect(rc.Classify(fails.IncompleteRequestError)).To(BeTrue())
Expect(rc.Classify(fmt.Errorf("%w (%w)", fails.IncompleteRequestError, x509.HostnameError{}))).To(BeTrue())
})
})

Describe("prunable", func() {
It("matches hostname mismatch", func() {
It("matches prunable errors", func() {
pc := fails.PrunableClassifiers

Expect(pc.Classify(&net.OpError{Op: "dial"})).To(BeTrue())
Expand All @@ -69,8 +57,6 @@ var _ = Describe("ClassifierGroup", func() {
Expect(pc.Classify(x509.UnknownAuthorityError{})).To(BeTrue())
Expect(pc.Classify(x509.CertificateInvalidError{Reason: x509.Expired})).To(BeTrue())
Expect(pc.Classify(errors.New("i'm a potato"))).To(BeFalse())
Expect(pc.Classify(fails.IdempotentRequestEOFError)).To(BeTrue())
Expect(pc.Classify(fails.IncompleteRequestError)).To(BeTrue())
})
})
})
4 changes: 4 additions & 0 deletions proxy/fails/fails.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Package fails provides error classifiers that allow the user to check if a
// certain error fulfills certain requirements. See the individual
// documentation of each ClassifierGroup for details.
package fails
65 changes: 0 additions & 65 deletions proxy/round_tripper/error_handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"crypto/tls"
"crypto/x509"
"errors"
"fmt"
"net"
"net/http/httptest"

Expand Down Expand Up @@ -137,22 +136,6 @@ var _ = Describe("HandleError", func() {
})
})

Context("HostnameMismatch wrapped in IncompleteRequestError", func() {
BeforeEach(func() {
wrappedErr := x509.HostnameError{Host: "the wrong one"}
err = fmt.Errorf("%w (%w)", fails.IncompleteRequestError, wrappedErr)
errorHandler.HandleError(responseWriter, err)
})

It("has a 503 Status Code", func() {
Expect(responseWriter.Status()).To(Equal(503))
})

It("emits a backend_invalid_id metric", func() {
Expect(metricReporter.CaptureBackendInvalidIDCallCount()).To(Equal(1))
})
})

Context("Untrusted Cert", func() {
BeforeEach(func() {
err = x509.UnknownAuthorityError{}
Expand All @@ -168,22 +151,6 @@ var _ = Describe("HandleError", func() {
})
})

Context("Untrusted Cert wrapped in IncompleteRequestError", func() {
BeforeEach(func() {
wrappedErr := x509.UnknownAuthorityError{}
err = fmt.Errorf("%w (%w)", fails.IncompleteRequestError, wrappedErr)
errorHandler.HandleError(responseWriter, err)
})

It("has a 526 Status Code", func() {
Expect(responseWriter.Status()).To(Equal(526))
})

It("emits a backend_invalid_tls_cert metric", func() {
Expect(metricReporter.CaptureBackendInvalidTLSCertCallCount()).To(Equal(1))
})
})

Context("Attempted TLS with non-TLS backend error", func() {
BeforeEach(func() {
err = tls.RecordHeaderError{Msg: "bad handshake"}
Expand All @@ -199,22 +166,6 @@ var _ = Describe("HandleError", func() {
})
})

Context("Attempted TLS with non-TLS backend error wrapped in IncompleteRequestError", func() {
BeforeEach(func() {
wrappedErr := tls.RecordHeaderError{Msg: "bad handshake"}
err = fmt.Errorf("%w (%w)", fails.IncompleteRequestError, wrappedErr)
errorHandler.HandleError(responseWriter, err)
})

It("has a 525 Status Code", func() {
Expect(responseWriter.Status()).To(Equal(525))
})

It("emits a backend_tls_handshake_failed metric", func() {
Expect(metricReporter.CaptureBackendTLSHandshakeFailedCallCount()).To(Equal(1))
})
})

Context("Remote handshake failure", func() {
BeforeEach(func() {
err = &net.OpError{Op: "remote error", Err: errors.New("tls: handshake failure")}
Expand All @@ -230,22 +181,6 @@ var _ = Describe("HandleError", func() {
})
})

Context("Remote handshake failure wrapped in IncompleteRequestError", func() {
BeforeEach(func() {
wrappedErr := &net.OpError{Op: "remote error", Err: errors.New("tls: handshake failure")}
err = fmt.Errorf("%w (%w)", fails.IncompleteRequestError, wrappedErr)
errorHandler.HandleError(responseWriter, err)
})

It("has a 525 Status Code", func() {
Expect(responseWriter.Status()).To(Equal(525))
})

It("emits a backend_tls_handshake_failed metric", func() {
Expect(metricReporter.CaptureBackendTLSHandshakeFailedCallCount()).To(Equal(1))
})
})

Context("Context Cancelled Error", func() {
BeforeEach(func() {
err = context.Canceled
Expand Down
16 changes: 7 additions & 9 deletions proxy/round_tripper/proxy_round_tripper.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package round_tripper
import (
"context"
"errors"
"fmt"
"io"
"io/ioutil"
"net/http"
Expand Down Expand Up @@ -177,7 +176,7 @@ func (rt *roundTripper) RoundTrip(originalRequest *http.Request) (*http.Response
if err != nil {
reqInfo.FailedAttempts++
reqInfo.LastFailedAttemptFinishedAt = time.Now()
retriable, err := rt.isRetriable(request, err, trace)
retriable := rt.isRetriable(request, err, trace)

logger.Error("backend-endpoint-failed",
zap.Error(err),
Expand Down Expand Up @@ -226,7 +225,7 @@ func (rt *roundTripper) RoundTrip(originalRequest *http.Request) (*http.Response
if err != nil {
reqInfo.FailedAttempts++
reqInfo.LastFailedAttemptFinishedAt = time.Now()
retriable, err := rt.isRetriable(request, err, trace)
retriable := rt.isRetriable(request, err, trace)

logger.Error(
"route-service-connection-failed",
Expand Down Expand Up @@ -482,24 +481,23 @@ func isIdempotent(request *http.Request) bool {
return false
}

func (rt *roundTripper) isRetriable(request *http.Request, err error, trace *requestTracer) (bool, error) {
func (rt *roundTripper) isRetriable(request *http.Request, err error, trace *requestTracer) bool {
// if the context has been cancelled we do not perform further retries
if request.Context().Err() != nil {
return false, fmt.Errorf("%w (%w)", request.Context().Err(), err)
return false
}

// io.EOF errors are considered safe to retry for certain requests
// Replace the error here to track this state when classifying later.
if err == io.EOF && isIdempotent(request) {
err = fails.IdempotentRequestEOFError
return true
}
// We can retry for sure if we never obtained a connection
// since there is no way any data was transmitted. If headers could not
// be written in full, the request should also be safe to retry.
if !trace.GotConn() || !trace.WroteHeaders() {
err = fmt.Errorf("%w (%w)", fails.IncompleteRequestError, err)
return true
}

retriable := rt.retriableClassifier.Classify(err)
return retriable, err
return rt.retriableClassifier.Classify(err)
}
Loading