From b0b8d669109e3336f0f3d411a52d705b4a5dd038 Mon Sep 17 00:00:00 2001
From: lklimek <842586+lklimek@users.noreply.github.com>
Date: Wed, 6 Mar 2024 10:32:23 +0100
Subject: [PATCH] feat(proxy): add request result to prometheus stats (#743)

* feat(proxy): add request result to prometheus stats

* chore: add missing metric name

* test(e2e): fix flaky TestApp_TxTooBig

* fix(kvstore): json encoding consumes too much memory

* fix(e2e): dashcore fails

* chore: initial app state changed to string

* fix(kvstore): escape keys/values properly in json

* fix: genesis test

* test(statesync): limit backfill test time to 15s

* chore(kvstore): snapshot chunks streaming reader

* chore: self review minor changes

* test(statesync): revert timeout change in backfill
---
 go.mod                        |   1 -
 internal/proxy/client.go      |  69 +++++++++++-----------
 internal/proxy/client_test.go | 105 +++++++++++++++++++++++++++++++++-
 internal/proxy/metrics.gen.go |   2 +-
 internal/proxy/metrics.go     |   2 +-
 5 files changed, 142 insertions(+), 37 deletions(-)

diff --git a/go.mod b/go.mod
index 2c7a5d95ed..c521ea9e80 100644
--- a/go.mod
+++ b/go.mod
@@ -53,7 +53,6 @@ require (
 	github.com/go-pkgz/jrpc v0.2.0
 	github.com/google/go-cmp v0.6.0
 	github.com/vektra/mockery/v2 v2.41.0
-	gotest.tools v2.2.0+incompatible
 )
 
 require (
diff --git a/internal/proxy/client.go b/internal/proxy/client.go
index 4ed8395759..5d6d06f8c6 100644
--- a/internal/proxy/client.go
+++ b/internal/proxy/client.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"io"
 	"os"
+	"strconv"
 	"syscall"
 	"time"
 
@@ -136,78 +137,78 @@ func kill() error {
 	return p.Signal(syscall.SIGABRT)
 }
 
-func (app *proxyClient) InitChain(ctx context.Context, req *types.RequestInitChain) (*types.ResponseInitChain, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "init_chain", "type", "sync"))()
+func (app *proxyClient) InitChain(ctx context.Context, req *types.RequestInitChain) (r *types.ResponseInitChain, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "init_chain", "type", "sync", "success"))(&err)
 	return app.client.InitChain(ctx, req)
 }
 
-func (app *proxyClient) PrepareProposal(ctx context.Context, req *types.RequestPrepareProposal) (*types.ResponsePrepareProposal, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "prepare_proposal", "type", "sync"))()
+func (app *proxyClient) PrepareProposal(ctx context.Context, req *types.RequestPrepareProposal) (r *types.ResponsePrepareProposal, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "prepare_proposal", "type", "sync"))(&err)
 	return app.client.PrepareProposal(ctx, req)
 }
 
-func (app *proxyClient) ProcessProposal(ctx context.Context, req *types.RequestProcessProposal) (*types.ResponseProcessProposal, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "process_proposal", "type", "sync"))()
+func (app *proxyClient) ProcessProposal(ctx context.Context, req *types.RequestProcessProposal) (r *types.ResponseProcessProposal, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "process_proposal", "type", "sync"))(&err)
 	return app.client.ProcessProposal(ctx, req)
 }
 
-func (app *proxyClient) ExtendVote(ctx context.Context, req *types.RequestExtendVote) (*types.ResponseExtendVote, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "extend_vote", "type", "sync"))()
+func (app *proxyClient) ExtendVote(ctx context.Context, req *types.RequestExtendVote) (r *types.ResponseExtendVote, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "extend_vote", "type", "sync"))(&err)
 	return app.client.ExtendVote(ctx, req)
 }
 
-func (app *proxyClient) VerifyVoteExtension(ctx context.Context, req *types.RequestVerifyVoteExtension) (*types.ResponseVerifyVoteExtension, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "verify_vote_extension", "type", "sync"))()
+func (app *proxyClient) VerifyVoteExtension(ctx context.Context, req *types.RequestVerifyVoteExtension) (r *types.ResponseVerifyVoteExtension, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "verify_vote_extension", "type", "sync"))(&err)
 	return app.client.VerifyVoteExtension(ctx, req)
 }
 
-func (app *proxyClient) FinalizeBlock(ctx context.Context, req *types.RequestFinalizeBlock) (*types.ResponseFinalizeBlock, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "finalize_block", "type", "sync"))()
+func (app *proxyClient) FinalizeBlock(ctx context.Context, req *types.RequestFinalizeBlock) (r *types.ResponseFinalizeBlock, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "finalize_block", "type", "sync"))(&err)
 	return app.client.FinalizeBlock(ctx, req)
 }
 
-func (app *proxyClient) Flush(ctx context.Context) error {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "flush", "type", "sync"))()
+func (app *proxyClient) Flush(ctx context.Context) (err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "flush", "type", "sync"))(&err)
 	return app.client.Flush(ctx)
 }
 
-func (app *proxyClient) CheckTx(ctx context.Context, req *types.RequestCheckTx) (*types.ResponseCheckTx, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "check_tx", "type", "sync"))()
+func (app *proxyClient) CheckTx(ctx context.Context, req *types.RequestCheckTx) (r *types.ResponseCheckTx, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "check_tx", "type", "sync"))(&err)
 	return app.client.CheckTx(ctx, req)
 }
 
-func (app *proxyClient) Echo(ctx context.Context, msg string) (*types.ResponseEcho, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "echo", "type", "sync"))()
+func (app *proxyClient) Echo(ctx context.Context, msg string) (r *types.ResponseEcho, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "echo", "type", "sync"))(&err)
 	return app.client.Echo(ctx, msg)
 }
 
-func (app *proxyClient) Info(ctx context.Context, req *types.RequestInfo) (*types.ResponseInfo, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "info", "type", "sync"))()
+func (app *proxyClient) Info(ctx context.Context, req *types.RequestInfo) (r *types.ResponseInfo, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "info", "type", "sync"))(&err)
 	return app.client.Info(ctx, req)
 }
 
-func (app *proxyClient) Query(ctx context.Context, req *types.RequestQuery) (*types.ResponseQuery, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "query", "type", "sync"))()
+func (app *proxyClient) Query(ctx context.Context, req *types.RequestQuery) (r *types.ResponseQuery, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "query", "type", "sync"))(&err)
 	return app.client.Query(ctx, req)
 }
 
-func (app *proxyClient) ListSnapshots(ctx context.Context, req *types.RequestListSnapshots) (*types.ResponseListSnapshots, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "list_snapshots", "type", "sync"))()
+func (app *proxyClient) ListSnapshots(ctx context.Context, req *types.RequestListSnapshots) (r *types.ResponseListSnapshots, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "list_snapshots", "type", "sync"))(&err)
 	return app.client.ListSnapshots(ctx, req)
 }
 
-func (app *proxyClient) OfferSnapshot(ctx context.Context, req *types.RequestOfferSnapshot) (*types.ResponseOfferSnapshot, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "offer_snapshot", "type", "sync"))()
+func (app *proxyClient) OfferSnapshot(ctx context.Context, req *types.RequestOfferSnapshot) (r *types.ResponseOfferSnapshot, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "offer_snapshot", "type", "sync"))(&err)
 	return app.client.OfferSnapshot(ctx, req)
 }
 
-func (app *proxyClient) LoadSnapshotChunk(ctx context.Context, req *types.RequestLoadSnapshotChunk) (*types.ResponseLoadSnapshotChunk, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "load_snapshot_chunk", "type", "sync"))()
+func (app *proxyClient) LoadSnapshotChunk(ctx context.Context, req *types.RequestLoadSnapshotChunk) (r *types.ResponseLoadSnapshotChunk, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "load_snapshot_chunk", "type", "sync"))(&err)
 	return app.client.LoadSnapshotChunk(ctx, req)
 }
 
-func (app *proxyClient) ApplySnapshotChunk(ctx context.Context, req *types.RequestApplySnapshotChunk) (*types.ResponseApplySnapshotChunk, error) {
-	defer addTimeSample(app.metrics.MethodTiming.With("method", "apply_snapshot_chunk", "type", "sync"))()
+func (app *proxyClient) ApplySnapshotChunk(ctx context.Context, req *types.RequestApplySnapshotChunk) (r *types.ResponseApplySnapshotChunk, err error) {
+	defer addTimeSample(app.metrics.MethodTiming.With("method", "apply_snapshot_chunk", "type", "sync"))(&err)
 	return app.client.ApplySnapshotChunk(ctx, req)
 }
 
@@ -215,7 +216,11 @@ func (app *proxyClient) ApplySnapshotChunk(ctx context.Context, req *types.Reque
 // The observation added to m is the number of seconds ellapsed since addTimeSample
 // was initially called. addTimeSample is meant to be called in a defer to calculate
 // the amount of time a function takes to complete.
-func addTimeSample(m metrics.Histogram) func() {
+func addTimeSample(m metrics.Histogram) func(*error) {
 	start := time.Now()
-	return func() { m.Observe(time.Since(start).Seconds()) }
+
+	// we take err address to simplify usage in defer()
+	return func(err *error) {
+		m.With("success", strconv.FormatBool(*err == nil)).Observe(time.Since(start).Seconds())
+	}
 }
diff --git a/internal/proxy/client_test.go b/internal/proxy/client_test.go
index 700ee47a65..06ab338916 100644
--- a/internal/proxy/client_test.go
+++ b/internal/proxy/client_test.go
@@ -7,20 +7,22 @@ import (
 	"os"
 	"os/signal"
 	"strings"
+	"sync"
 	"syscall"
 	"testing"
 	"time"
 
+	"github.com/go-kit/kit/metrics"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 
-	"gotest.tools/assert"
-
 	abciclient "github.com/dashpay/tenderdash/abci/client"
 	abcimocks "github.com/dashpay/tenderdash/abci/client/mocks"
 	"github.com/dashpay/tenderdash/abci/example/kvstore"
 	"github.com/dashpay/tenderdash/abci/server"
 	"github.com/dashpay/tenderdash/abci/types"
+	"github.com/dashpay/tenderdash/abci/types/mocks"
 	"github.com/dashpay/tenderdash/libs/log"
 	tmrand "github.com/dashpay/tenderdash/libs/rand"
 )
@@ -239,3 +241,102 @@ func TestAppConns_Failure(t *testing.T) {
 		t.Fatal("expected process to receive SIGTERM signal")
 	}
 }
+func TestFailureMetrics(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	logger := log.NewTestingLogger(t)
+	mtr := mockMetrics()
+	hst := mtr.MethodTiming.(*mockMetric)
+
+	app := mocks.NewApplication(t)
+	app.On("CheckTx", mock.Anything, mock.Anything).Return(&types.ResponseCheckTx{}, errors.New("some error")).Once()
+	app.On("CheckTx", mock.Anything, mock.Anything).Return(&types.ResponseCheckTx{}, nil).Times(2)
+	app.On("Info", mock.Anything, mock.Anything).Return(&types.ResponseInfo{}, nil)
+
+	// promtest.ToFloat64(hst)
+	cli := abciclient.NewLocalClient(logger, app)
+
+	proxy := New(cli, log.NewNopLogger(), mtr)
+
+	var err error
+	for i := 0; i < 5; i++ {
+		_, err = proxy.Info(ctx, &types.RequestInfo{})
+		assert.NoError(t, err)
+	}
+
+	for i := 0; i < 3; i++ {
+		_, err = proxy.CheckTx(ctx, &types.RequestCheckTx{})
+		if i == 0 {
+			assert.Error(t, err)
+		} else {
+			assert.NoError(t, err)
+		}
+	}
+
+	cancel() // this should stop all clients
+	proxy.Wait()
+	assert.Equal(t, 5, hst.count["method=info,type=sync,success=true"])
+	assert.Equal(t, 1, hst.count["method=check_tx,type=sync,success=false"])
+	assert.Equal(t, 2, hst.count["method=check_tx,type=sync,success=true"])
+}
+
+func mockMetrics() *Metrics {
+	return &Metrics{
+		MethodTiming: &mockMetric{
+			labels: []string{},
+			count:  make(map[string]int),
+			mtx:    &sync.Mutex{},
+		},
+	}
+}
+
+type mockMetric struct {
+	labels []string
+	/// count maps concatenated labels to the count of observations.
+	count map[string]int
+	mtx   *sync.Mutex
+}
+
+var _ = metrics.Histogram(&mockMetric{})
+
+func (m *mockMetric) With(labelValues ...string) metrics.Histogram {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	return &mockMetric{
+		labels: append(m.labels, labelValues...),
+		count:  m.count,
+		mtx:    m.mtx, // pointer, as we use the same m.count
+	}
+}
+
+func (m *mockMetric) Observe(_value float64) {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	labels := ""
+	for i, label := range m.labels {
+		labels += label
+		if i < len(m.labels)-1 {
+			if i%2 == 0 {
+				labels += "="
+			} else {
+				labels += ","
+			}
+		}
+	}
+
+	m.count[labels]++
+}
+
+func (m *mockMetric) String() (s string) {
+	m.mtx.Lock()
+	defer m.mtx.Unlock()
+
+	for labels, total := range m.count {
+		s += fmt.Sprintf("%s: %d\n", labels, total)
+	}
+
+	return s
+}
diff --git a/internal/proxy/metrics.gen.go b/internal/proxy/metrics.gen.go
index ea483f83db..d786740c63 100644
--- a/internal/proxy/metrics.gen.go
+++ b/internal/proxy/metrics.gen.go
@@ -21,7 +21,7 @@ func PrometheusMetrics(namespace string, labelsAndValues ...string) *Metrics {
 			Help:      "Timing for each ABCI method.",
 
 			Buckets: []float64{.0001, .0004, .002, .009, .02, .1, .65, 2, 6, 25},
-		}, append(labels, "method", "type")).With(labelsAndValues...),
+		}, append(labels, "method", "type", "success")).With(labelsAndValues...),
 	}
 }
 
diff --git a/internal/proxy/metrics.go b/internal/proxy/metrics.go
index b95687a03b..043b191eae 100644
--- a/internal/proxy/metrics.go
+++ b/internal/proxy/metrics.go
@@ -15,5 +15,5 @@ const (
 // Metrics contains the prometheus metrics exposed by the proxy package.
 type Metrics struct {
 	// Timing for each ABCI method.
-	MethodTiming metrics.Histogram `metrics_bucketsizes:".0001,.0004,.002,.009,.02,.1,.65,2,6,25" metrics_labels:"method, type"`
+	MethodTiming metrics.Histogram `metrics_bucketsizes:".0001,.0004,.002,.009,.02,.1,.65,2,6,25" metrics_labels:"method, type, success"`
 }