From 2823d6688e5af86f8918ebd19dd7529c88e0f633 Mon Sep 17 00:00:00 2001 From: Tristan Wilson Date: Thu, 25 Jul 2024 17:41:32 +0200 Subject: [PATCH] Remove some dynamically generated metrics from DAS This removes some infrequently used dynamically generated metrics from the DAS aggregator. The arb/das/rpc/aggregator/store/backend/error/total metrics should be monitored, and the cause of any error can then be found by looking at the logs. --- das/aggregator.go | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/das/aggregator.go b/das/aggregator.go index 9aa558b92c..d944f8d48a 100644 --- a/das/aggregator.go +++ b/das/aggregator.go @@ -193,11 +193,7 @@ func (a *Aggregator) Store(ctx context.Context, message []byte, timeout uint64) cert, err := d.service.Store(storeCtx, message, timeout) if err != nil { incFailureMetric() - if errors.Is(err, context.DeadlineExceeded) { - metrics.GetOrRegisterCounter(metricWithServiceName+"/error/timeout/total", nil).Inc(1) - } else { - metrics.GetOrRegisterCounter(metricWithServiceName+"/error/client/total", nil).Inc(1) - } + log.Warn("DAS Aggregator failed to store batch to backend", "backend", d.metricName, "err", err) responses <- storeResponse{d, nil, err} return } @@ -207,13 +203,13 @@ func (a *Aggregator) Store(ctx context.Context, message []byte, timeout uint64) ) if err != nil { incFailureMetric() - metrics.GetOrRegisterCounter(metricWithServiceName+"/error/bad_response/total", nil).Inc(1) + log.Warn("DAS Aggregator couldn't parse backend's store response signature", "backend", d.metricName, "err", err) responses <- storeResponse{d, nil, err} return } if !verified { incFailureMetric() - metrics.GetOrRegisterCounter(metricWithServiceName+"/error/bad_response/total", nil).Inc(1) + log.Warn("DAS Aggregator failed to verify backend's store response signature", "backend", d.metricName, "err", err) responses <- storeResponse{d, nil, errors.New("signature verification failed")} return } @@ -222,13 
+218,13 @@ func (a *Aggregator) Store(ctx context.Context, message []byte, timeout uint64) if cert.DataHash != expectedHash { incFailureMetric() - metrics.GetOrRegisterCounter(metricWithServiceName+"/error/bad_response/total", nil).Inc(1) + log.Warn("DAS Aggregator got a store response with a data hash not matching the expected hash", "backend", d.metricName, "dataHash", cert.DataHash, "expectedHash", expectedHash, "err", err) responses <- storeResponse{d, nil, errors.New("hash verification failed")} return } if cert.Timeout != timeout { incFailureMetric() - metrics.GetOrRegisterCounter(metricWithServiceName+"/error/bad_response/total", nil).Inc(1) + log.Warn("DAS Aggregator got a store response with any expiry time not matching the expected expiry time", "backend", d.metricName, "dataHash", cert.DataHash, "expectedHash", expectedHash, "err", err) responses <- storeResponse{d, nil, fmt.Errorf("timeout was %d, expected %d", cert.Timeout, timeout)} return }