From 0251f3e4681d1c761da8e037bb423f5b661c144e Mon Sep 17 00:00:00 2001 From: Joe Adams Date: Fri, 13 Sep 2024 16:02:32 -0400 Subject: [PATCH] Refactor slm collector - Move metric Desc to vars to aid in unused linter checks - Use new Collector interface Signed-off-by: Joe Adams --- CHANGELOG.md | 8 + collector/slm.go | 493 +++++++++++++++----------------------- collector/slm_response.go | 42 ---- collector/slm_test.go | 4 +- main.go | 7 - 5 files changed, 203 insertions(+), 351 deletions(-) delete mode 100644 collector/slm_response.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 5838486c..5136e1fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## master / unreleased + +BREAKING CHANGES: + +The flag `--es.slm` has been renamed to `--collector.slm`. + +* [CHANGE] Rename --es.slm to --collector.slm #XXX +* ## 1.7.0 / 2023-12-02 BREAKING CHANGES: diff --git a/collector/slm.go b/collector/slm.go index f821c75d..eb3cf809 100644 --- a/collector/slm.go +++ b/collector/slm.go @@ -14,355 +14,248 @@ package collector import ( + "context" "encoding/json" - "fmt" - "io" "net/http" "net/url" - "path" "github.com/go-kit/log" - "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" ) -type policyMetric struct { - Type prometheus.ValueType - Desc *prometheus.Desc - Value func(policyStats PolicyStats) float64 - Labels func(policyStats PolicyStats) []string -} - -type slmMetric struct { - Type prometheus.ValueType - Desc *prometheus.Desc - Value func(slmStats SLMStatsResponse) float64 -} - -type slmStatusMetric struct { - Type prometheus.ValueType - Desc *prometheus.Desc - Value func(slmStatus SLMStatusResponse, operationMode string) float64 - Labels func(operationMode string) []string -} - var ( - defaultPolicyLabels = []string{"policy"} - defaultPolicyLabelValues = func(policyStats PolicyStats) []string { - return []string{policyStats.Policy} - } - statuses = []string{"RUNNING", "STOPPING", "STOPPED"} ) +var ( + slmRetentionRunsTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_runs_total"), + "Total retention runs", + nil, nil, + ) + slmRetentionFailedTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_failed_total"), + "Total failed retention runs", + nil, nil, + ) + slmRetentionTimedOutTotal = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_timed_out_total"), + "Total timed out retention runs", + nil, nil, + ) + slmRetentionDeletionTimeSeconds = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "retention_deletion_time_seconds"), + "Retention run deletion time", + nil, nil, + ) + slmTotalSnapshotsTaken = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_taken_total"), + "Total snapshots taken", + nil, nil, + ) + slmTotalSnapshotsFailed = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_failed_total"), + "Total snapshots failed", + nil, nil, + ) + slmTotalSnapshotsDeleted = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_deleted_total"), + "Total snapshots deleted", + nil, nil, + ) + slmTotalSnapshotsDeleteFailed = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "total_snapshot_deletion_failures_total"), + "Total snapshot deletion failures", + nil, nil, + ) + + slmOperationMode = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "operation_mode"), + "Operating status of SLM", + []string{"operation_mode"}, nil, + ) + + slmSnapshotsTaken = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshots_taken_total"), + "Total snapshots taken", + []string{"policy"}, nil, + ) + slmSnapshotsFailed = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshots_failed_total"), + "Total snapshots failed", + []string{"policy"}, nil, + ) + slmSnapshotsDeleted = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshots_deleted_total"), + "Total snapshots deleted", + []string{"policy"}, nil, + ) + slmSnapshotsDeletionFailure = prometheus.NewDesc( + prometheus.BuildFQName(namespace, "slm_stats", "snapshot_deletion_failures_total"), + "Total snapshot deletion failures", + []string{"policy"}, nil, + ) +) + +func init() { + registerCollector("slm", defaultDisabled, NewSLM) +} + // SLM information struct type SLM struct { logger log.Logger - client *http.Client - url *url.URL - - slmMetrics []*slmMetric - policyMetrics []*policyMetric - slmStatusMetric *slmStatusMetric + hc *http.Client + u *url.URL } // NewSLM defines SLM Prometheus metrics -func NewSLM(logger log.Logger, client *http.Client, url *url.URL) *SLM { +func NewSLM(logger log.Logger, u *url.URL, hc *http.Client) (Collector, error) { return &SLM{ logger: logger, - client: client, - url: url, - slmMetrics: []*slmMetric{ - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "retention_runs_total"), - "Total retention runs", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.RetentionRuns) - }, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "retention_failed_total"), - "Total failed retention runs", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.RetentionFailed) - }, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "retention_timed_out_total"), - "Total timed out retention runs", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.RetentionTimedOut) - }, - }, - { - Type: prometheus.GaugeValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "retention_deletion_time_seconds"), - "Retention run deletion time", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.RetentionDeletionTimeMillis) / 1000 - }, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_taken_total"), - "Total snapshots taken", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.TotalSnapshotsTaken) - }, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_failed_total"), - "Total snapshots failed", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.TotalSnapshotsFailed) - }, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "total_snapshots_deleted_total"), - "Total snapshots deleted", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.TotalSnapshotsDeleted) - }, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "total_snapshot_deletion_failures_total"), - "Total snapshot deletion failures", - nil, nil, - ), - Value: func(slmStats SLMStatsResponse) float64 { - return float64(slmStats.TotalSnapshotDeletionFailures) - }, - }, - }, - policyMetrics: []*policyMetric{ - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "snapshots_taken_total"), - "Total snapshots taken", - defaultPolicyLabels, nil, - ), - Value: func(policyStats PolicyStats) float64 { - return float64(policyStats.SnapshotsTaken) - }, - Labels: defaultPolicyLabelValues, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "snapshots_failed_total"), - "Total snapshots failed", - defaultPolicyLabels, nil, - ), - Value: func(policyStats PolicyStats) float64 { - return float64(policyStats.SnapshotsFailed) - }, - Labels: defaultPolicyLabelValues, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "snapshots_deleted_total"), - "Total snapshots deleted", - defaultPolicyLabels, nil, - ), - Value: func(policyStats PolicyStats) float64 { - return float64(policyStats.SnapshotsDeleted) - }, - Labels: defaultPolicyLabelValues, - }, - { - Type: prometheus.CounterValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "snapshot_deletion_failures_total"), - "Total snapshot deletion failures", - defaultPolicyLabels, nil, - ), - Value: func(policyStats PolicyStats) float64 { - return float64(policyStats.SnapshotDeletionFailures) - }, - Labels: defaultPolicyLabelValues, - }, - }, - slmStatusMetric: &slmStatusMetric{ - Type: prometheus.GaugeValue, - Desc: prometheus.NewDesc( - prometheus.BuildFQName(namespace, "slm_stats", "operation_mode"), - "Operating status of SLM", - []string{"operation_mode"}, nil, - ), - Value: func(slmStatus SLMStatusResponse, operationMode string) float64 { - if slmStatus.OperationMode == operationMode { - return 1 - } - return 0 - }, - }, - } + hc: hc, + u: u, + }, nil } -// Describe adds SLM metrics descriptions -func (s *SLM) Describe(ch chan<- *prometheus.Desc) { - ch <- s.slmStatusMetric.Desc - - for _, metric := range s.slmMetrics { - ch <- metric.Desc - } - - for _, metric := range s.policyMetrics { - ch <- metric.Desc - } - +// SLMStatsResponse is a representation of the SLM stats +type SLMStatsResponse struct { + RetentionRuns int64 `json:"retention_runs"` + RetentionFailed int64 `json:"retention_failed"` + RetentionTimedOut int64 `json:"retention_timed_out"` + RetentionDeletionTime string `json:"retention_deletion_time"` + RetentionDeletionTimeMillis int64 `json:"retention_deletion_time_millis"` + TotalSnapshotsTaken int64 `json:"total_snapshots_taken"` + TotalSnapshotsFailed int64 `json:"total_snapshots_failed"` + TotalSnapshotsDeleted int64 `json:"total_snapshots_deleted"` + TotalSnapshotDeletionFailures int64 `json:"total_snapshot_deletion_failures"` + PolicyStats []PolicyStats `json:"policy_stats"` } -func (s *SLM) fetchAndDecodeSLMStats() (SLMStatsResponse, error) { - var ssr SLMStatsResponse - - u := *s.url - u.Path = path.Join(u.Path, "/_slm/stats") - res, err := s.client.Get(u.String()) - if err != nil { - return ssr, fmt.Errorf("failed to get slm stats health from %s://%s:%s%s: %s", - u.Scheme, u.Hostname(), u.Port(), u.Path, err) - } - - defer func() { - err = res.Body.Close() - if err != nil { - level.Warn(s.logger).Log( - "msg", "failed to close http.Client", - "err", err, - ) - } - }() - - if res.StatusCode != http.StatusOK { - return ssr, fmt.Errorf("HTTP Request failed with code %d", res.StatusCode) - } - - bts, err := io.ReadAll(res.Body) - if err != nil { - return ssr, err - } - - if err := json.Unmarshal(bts, &ssr); err != nil { - return ssr, err - } +// PolicyStats is a representation of SLM stats for specific policies +type PolicyStats struct { + Policy string `json:"policy"` + SnapshotsTaken int64 `json:"snapshots_taken"` + SnapshotsFailed int64 `json:"snapshots_failed"` + SnapshotsDeleted int64 `json:"snapshots_deleted"` + SnapshotDeletionFailures int64 `json:"snapshot_deletion_failures"` +} - return ssr, nil +// SLMStatusResponse is a representation of the SLM status +type SLMStatusResponse struct { + OperationMode string `json:"operation_mode"` } -func (s *SLM) fetchAndDecodeSLMStatus() (SLMStatusResponse, error) { - var ssr SLMStatusResponse +func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error { + u := s.u.ResolveReference(&url.URL{Path: "/_slm/status"}) + var slmStatusResp SLMStatusResponse - u := *s.url - u.Path = path.Join(u.Path, "/_slm/status") - res, err := s.client.Get(u.String()) + resp, err := getURL(ctx, s.hc, s.logger, u.String()) if err != nil { - return ssr, fmt.Errorf("failed to get slm status from %s://%s:%s%s: %s", - u.Scheme, u.Hostname(), u.Port(), u.Path, err) - } - - defer func() { - err = res.Body.Close() - if err != nil { - level.Warn(s.logger).Log( - "msg", "failed to close http.Client", - "err", err, - ) - } - }() - - if res.StatusCode != http.StatusOK { - return ssr, fmt.Errorf("HTTP Request failed with code %d", res.StatusCode) + return err } - bts, err := io.ReadAll(res.Body) + err = json.Unmarshal(resp, &slmStatusResp) if err != nil { - return ssr, err + return err } - if err := json.Unmarshal(bts, &ssr); err != nil { - return ssr, err - } - - return ssr, nil -} - -// Collect gets SLM metric values -func (s *SLM) Collect(ch chan<- prometheus.Metric) { + u = s.u.ResolveReference(&url.URL{Path: "/_slm/stats"}) + var slmStatsResp SLMStatsResponse - slmStatusResp, err := s.fetchAndDecodeSLMStatus() + resp, err = getURL(ctx, s.hc, s.logger, u.String()) if err != nil { - level.Warn(s.logger).Log( - "msg", "failed to fetch and decode slm status", - "err", err, - ) - return + return err } - slmStatsResp, err := s.fetchAndDecodeSLMStats() + err = json.Unmarshal(resp, &slmStatsResp) if err != nil { - level.Warn(s.logger).Log( - "msg", "failed to fetch and decode slm stats", - "err", err, - ) - return + return err } for _, status := range statuses { + var value float64 = 0 + if slmStatusResp.OperationMode == status { + value = 1 + } ch <- prometheus.MustNewConstMetric( - s.slmStatusMetric.Desc, - s.slmStatusMetric.Type, - s.slmStatusMetric.Value(slmStatusResp, status), + slmOperationMode, + prometheus.GaugeValue, + value, status, ) } - for _, metric := range s.slmMetrics { + ch <- prometheus.MustNewConstMetric( + slmRetentionRunsTotal, + prometheus.CounterValue, + float64(slmStatsResp.RetentionRuns), + ) + + ch <- prometheus.MustNewConstMetric( + slmRetentionFailedTotal, + prometheus.CounterValue, + float64(slmStatsResp.RetentionFailed), + ) + + ch <- prometheus.MustNewConstMetric( + slmRetentionTimedOutTotal, + prometheus.CounterValue, + float64(slmStatsResp.RetentionTimedOut), + ) + ch <- prometheus.MustNewConstMetric( + slmRetentionDeletionTimeSeconds, + prometheus.GaugeValue, + float64(slmStatsResp.RetentionDeletionTimeMillis)/1000, + ) + ch <- prometheus.MustNewConstMetric( + slmTotalSnapshotsTaken, + prometheus.CounterValue, + float64(slmStatsResp.TotalSnapshotsTaken), + ) + ch <- prometheus.MustNewConstMetric( + slmTotalSnapshotsFailed, + prometheus.CounterValue, + float64(slmStatsResp.TotalSnapshotsFailed), + ) + ch <- prometheus.MustNewConstMetric( + slmTotalSnapshotsDeleted, + prometheus.CounterValue, + float64(slmStatsResp.TotalSnapshotsDeleted), + ) + ch <- prometheus.MustNewConstMetric( + slmTotalSnapshotsDeleteFailed, + prometheus.CounterValue, + float64(slmStatsResp.TotalSnapshotDeletionFailures), + ) + + for _, policy := range slmStatsResp.PolicyStats { ch <- prometheus.MustNewConstMetric( - metric.Desc, - metric.Type, - metric.Value(slmStatsResp), + slmSnapshotsTaken, + prometheus.CounterValue, + float64(policy.SnapshotsTaken), + policy.Policy, + ) + ch <- prometheus.MustNewConstMetric( + slmSnapshotsFailed, + prometheus.CounterValue, + float64(policy.SnapshotsFailed), + policy.Policy, + ) + ch <- prometheus.MustNewConstMetric( + slmSnapshotsDeleted, + prometheus.CounterValue, + float64(policy.SnapshotsDeleted), + policy.Policy, + ) + ch <- prometheus.MustNewConstMetric( + slmSnapshotsDeletionFailure, + prometheus.CounterValue, + float64(policy.SnapshotDeletionFailures), + policy.Policy, ) - } - for _, metric := range s.policyMetrics { - for _, policy := range slmStatsResp.PolicyStats { - ch <- prometheus.MustNewConstMetric( - metric.Desc, - metric.Type, - metric.Value(policy), - metric.Labels(policy)..., - ) - } } + + return nil + } diff --git a/collector/slm_response.go b/collector/slm_response.go deleted file mode 100644 index b1cfc1b1..00000000 --- a/collector/slm_response.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2022 The Prometheus Authors -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package collector - -// SLMStatsResponse is a representation of the SLM stats -type SLMStatsResponse struct { - RetentionRuns int64 `json:"retention_runs"` - RetentionFailed int64 `json:"retention_failed"` - RetentionTimedOut int64 `json:"retention_timed_out"` - RetentionDeletionTime string `json:"retention_deletion_time"` - RetentionDeletionTimeMillis int64 `json:"retention_deletion_time_millis"` - TotalSnapshotsTaken int64 `json:"total_snapshots_taken"` - TotalSnapshotsFailed int64 `json:"total_snapshots_failed"` - TotalSnapshotsDeleted int64 `json:"total_snapshots_deleted"` - TotalSnapshotDeletionFailures int64 `json:"total_snapshot_deletion_failures"` - PolicyStats []PolicyStats `json:"policy_stats"` -} - -// PolicyStats is a representation of SLM stats for specific policies -type PolicyStats struct { - Policy string `json:"policy"` - SnapshotsTaken int64 `json:"snapshots_taken"` - SnapshotsFailed int64 `json:"snapshots_failed"` - SnapshotsDeleted int64 `json:"snapshots_deleted"` - SnapshotDeletionFailures int64 `json:"snapshot_deletion_failures"` -} - -// SLMStatusResponse is a representation of the SLM status -type SLMStatusResponse struct { - OperationMode string `json:"operation_mode"` -} diff --git a/collector/slm_test.go b/collector/slm_test.go index cc028d4c..5351b8dd 100644 --- a/collector/slm_test.go +++ b/collector/slm_test.go @@ -123,12 +123,12 @@ func TestSLM(t *testing.T) { t.Fatalf("Failed to parse URL: %s", err) } - s := NewSLM(log.NewNopLogger(), http.DefaultClient, u) + s, err := NewSLM(log.NewNopLogger(), u, http.DefaultClient) if err != nil { t.Fatal(err) } - if err := testutil.CollectAndCompare(s, strings.NewReader(tt.want)); err != nil { + if err := testutil.CollectAndCompare(wrapCollector{s}, strings.NewReader(tt.want)); err != nil { t.Fatal(err) } }) diff --git a/main.go b/main.go index 5e4c6da0..dcb6bfa4 100644 --- a/main.go +++ b/main.go @@ -83,9 +83,6 @@ func main() { esExportShards = kingpin.Flag("es.shards", "Export stats for shards in the cluster (implies --es.indices)."). Default("false").Bool() - esExportSLM = kingpin.Flag("es.slm", - "Export stats for SLM snapshots."). - Default("false").Bool() esExportDataStream = kingpin.Flag("es.data_stream", "Export stas for Data Streams."). Default("false").Bool() @@ -213,10 +210,6 @@ func main() { } } - if *esExportSLM { - prometheus.MustRegister(collector.NewSLM(logger, httpClient, esURL)) - } - if *esExportDataStream { prometheus.MustRegister(collector.NewDataStream(logger, httpClient, esURL)) }