From c0ba0f998e9b1ebf2d27ecc5fd72ba088051667e Mon Sep 17 00:00:00 2001 From: Augustin Husson Date: Mon, 4 Nov 2024 10:27:30 +0100 Subject: [PATCH] add a retry system for the rules-collector Signed-off-by: Augustin Husson --- config/collector.go | 8 +++++++- docs/configuration.md | 4 ++++ source/rules/rules.go | 27 ++++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/config/collector.go b/config/collector.go index ec18dab..4d1835c 100644 --- a/config/collector.go +++ b/config/collector.go @@ -112,7 +112,10 @@ type RulesCollector struct { Period model.Duration `yaml:"period,omitempty"` // MetricUsageClient is a client to send the metrics usage to a remote metrics_usage server. MetricUsageClient *HTTPClient `yaml:"metric_usage_client,omitempty"` - HTTPClient HTTPClient `yaml:"prometheus_client"` + // RetryToGetRules is the number of retries the collector will do to get the rules from Prometheus before actually failing. + // Between each retry, the collector will wait first 10 seconds, then 20 seconds, then 30 seconds ...etc. + RetryToGetRules uint `yaml:"retry_to_get_rules,omitempty"` + HTTPClient HTTPClient `yaml:"prometheus_client"` } func (c *RulesCollector) Verify() error { @@ -122,6 +125,9 @@ func (c *RulesCollector) Verify() error { if c.Period <= 0 { c.Period = model.Duration(defaultMetricCollectorPeriodDuration) } + if c.RetryToGetRules == 0 { + c.RetryToGetRules = 3 + } if c.HTTPClient.URL == nil { return fmt.Errorf("missing Prometheus URL for the rules collector") } diff --git a/docs/configuration.md b/docs/configuration.md index f03539c..a255c01 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -83,6 +83,10 @@ http_client: # It is a client to send the metrics usage to a remote metrics_usage server. [ metric_usage_client: ] +# It is the number of retries the collector will do to get the rules from Prometheus before actually failing. 
+# Between each retry, the collector will wait first 10 seconds, then 20 seconds, then 30 seconds ...etc.
+[ retry_to_get_rules: | default=3 ]
+
 # The prometheus client used to retrieve the rules
 prometheus_client:
 ```
diff --git a/source/rules/rules.go b/source/rules/rules.go
index 3d8e7bc..a51eb1c 100644
--- a/source/rules/rules.go
+++ b/source/rules/rules.go
@@ -15,6 +15,7 @@ package rules
 
 import (
 	"context"
+	"time"
 
 	"github.com/perses/common/async"
 	"github.com/perses/metrics-usage/config"
@@ -45,6 +46,7 @@ func NewCollector(db database.Database, cfg *config.RulesCollector) (async.Simpl
 		metricUsageClient: metricUsageClient,
 		promURL:           cfg.HTTPClient.URL.String(),
 		logger:            logrus.StandardLogger().WithField("collector", "rules"),
+		retry:             cfg.RetryToGetRules,
 	}, nil
 }
 
@@ -55,10 +57,11 @@ type rulesCollector struct {
 	metricUsageClient client.Client
 	promURL           string
 	logger            *logrus.Entry
+	retry             uint
 }
 
 func (c *rulesCollector) Execute(ctx context.Context, _ context.CancelFunc) error {
-	result, err := c.promClient.Rules(ctx)
+	result, err := c.getRules(ctx)
 	if err != nil {
 		c.logger.WithError(err).Error("Failed to get rules")
 		return nil
@@ -81,6 +84,45 @@ func (c *rulesCollector) String() string {
 	return "rules collector"
 }
 
+// getRules fetches the rules from Prometheus, making at most c.retry attempts
+// (and always at least one). After a failed attempt it waits 10 seconds, then 20
+// seconds, then 30 seconds, and so on, before trying again. The wait is aborted
+// as soon as ctx is done, and there is no wait after the final attempt. When every
+// attempt fails, the error of the last one is returned.
+func (c *rulesCollector) getRules(ctx context.Context) (v1.RulesResult, error) {
+	attempts := c.retry
+	if attempts == 0 {
+		// A collector built without a retry count must still query Prometheus
+		// once instead of reporting an empty result with a nil error.
+		attempts = 1
+	}
+	waitDuration := 10 * time.Second
+	var err error
+	for attempt := uint(1); attempt <= attempts; attempt++ {
+		var result v1.RulesResult
+		result, err = c.promClient.Rules(ctx)
+		if err == nil {
+			c.logger.Infof("successfuly get the rules")
+			return result, nil
+		}
+		if attempt == attempts {
+			// No attempt left: skip the pointless wait and report the failure.
+			break
+		}
+		c.logger.WithError(err).Debug("Failed to get rules, retrying...")
+		// Wait on a timer so that a cancelled context interrupts the back-off.
+		timer := time.NewTimer(waitDuration)
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return v1.RulesResult{}, ctx.Err()
+		case <-timer.C:
+		}
+		waitDuration += 10 * time.Second
+	}
+	return v1.RulesResult{}, err
+}
+
 func extractMetricUsageFromRules(ruleGroups []v1.RuleGroup, source string) map[string]*modelAPIV1.MetricUsage {
 	metricUsage := make(map[string]*modelAPIV1.MetricUsage)
 	for _, ruleGroup := range ruleGroups {