Skip to content

Commit

Permalink
add a retry system for the rules-collector
Browse files Browse the repository at this point in the history
Signed-off-by: Augustin Husson <[email protected]>
  • Loading branch information
Nexucis committed Nov 4, 2024
1 parent d3917fd commit c0ba0f9
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 2 deletions.
8 changes: 7 additions & 1 deletion config/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,10 @@ type RulesCollector struct {
Period model.Duration `yaml:"period,omitempty"`
// MetricUsageClient is a client to send the metrics usage to a remote metrics_usage server.
MetricUsageClient *HTTPClient `yaml:"metric_usage_client,omitempty"`
HTTPClient HTTPClient `yaml:"prometheus_client"`
// RetryToGetRules is the number of retries the collector will do to get the rules from Prometheus before actually failing.
// Between retries, the collector waits 10 seconds, then 20 seconds, then 30 seconds, and so on.
RetryToGetRules uint `yaml:"retry_to_get_rules,omitempty"`
HTTPClient HTTPClient `yaml:"prometheus_client"`
}

func (c *RulesCollector) Verify() error {
Expand All @@ -122,6 +125,9 @@ func (c *RulesCollector) Verify() error {
if c.Period <= 0 {
c.Period = model.Duration(defaultMetricCollectorPeriodDuration)
}
if c.RetryToGetRules == 0 {
c.RetryToGetRules = 3
}
if c.HTTPClient.URL == nil {
return fmt.Errorf("missing Prometheus URL for the rules collector")
}
Expand Down
4 changes: 4 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ http_client: <HTTPClient config>
# It is a client to send the metrics usage to a remote metrics_usage server.
[ metric_usage_client: <HTTPClient config> ]

# The number of times the collector retries getting the rules from Prometheus before actually failing.
# Between retries, the collector waits 10 seconds, then 20 seconds, then 30 seconds, and so on.
[ retry_to_get_rules: <number> | default=3 ]

# The prometheus client used to retrieve the rules
prometheus_client: <HTTPClient config>
```
Expand Down
27 changes: 26 additions & 1 deletion source/rules/rules.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ package rules

import (
"context"
"time"

"github.com/perses/common/async"
"github.com/perses/metrics-usage/config"
Expand Down Expand Up @@ -45,6 +46,7 @@ func NewCollector(db database.Database, cfg *config.RulesCollector) (async.Simpl
metricUsageClient: metricUsageClient,
promURL: cfg.HTTPClient.URL.String(),
logger: logrus.StandardLogger().WithField("collector", "rules"),
retry: cfg.RetryToGetRules,
}, nil
}

Expand All @@ -55,10 +57,11 @@ type rulesCollector struct {
metricUsageClient client.Client
promURL string
logger *logrus.Entry
retry uint
}

func (c *rulesCollector) Execute(ctx context.Context, _ context.CancelFunc) error {
result, err := c.promClient.Rules(ctx)
result, err := c.getRules(ctx)
if err != nil {
c.logger.WithError(err).Error("Failed to get rules")
return nil
Expand All @@ -81,6 +84,28 @@ func (c *rulesCollector) String() string {
return "rules collector"
}

// getRules fetches the rules from Prometheus, trying up to c.retry times
// (and always at least once, even when c.retry is 0).
//
// After a failed attempt it waits before trying again: 10 seconds after the
// first failure, 20 seconds after the second, and so on. No wait happens after
// the last attempt, so the final error is reported immediately.
//
// The wait is interrupted as soon as ctx is done; in that case ctx.Err() is
// returned. Otherwise, when every attempt fails, the error of the last attempt
// is returned. The returned result is only meaningful when the error is nil.
func (c *rulesCollector) getRules(ctx context.Context) (v1.RulesResult, error) {
	attempts := c.retry
	if attempts == 0 {
		// Guarantee one real call so that the caller never receives an
		// empty result together with a nil error.
		attempts = 1
	}
	waitDuration := 10 * time.Second
	var lastErr error
	for attempt := uint(1); attempt <= attempts; attempt++ {
		result, err := c.promClient.Rules(ctx)
		if err == nil {
			c.logger.Infof("successfuly get the rules")
			return result, nil
		}
		lastErr = err
		if attempt == attempts {
			// Nothing left to retry: do not delay the failure report.
			break
		}
		c.logger.WithError(err).Debug("Failed to get rules, retrying...")
		// Wait with a timer instead of time.Sleep so that a cancelled
		// context stops the retry loop right away.
		timer := time.NewTimer(waitDuration)
		select {
		case <-ctx.Done():
			timer.Stop()
			return v1.RulesResult{}, ctx.Err()
		case <-timer.C:
		}
		waitDuration += 10 * time.Second
	}
	return v1.RulesResult{}, lastErr
}

func extractMetricUsageFromRules(ruleGroups []v1.RuleGroup, source string) map[string]*modelAPIV1.MetricUsage {
metricUsage := make(map[string]*modelAPIV1.MetricUsage)
for _, ruleGroup := range ruleGroups {
Expand Down

0 comments on commit c0ba0f9

Please sign in to comment.