From b45b80f70b20c16b8216c99b1d7dd138f055c224 Mon Sep 17 00:00:00 2001 From: Chris Schinnerl Date: Wed, 21 Feb 2024 13:31:44 +0100 Subject: [PATCH 1/2] autopilot: accumulate churn information into single alert --- alerts/alerts.go | 15 ++++++-- autopilot/alerts.go | 58 +++++++++++------------------ autopilot/churn.go | 63 ++++++++++++++++++++++++++++++++ autopilot/contractor.go | 10 ++++- bus/bus.go | 11 +++++- bus/client/alerts.go | 4 +- internal/testing/cluster_test.go | 6 +-- 7 files changed, 117 insertions(+), 50 deletions(-) create mode 100644 autopilot/churn.go diff --git a/alerts/alerts.go b/alerts/alerts.go index b0d4963c6..f11004dbe 100644 --- a/alerts/alerts.go +++ b/alerts/alerts.go @@ -35,6 +35,7 @@ const ( type ( Alerter interface { + Alerts(_ context.Context, opts AlertsOpts) (resp AlertsResponse, err error) RegisterAlert(_ context.Context, a Alert) error DismissAlerts(_ context.Context, ids ...types.Hash256) error } @@ -169,17 +170,18 @@ func (m *Manager) DismissAlerts(ctx context.Context, ids ...types.Hash256) error }) } -// Active returns the host's active alerts. -func (m *Manager) Active(offset, limit int) AlertsResponse { +// Alerts returns the host's active alerts. +func (m *Manager) Alerts(_ context.Context, opts AlertsOpts) (AlertsResponse, error) { m.mu.Lock() defer m.mu.Unlock() + offset, limit := opts.Offset, opts.Limit resp := AlertsResponse{ Total: len(m.alerts), } if offset >= len(m.alerts) { - return resp + return resp, nil } else if limit == -1 { limit = len(m.alerts) } @@ -197,7 +199,7 @@ func (m *Manager) Active(offset, limit int) AlertsResponse { resp.HasMore = true } resp.Alerts = alerts - return resp + return resp, nil } func (m *Manager) RegisterWebhookBroadcaster(b webhooks.Broadcaster) { @@ -231,6 +233,11 @@ func WithOrigin(alerter Alerter, origin string) Alerter { } } +// Alerts implements the Alerter interface. +func (a *originAlerter) Alerts(ctx context.Context, opts AlertsOpts) (resp AlertsResponse, err error) { + return a.alerter.Alerts(ctx, opts) +} + // RegisterAlert implements the Alerter interface. func (a *originAlerter) RegisterAlert(ctx context.Context, alert Alert) error { if alert.Data == nil { diff --git a/autopilot/alerts.go b/autopilot/alerts.go index 292670dc5..f4762c4d4 100644 --- a/autopilot/alerts.go +++ b/autopilot/alerts.go @@ -14,12 +14,13 @@ import ( ) var ( - alertAccountRefillID = frand.Entropy256() // constant until restarted - alertLostSectorsID = frand.Entropy256() // constant until restarted - alertLowBalanceID = frand.Entropy256() // constant until restarted - alertMigrationID = frand.Entropy256() // constant until restarted - alertPruningID = frand.Entropy256() // constant until restarted - alertRenewalFailedID = frand.Entropy256() // constant until restarted + alertAccountRefillID = randomAlertID() // constant until restarted + alertChurnID = randomAlertID() // constant until restarted + alertLostSectorsID = randomAlertID() // constant until restarted + alertLowBalanceID = randomAlertID() // constant until restarted + alertMigrationID = randomAlertID() // constant until restarted + alertPruningID = randomAlertID() // constant until restarted + alertRenewalFailedID = randomAlertID() // constant until restarted ) func alertIDForAccount(alertID [32]byte, id rhpv3.Account) types.Hash256 { @@ -54,6 +55,20 @@ func (ap *Autopilot) DismissAlert(ctx context.Context, ids ...types.Hash256) { } } +func (ap *Autopilot) HasAlert(ctx context.Context, id types.Hash256) bool { + ar, err := ap.alerts.Alerts(ctx, alerts.AlertsOpts{Offset: 0, Limit: -1}) + if err != nil { + ap.logger.Errorf("failed to fetch alerts: %v", err) + return false + } + for _, alert := range ar.Alerts { + if alert.ID == id { + return true + } + } + return false +} + func newAccountLowBalanceAlert(address types.Address, balance, allowance types.Currency, bh, renewWindow, endHeight uint64) alerts.Alert { severity := alerts.SeverityInfo if bh+renewWindow/2 >= endHeight { @@ -137,37 +152,6 @@ func newContractPruningFailedAlert(hk types.PublicKey, version string, fcid type } } -func newContractSetChangeAlert(name string, additions map[types.FileContractID]contractSetAddition, removals map[types.FileContractID]contractSetRemoval) alerts.Alert { - var hint string - if len(removals) > 0 { - hint = "A high churn rate can lead to a lot of unnecessary migrations, it might be necessary to tweak your configuration depending on the reason hosts are being discarded from the set." - } - - removedReasons := make(map[string]string, len(removals)) - for k, v := range removals { - removedReasons[k.String()] = v.Reason - } - - return alerts.Alert{ - ID: randomAlertID(), - Severity: alerts.SeverityInfo, - Message: "Contract set changed", - Data: map[string]any{ - "name": name, - "set_additions": additions, - "set_removals": removals, - "hint": hint, - - // TODO: these fields can be removed on the next major release, they - // contain redundant information - "added": len(additions), - "removed": len(removals), - "removals": removedReasons, - }, - Timestamp: time.Now(), - } -} - func newLostSectorsAlert(hk types.PublicKey, lostSectors uint64) alerts.Alert { return alerts.Alert{ ID: alertIDForHost(alertLostSectorsID, hk), diff --git a/autopilot/churn.go b/autopilot/churn.go new file mode 100644 index 000000000..70c4651c2 --- /dev/null +++ b/autopilot/churn.go @@ -0,0 +1,63 @@ +package autopilot + +import ( + "time" + + "go.sia.tech/core/types" + "go.sia.tech/renterd/alerts" +) + +type ( + accumulatedChurn struct { + additions map[types.FileContractID][]contractSetAddition + removals map[types.FileContractID][]contractSetRemoval + } +) + +func newAccumulatedChurn() *accumulatedChurn { + return &accumulatedChurn{ + additions: make(map[types.FileContractID][]contractSetAddition), + removals: make(map[types.FileContractID][]contractSetRemoval), + } +} + +func (c *accumulatedChurn) Alert(name string) alerts.Alert { + var hint string + if len(c.removals) > 0 { + hint = "A high churn rate can lead to a lot of unnecessary migrations, it might be necessary to tweak your configuration depending on the reason hosts are being discarded from the set." + } + + removedReasons := make(map[string][]string, len(c.removals)) + for fcid, contractRemovals := range c.removals { + for _, removal := range contractRemovals { + removedReasons[fcid.String()] = append(removedReasons[fcid.String()], removal.Reason) + } + } + + return alerts.Alert{ + ID: alertChurnID, + Severity: alerts.SeverityInfo, + Message: "Contract set changed", + Data: map[string]any{ + "name": name, + "set_additions": c.additions, + "set_removals": c.removals, + "hint": hint, + }, + Timestamp: time.Now(), + } +} + +func (c *accumulatedChurn) Apply(additions map[types.FileContractID]contractSetAddition, removals map[types.FileContractID]contractSetRemoval) { + for fcid, addition := range additions { + c.additions[fcid] = append(c.additions[fcid], addition) + } + for fcid, removal := range removals { + c.removals[fcid] = append(c.removals[fcid], removal) + } +} + +func (c *accumulatedChurn) Reset() { + c.additions = make(map[types.FileContractID][]contractSetAddition) + c.removals = make(map[types.FileContractID][]contractSetRemoval) +} diff --git a/autopilot/contractor.go b/autopilot/contractor.go index 092f2a831..7909277f0 100644 --- a/autopilot/contractor.go +++ b/autopilot/contractor.go @@ -85,6 +85,7 @@ const ( type ( contractor struct { ap *Autopilot + churn *accumulatedChurn resolver *ipResolver logger *zap.SugaredLogger @@ -130,7 +131,7 @@ type ( contractSetRemoval struct { Size uint64 `json:"size"` HostKey types.PublicKey `json:"hostKey"` - Reason string `json:"reason"` + Reason string `json:"reasons"` } renewal struct { @@ -143,6 +144,7 @@ type ( func newContractor(ap *Autopilot, revisionSubmissionBuffer uint64, revisionBroadcastInterval time.Duration) *contractor { return &contractor{ ap: ap, + churn: newAccumulatedChurn(), logger: ap.logger.Named("contractor"), revisionBroadcastInterval: revisionBroadcastInterval, @@ -536,7 +538,11 @@ func (c *contractor) computeContractSetChanged(ctx context.Context, name string, ) hasChanged := len(setAdditions)+len(setRemovals) > 0 if hasChanged { - c.ap.RegisterAlert(ctx, newContractSetChangeAlert(name, setAdditions, setRemovals)) + if !c.ap.HasAlert(ctx, alertChurnID) { + c.churn.Reset() + } + c.churn.Apply(setAdditions, setRemovals) + c.ap.RegisterAlert(ctx, c.churn.Alert(name)) } return hasChanged } diff --git a/bus/bus.go b/bus/bus.go index 9ee6e1ba2..e7e6ddaac 100644 --- a/bus/bus.go +++ b/bus/bus.go @@ -1726,7 +1726,10 @@ func (b *bus) gougingParams(ctx context.Context) (api.GougingParams, error) { } func (b *bus) handleGETAlertsDeprecated(jc jape.Context) { - ar := b.alertMgr.Active(0, -1) + ar, err := b.alertMgr.Alerts(jc.Request.Context(), alerts.AlertsOpts{Offset: 0, Limit: -1}) + if jc.Check("failed to fetch alerts", err) != nil { + return + } jc.Encode(ar.Alerts) } @@ -1744,7 +1747,11 @@ func (b *bus) handleGETAlerts(jc jape.Context) { jc.Error(errors.New("offset must be non-negative"), http.StatusBadRequest) return } - jc.Encode(b.alertMgr.Active(offset, limit)) + ar, err := b.alertMgr.Alerts(jc.Request.Context(), alerts.AlertsOpts{Offset: offset, Limit: limit}) + if jc.Check("failed to fetch alerts", err) != nil { + return + } + jc.Encode(ar) } func (b *bus) handlePOSTAlertsDismiss(jc jape.Context) { diff --git a/bus/client/alerts.go b/bus/client/alerts.go index 7f2bf9aa7..7eceaeaed 100644 --- a/bus/client/alerts.go +++ b/bus/client/alerts.go @@ -10,13 +10,13 @@ import ( ) // Alerts fetches the active alerts from the bus. -func (c *Client) Alerts(opts alerts.AlertsOpts) (resp alerts.AlertsResponse, err error) { +func (c *Client) Alerts(ctx context.Context, opts alerts.AlertsOpts) (resp alerts.AlertsResponse, err error) { values := url.Values{} values.Set("offset", fmt.Sprint(opts.Offset)) if opts.Limit != 0 { values.Set("limit", fmt.Sprint(opts.Limit)) } - err = c.c.GET("/alerts?"+values.Encode(), &resp) + err = c.c.WithContext(ctx).GET("/alerts?"+values.Encode(), &resp) return } diff --git a/internal/testing/cluster_test.go b/internal/testing/cluster_test.go index 4fb62ff31..f30a0906a 100644 --- a/internal/testing/cluster_test.go +++ b/internal/testing/cluster_test.go @@ -1923,7 +1923,7 @@ func TestAlerts(t *testing.T) { tt.OK(b.RegisterAlert(context.Background(), alert)) findAlert := func(id types.Hash256) *alerts.Alert { t.Helper() - ar, err := b.Alerts(alerts.AlertsOpts{}) + ar, err := b.Alerts(context.Background(), alerts.AlertsOpts{}) tt.OK(err) for _, alert := range ar.Alerts { if alert.ID == id { @@ -1960,7 +1960,7 @@ func TestAlerts(t *testing.T) { } // try to find with offset = 1 - ar, err := b.Alerts(alerts.AlertsOpts{Offset: 1}) + ar, err := b.Alerts(context.Background(), alerts.AlertsOpts{Offset: 1}) foundAlerts := ar.Alerts tt.OK(err) if len(foundAlerts) != 1 || foundAlerts[0].ID != alert.ID { @@ -1968,7 +1968,7 @@ func TestAlerts(t *testing.T) { } // try to find with limit = 1 - ar, err = b.Alerts(alerts.AlertsOpts{Limit: 1}) + ar, err = b.Alerts(context.Background(), alerts.AlertsOpts{Limit: 1}) foundAlerts = ar.Alerts tt.OK(err) if len(foundAlerts) != 1 || foundAlerts[0].ID != alert2.ID { From c17252a8f250cc0184918ec93dab5e7adeb31791 Mon Sep 17 00:00:00 2001 From: Chris Schinnerl Date: Fri, 23 Feb 2024 13:14:30 +0100 Subject: [PATCH 2/2] autopilot: update churn alert to contain timestamp --- autopilot/churn.go | 41 +++++++++++++++++------------- autopilot/contractor.go | 56 +++++++++++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/autopilot/churn.go b/autopilot/churn.go index 70c4651c2..fdc1a0f54 100644 --- a/autopilot/churn.go +++ b/autopilot/churn.go @@ -9,15 +9,15 @@ import ( type ( accumulatedChurn struct { - additions map[types.FileContractID][]contractSetAddition - removals map[types.FileContractID][]contractSetRemoval + additions map[types.FileContractID]contractSetAdditions + removals map[types.FileContractID]contractSetRemovals } ) func newAccumulatedChurn() *accumulatedChurn { return &accumulatedChurn{ - additions: make(map[types.FileContractID][]contractSetAddition), - removals: make(map[types.FileContractID][]contractSetRemoval), + additions: make(map[types.FileContractID]contractSetAdditions), + removals: make(map[types.FileContractID]contractSetRemovals), } } @@ -27,13 +27,6 @@ func (c *accumulatedChurn) Alert(name string) alerts.Alert { hint = "A high churn rate can lead to a lot of unnecessary migrations, it might be necessary to tweak your configuration depending on the reason hosts are being discarded from the set." } - removedReasons := make(map[string][]string, len(c.removals)) - for fcid, contractRemovals := range c.removals { - for _, removal := range contractRemovals { - removedReasons[fcid.String()] = append(removedReasons[fcid.String()], removal.Reason) - } - } - return alerts.Alert{ ID: alertChurnID, Severity: alerts.SeverityInfo, @@ -48,16 +41,28 @@ func (c *accumulatedChurn) Alert(name string) alerts.Alert { } } -func (c *accumulatedChurn) Apply(additions map[types.FileContractID]contractSetAddition, removals map[types.FileContractID]contractSetRemoval) { - for fcid, addition := range additions { - c.additions[fcid] = append(c.additions[fcid], addition) +func (c *accumulatedChurn) Apply(additions map[types.FileContractID]contractSetAdditions, removals map[types.FileContractID]contractSetRemovals) { + for fcid, a := range additions { + if _, exists := c.additions[fcid]; !exists { + c.additions[fcid] = a + } else { + additions := c.additions[fcid] + additions.Additions = append(additions.Additions, a.Additions...) + c.additions[fcid] = additions + } } - for fcid, removal := range removals { - c.removals[fcid] = append(c.removals[fcid], removal) + for fcid, r := range removals { + if _, exists := c.removals[fcid]; !exists { + c.removals[fcid] = r + } else { + removals := c.removals[fcid] + removals.Removals = append(removals.Removals, r.Removals...) + c.removals[fcid] = removals + } } } func (c *accumulatedChurn) Reset() { - c.additions = make(map[types.FileContractID][]contractSetAddition) - c.removals = make(map[types.FileContractID][]contractSetRemoval) + c.additions = make(map[types.FileContractID]contractSetAdditions) + c.removals = make(map[types.FileContractID]contractSetRemovals) } diff --git a/autopilot/contractor.go b/autopilot/contractor.go index 7909277f0..9e2b52cca 100644 --- a/autopilot/contractor.go +++ b/autopilot/contractor.go @@ -123,15 +123,25 @@ type ( recoverable bool } + contractSetAdditions struct { + HostKey types.PublicKey `json:"hostKey"` + Additions []contractSetAddition `json:"additions"` + } + contractSetAddition struct { - Size uint64 `json:"size"` - HostKey types.PublicKey `json:"hostKey"` + Size uint64 `json:"size"` + Time api.TimeRFC3339 `json:"time"` + } + + contractSetRemovals struct { + HostKey types.PublicKey `json:"hostKey"` + Removals []contractSetRemoval `json:"removals"` } contractSetRemoval struct { - Size uint64 `json:"size"` - HostKey types.PublicKey `json:"hostKey"` - Reason string `json:"reasons"` + Size uint64 `json:"size"` + Reason string `json:"reasons"` + Time api.TimeRFC3339 `json:"time"` } renewal struct { @@ -455,8 +465,9 @@ func (c *contractor) computeContractSetChanged(ctx context.Context, name string, } // log added and removed contracts - setAdditions := make(map[types.FileContractID]contractSetAddition) - setRemovals := make(map[types.FileContractID]contractSetRemoval) + setAdditions := make(map[types.FileContractID]contractSetAdditions) + setRemovals := make(map[types.FileContractID]contractSetRemovals) + now := api.TimeNow() for _, contract := range oldSet { _, exists := inNewSet[contract.ID] _, renewed := inNewSet[renewalsFromTo[contract.ID]] @@ -466,11 +477,18 @@ func (c *contractor) computeContractSetChanged(ctx context.Context, name string, reason = "unknown" } - setRemovals[contract.ID] = contractSetRemoval{ - Size: contractData[contract.ID], - HostKey: contract.HostKey, - Reason: reason, + if _, exists := setRemovals[contract.ID]; !exists { + setRemovals[contract.ID] = contractSetRemovals{ + HostKey: contract.HostKey, + } } + removals := setRemovals[contract.ID] + removals.Removals = append(removals.Removals, contractSetRemoval{ + Size: contractData[contract.ID], + Reason: reason, + Time: now, + }) + setRemovals[contract.ID] = removals c.logger.Debugf("contract %v was removed from the contract set, size: %v, reason: %v", contract.ID, contractData[contract.ID], reason) } } @@ -478,10 +496,17 @@ func (c *contractor) computeContractSetChanged(ctx context.Context, name string, _, existed := inOldSet[contract.ID] _, renewed := renewalsToFrom[contract.ID] if !existed && !renewed { - setAdditions[contract.ID] = contractSetAddition{ - Size: contractData[contract.ID], - HostKey: contract.HostKey, + if _, exists := setAdditions[contract.ID]; !exists { + setAdditions[contract.ID] = contractSetAdditions{ + HostKey: contract.HostKey, + } } + additions := setAdditions[contract.ID] + additions.Additions = append(additions.Additions, contractSetAddition{ + Size: contractData[contract.ID], + Time: now, + }) + setAdditions[contract.ID] = additions c.logger.Debugf("contract %v was added to the contract set, size: %v", contract.ID, contractData[contract.ID]) } } @@ -501,7 +526,6 @@ func (c *contractor) computeContractSetChanged(ctx context.Context, name string, } // record churn metrics - now := api.TimeNow() var metrics []api.ContractSetChurnMetric for fcid := range setAdditions { metrics = append(metrics, api.ContractSetChurnMetric{ @@ -516,7 +540,7 @@ func (c *contractor) computeContractSetChanged(ctx context.Context, name string, Name: c.ap.state.cfg.Contracts.Set, ContractID: fcid, Direction: api.ChurnDirRemoved, - Reason: removal.Reason, + Reason: removal.Removals[0].Reason, Timestamp: now, }) }