From 2b68ec4085fb963a8f49e705414748558298e510 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Tue, 21 Nov 2023 14:58:57 +0100 Subject: [PATCH] Restore active incidents from DB gracefully --- internal/incident/db_types.go | 48 --------------- internal/incident/incident.go | 29 ++++----- internal/incident/incidents.go | 107 +++++++++++++++++++-------------- internal/incident/sync.go | 51 ++++++++++------ internal/object/db_types.go | 63 +++---------------- internal/object/object.go | 42 +++++-------- 6 files changed, 135 insertions(+), 205 deletions(-) diff --git a/internal/incident/db_types.go b/internal/incident/db_types.go index c037f7a0d..b5aa227ca 100644 --- a/internal/incident/db_types.go +++ b/internal/incident/db_types.go @@ -1,59 +1,11 @@ package incident import ( - "context" - "fmt" "github.com/icinga/icinga-notifications/internal/event" "github.com/icinga/icinga-notifications/internal/recipient" - "github.com/icinga/icinga-notifications/internal/utils" - "github.com/icinga/icingadb/pkg/icingadb" "github.com/icinga/icingadb/pkg/types" - "github.com/jmoiron/sqlx" ) -type IncidentRow struct { - ID int64 `db:"id"` - ObjectID types.Binary `db:"object_id"` - StartedAt types.UnixMilli `db:"started_at"` - RecoveredAt types.UnixMilli `db:"recovered_at"` - Severity event.Severity `db:"severity"` -} - -// TableName implements the contracts.TableNamer interface. -func (i *IncidentRow) TableName() string { - return "incident" -} - -// Upsert implements the contracts.Upserter interface. -func (i *IncidentRow) Upsert() interface{} { - return &struct { - Severity event.Severity `db:"severity"` - RecoveredAt types.UnixMilli `db:"recovered_at"` - }{Severity: i.Severity, RecoveredAt: i.RecoveredAt} -} - -// Sync synchronizes incidents to the database. -// Fetches the last inserted incident id and modifies this incident's id. -// Returns an error on database failure. 
-func (i *IncidentRow) Sync(ctx context.Context, tx *sqlx.Tx, db *icingadb.DB, upsert bool) error { - if upsert { - stmt, _ := db.BuildUpsertStmt(i) - _, err := tx.NamedExecContext(ctx, stmt, i) - if err != nil { - return fmt.Errorf("failed to upsert incident: %s", err) - } - } else { - incidentId, err := utils.InsertAndFetchId(ctx, tx, utils.BuildInsertStmtWithout(db, i, "id"), i) - if err != nil { - return err - } - - i.ID = incidentId - } - - return nil -} - // EventRow represents a single incident event database entry. type EventRow struct { IncidentID int64 `db:"incident_id"` diff --git a/internal/incident/incident.go b/internal/incident/incident.go index b0d60aa64..8ba9326e6 100644 --- a/internal/incident/incident.go +++ b/internal/incident/incident.go @@ -24,17 +24,18 @@ type ruleID = int64 type escalationID = int64 type Incident struct { - Object *object.Object - StartedAt time.Time - RecoveredAt time.Time - Severity event.Severity + Id int64 `db:"id"` + ObjectID types.Binary `db:"object_id"` + StartedAt types.UnixMilli `db:"started_at"` + RecoveredAt types.UnixMilli `db:"recovered_at"` + Severity event.Severity `db:"severity"` + + Object *object.Object `db:"-"` EscalationState map[escalationID]*EscalationState Rules map[ruleID]struct{} Recipients map[recipient.Key]*RecipientState - incidentRowID int64 - // timer calls RetriggerEscalations the next time any escalation could be reached on the incident. 
// // For example, if there are escalations configured for incident_age>=1h and incident_age>=2h, if the incident @@ -73,11 +74,11 @@ func (i *Incident) SeverityString() string { } func (i *Incident) String() string { - return fmt.Sprintf("#%d", i.incidentRowID) + return fmt.Sprintf("#%d", i.ID()) } func (i *Incident) ID() int64 { - return i.incidentRowID + return i.Id } func (i *Incident) HasManager() bool { @@ -190,7 +191,7 @@ func (i *Incident) RetriggerEscalations(ev *event.Event) { i.runtimeConfig.RLock() defer i.runtimeConfig.RUnlock() - if !i.RecoveredAt.IsZero() { + if !i.RecoveredAt.Time().IsZero() { // Incident is recovered in the meantime. return } @@ -276,14 +277,14 @@ func (i *Incident) processSeverityChangedEvent(ctx context.Context, tx *sqlx.Tx, causedByHistoryId = historyId if newSeverity == event.SeverityOK { - i.RecoveredAt = time.Now() + i.RecoveredAt = types.UnixMilli(time.Now()) i.logger.Info("All sources recovered, closing incident") RemoveCurrent(i.Object) history := &HistoryRow{ EventID: utils.ToDBInt(ev.ID), - Time: types.UnixMilli(i.RecoveredAt), + Time: i.RecoveredAt, Type: Closed, } @@ -310,7 +311,7 @@ func (i *Incident) processSeverityChangedEvent(ctx context.Context, tx *sqlx.Tx, } func (i *Incident) processIncidentOpenedEvent(ctx context.Context, tx *sqlx.Tx, ev *event.Event) error { - i.StartedAt = ev.Time + i.StartedAt = types.UnixMilli(ev.Time) i.Severity = ev.Severity if err := i.Sync(ctx, tx); err != nil { i.logger.Errorw("Can't insert incident to the database", zap.Error(err)) @@ -410,7 +411,7 @@ func (i *Incident) evaluateEscalations(eventTime time.Time) ([]*rule.Escalation, i.timer = nil } - filterContext := &rule.EscalationFilter{IncidentAge: eventTime.Sub(i.StartedAt), IncidentSeverity: i.Severity} + filterContext := &rule.EscalationFilter{IncidentAge: eventTime.Sub(i.StartedAt.Time()), IncidentSeverity: i.Severity} var escalations []*rule.Escalation retryAfter := rule.RetryNever @@ -466,7 +467,7 @@ func (i *Incident) 
evaluateEscalations(eventTime time.Time) ([]*rule.Escalation, i.RetriggerEscalations(&event.Event{ Type: event.TypeInternal, Time: nextEvalAt, - Message: fmt.Sprintf("Incident reached age %v", nextEvalAt.Sub(i.StartedAt)), + Message: fmt.Sprintf("Incident reached age %v", nextEvalAt.Sub(i.StartedAt.Time())), }) }) } diff --git a/internal/incident/incidents.go b/internal/incident/incidents.go index 56d0b1663..2860f68be 100644 --- a/internal/incident/incidents.go +++ b/internal/incident/incidents.go @@ -2,14 +2,12 @@ package incident import ( "context" - "database/sql" "errors" "github.com/icinga/icinga-notifications/internal/config" "github.com/icinga/icinga-notifications/internal/event" "github.com/icinga/icinga-notifications/internal/object" "github.com/icinga/icingadb/pkg/icingadb" "github.com/icinga/icingadb/pkg/logging" - "github.com/icinga/icingadb/pkg/types" "go.uber.org/zap" "sync" "time" @@ -25,31 +23,75 @@ var ( func LoadOpenIncidents(ctx context.Context, db *icingadb.DB, logger *logging.Logger, runtimeConfig *config.RuntimeConfig) error { logger.Info("Loading all active incidents from database") - var objectIDs []types.Binary - err := db.SelectContext(ctx, &objectIDs, `SELECT object_id FROM incident WHERE "recovered_at" IS NULL`) + query := ` + SELECT incident.id, started_at, severity, incident.object_id, object.source_id, object.host, object.service, object.name, object.url, oet.tag, oet.value + FROM incident + INNER JOIN object on object.id = incident.object_id + INNER JOIN object_extra_tag oet on object.id = oet.object_id + WHERE "recovered_at" IS NULL + GROUP BY incident.id, object.id, oet.tag, oet.value ORDER BY incident.id` + + rows, err := db.QueryxContext(ctx, query) if err != nil { logger.Errorw("Failed to load active incidents from database", zap.Error(err)) return errors.New("failed to fetch open incidents") } + defer func() { _ = rows.Close() }() - for _, objectID := range objectIDs { - obj, err := object.LoadFromDB(ctx, db, objectID) - if err != nil { - 
logger.Errorw("Failed to retrieve incident object from database", zap.Error(err)) - continue - } + cacheIncident := func(i *Incident) { + currentIncidentsMu.Lock() + defer currentIncidentsMu.Unlock() - incident, _, err := GetCurrent(ctx, db, obj, logger, runtimeConfig, false) - if err != nil { - continue + i.logger = logger.With(zap.String("object", i.Object.DisplayName()), zap.String("incident", i.String())) + if i.restoreEscalationsState(ctx) != nil { + // Error is already logged within the function + return } - incident.RetriggerEscalations(&event.Event{ + i.RetriggerEscalations(&event.Event{ Time: time.Now(), Type: event.TypeInternal, Message: "Incident reevaluation at daemon startup", }) + + object.Cache(i.Object) + + currentIncidents[i.Object] = i + } + + var prevIncident *Incident + var tag, value string + for rows.Next() { + ev := &event.Event{Tags: make(map[string]string), ExtraTags: make(map[string]string)} + i := NewIncident(db, object.NewObject(db, ev), runtimeConfig, nil) + err := rows.Scan(&i.Id, &i.StartedAt, &i.Severity, &i.ObjectID, &i.Object.SourceID, &i.Object.Host, &i.Object.Service, &i.Object.Name, &i.Object.URL, &tag, &value) + if err != nil { + return err + } + + if prevIncident != nil && i.ID() == prevIncident.ID() { + prevIncident.Object.ExtraTags[tag] = value + } else { + i.Object.ID = i.ObjectID + i.Object.Tags["host"] = i.Object.Host + if i.Object.Service.Valid { + i.Object.Tags["service"] = i.Object.Service.String + } + + i.Object.ExtraTags[tag] = value + + if prevIncident == nil { + prevIncident = i + } else { + cacheIncident(prevIncident) + prevIncident = i + } + } + } + + if prevIncident != nil { + cacheIncident(prevIncident) } return nil @@ -65,37 +107,14 @@ func GetCurrent( created := false currentIncident := currentIncidents[obj] - if currentIncident == nil { - ir := &IncidentRow{} - incidentLogger := logger.With(zap.String("object", obj.DisplayName())) - incident := NewIncident(db, obj, runtimeConfig, incidentLogger) - - err := 
db.QueryRowxContext(ctx, db.Rebind(db.BuildSelectStmt(ir, ir)+` WHERE "object_id" = ? AND "recovered_at" IS NULL`), obj.ID).StructScan(ir) - if err != nil && !errors.Is(err, sql.ErrNoRows) { - logger.Errorw("Failed to load incident from database", zap.String("object", obj.DisplayName()), zap.Error(err)) - - return nil, false, errors.New("failed to load incident from database") - } else if err == nil { - incident.incidentRowID = ir.ID - incident.StartedAt = ir.StartedAt.Time() - incident.Severity = ir.Severity - incident.logger = logger.With(zap.String("object", obj.DisplayName()), zap.String("incident", incident.String())) + if currentIncident == nil && create { + created = true - if err := incident.restoreEscalationsState(ctx); err != nil { - return nil, false, err - } - - currentIncident = incident - } - - if create && currentIncident == nil { - created = true - currentIncident = incident - } + incidentLogger := logger.With(zap.String("object", obj.DisplayName())) + currentIncident = NewIncident(db, obj, runtimeConfig, incidentLogger) + currentIncident.ObjectID = obj.ID - if currentIncident != nil { - currentIncidents[obj] = currentIncident - } + currentIncidents[obj] = currentIncident } if !created && currentIncident != nil { @@ -128,7 +147,7 @@ func GetCurrentIncidents() map[int64]*Incident { m := make(map[int64]*Incident) for _, incident := range currentIncidents { - m[incident.incidentRowID] = incident + m[incident.ID()] = incident } return m } diff --git a/internal/incident/sync.go b/internal/incident/sync.go index b29969dc9..ea13a5a4f 100644 --- a/internal/incident/sync.go +++ b/internal/incident/sync.go @@ -3,6 +3,7 @@ package incident import ( "context" "errors" + "fmt" "github.com/icinga/icinga-notifications/internal/event" "github.com/icinga/icinga-notifications/internal/recipient" "github.com/icinga/icinga-notifications/internal/rule" @@ -13,30 +14,44 @@ import ( "time" ) +// TableName implements the contracts.TableNamer interface. 
+func (i *Incident) TableName() string { + return "incident" +} + +// Upsert implements the contracts.Upserter interface. +func (i *Incident) Upsert() interface{} { + return &struct { + Severity event.Severity `db:"severity"` + RecoveredAt types.UnixMilli `db:"recovered_at"` + }{Severity: i.Severity, RecoveredAt: i.RecoveredAt} +} + // Sync initiates an *incident.IncidentRow from the current incident state and syncs it with the database. // Before syncing any incident related database entries, this method should be called at least once. // Returns an error on db failure. func (i *Incident) Sync(ctx context.Context, tx *sqlx.Tx) error { - incidentRow := &IncidentRow{ - ID: i.incidentRowID, - ObjectID: i.Object.ID, - StartedAt: types.UnixMilli(i.StartedAt), - RecoveredAt: types.UnixMilli(i.RecoveredAt), - Severity: i.Severity, - } + if i.ID() != 0 { + stmt, _ := i.db.BuildUpsertStmt(i) + _, err := tx.NamedExecContext(ctx, stmt, i) + if err != nil { + return fmt.Errorf("failed to upsert incident: %w", err) + } + } else { + stmt := utils.BuildInsertStmtWithout(i.db, i, "id") + incidentId, err := utils.InsertAndFetchId(ctx, tx, stmt, i) + if err != nil { + return err + } - err := incidentRow.Sync(ctx, tx, i.db, i.incidentRowID != 0) - if err != nil { - return err + i.Id = incidentId } - i.incidentRowID = incidentRow.ID - return nil } func (i *Incident) AddHistory(ctx context.Context, tx *sqlx.Tx, historyRow *HistoryRow, fetchId bool) (types.Int, error) { - historyRow.IncidentID = i.incidentRowID + historyRow.IncidentID = i.ID() stmt := utils.BuildInsertStmtWithout(i.db, historyRow, "id") if fetchId { @@ -57,7 +72,7 @@ func (i *Incident) AddHistory(ctx context.Context, tx *sqlx.Tx, historyRow *Hist } func (i *Incident) AddEscalationTriggered(ctx context.Context, tx *sqlx.Tx, state *EscalationState) error { - state.IncidentID = i.incidentRowID + state.IncidentID = i.ID() stmt, _ := i.db.BuildUpsertStmt(state) _, err := tx.NamedExecContext(ctx, stmt, state) @@ -67,7 +82,7 
@@ func (i *Incident) AddEscalationTriggered(ctx context.Context, tx *sqlx.Tx, stat // AddEvent Inserts incident history record to the database and returns an error on db failure. func (i *Incident) AddEvent(ctx context.Context, tx *sqlx.Tx, ev *event.Event) error { - ie := &EventRow{IncidentID: i.incidentRowID, EventID: ev.ID} + ie := &EventRow{IncidentID: i.ID(), EventID: ev.ID} stmt, _ := i.db.BuildInsertStmt(ie) _, err := tx.NamedExecContext(ctx, stmt, ie) @@ -84,7 +99,7 @@ func (i *Incident) AddRecipient(ctx context.Context, tx *sqlx.Tx, escalation *ru for _, escalationRecipient := range escalation.Recipients { r := escalationRecipient.Recipient - cr := &ContactRow{IncidentID: i.incidentRowID, Role: newRole} + cr := &ContactRow{IncidentID: i.ID(), Role: newRole} recipientKey := recipient.ToKey(r) cr.Key = recipientKey @@ -100,7 +115,7 @@ func (i *Incident) AddRecipient(ctx context.Context, tx *sqlx.Tx, escalation *ru i.logger.Infof("Contact %q role changed from %s to %s", r, state.Role.String(), newRole.String()) hr := &HistoryRow{ - IncidentID: i.incidentRowID, + IncidentID: i.ID(), EventID: utils.ToDBInt(eventId), Key: cr.Key, Time: types.UnixMilli(time.Now()), @@ -140,7 +155,7 @@ func (i *Incident) AddRecipient(ctx context.Context, tx *sqlx.Tx, escalation *ru // AddRuleMatched syncs the given *rule.Rule to the database. // Returns an error on database failure. 
func (i *Incident) AddRuleMatched(ctx context.Context, tx *sqlx.Tx, r *rule.Rule) error { - rr := &RuleRow{IncidentID: i.incidentRowID, RuleID: r.ID} + rr := &RuleRow{IncidentID: i.ID(), RuleID: r.ID} stmt, _ := i.db.BuildUpsertStmt(rr) _, err := tx.NamedExecContext(ctx, stmt, rr) diff --git a/internal/object/db_types.go b/internal/object/db_types.go index e8e6eec05..67e7dbcae 100644 --- a/internal/object/db_types.go +++ b/internal/object/db_types.go @@ -1,9 +1,7 @@ package object import ( - "context" "fmt" - "github.com/icinga/icingadb/pkg/icingadb" "github.com/icinga/icingadb/pkg/types" ) @@ -19,71 +17,26 @@ func (e *ExtraTagRow) TableName() string { return "object_extra_tag" } -type ObjectRow struct { - ID types.Binary `db:"id"` - SourceID int64 `db:"source_id"` - Name string `db:"name"` - Host string `db:"host"` - Service types.String `db:"service"` - URL types.String `db:"url"` -} - // TableName implements the contracts.TableNamer interface. -func (or *ObjectRow) TableName() string { +func (o *Object) TableName() string { return "object" } // Upsert implements the contracts.Upserter interface. -func (or *ObjectRow) Upsert() interface{} { +func (o *Object) Upsert() interface{} { return struct { Name string `db:"name"` URL types.String `db:"url"` }{} } -// LoadFromDB loads objects from the database matching the given id. -// This is only used to load the objects at daemon startup before the listener becomes ready, -// therefore it doesn't lock the objects cache mutex and panics when the given object ID is already -// in the cache. Otherwise, loads all the required data and returns error on database failure. -func LoadFromDB(ctx context.Context, db *icingadb.DB, id types.Binary) (*Object, error) { - if obj, ok := cache[id.String()]; ok { +// Cache adds the given object to the global object cache store. 
+// This is only used after loading the objects at daemon startup before the listener becomes ready, therefore it +// doesn't lock the objects cache mutex and panics when the given object is already in the cache. +func Cache(obj *Object) { + if obj, ok := cache[obj.ID.String()]; ok { panic(fmt.Sprintf("Object %s is already in cache", obj.DisplayName())) } - objectRow := &ObjectRow{ID: id} - err := db.QueryRowxContext(ctx, db.Rebind(db.BuildSelectStmt(objectRow, objectRow)+` WHERE "id" = ?`), objectRow.ID).StructScan(objectRow) - if err != nil { - return nil, fmt.Errorf("failed to fetch object: %w", err) - } - - tags := map[string]string{"host": objectRow.Host} - if objectRow.Service.Valid { - tags["service"] = objectRow.Service.String - } - - var extraTagRows []*ExtraTagRow - err = db.SelectContext( - ctx, &extraTagRows, - db.Rebind(db.BuildSelectStmt(&ExtraTagRow{}, &ExtraTagRow{})+` WHERE "object_id" = ?`), id, - ) - if err != nil { - return nil, fmt.Errorf("failed to fetch object extra tags: %w", err) - } - - extraTags := map[string]string{} - for _, extraTag := range extraTagRows { - extraTags[extraTag.Tag] = extraTag.Value - } - - obj := &Object{ - db: db, - ID: id, - Name: objectRow.Name, - URL: objectRow.URL.String, - Tags: tags, - ExtraTags: extraTags, - } - cache[id.String()] = obj - - return obj, nil + cache[obj.ID.String()] = obj } diff --git a/internal/object/object.go b/internal/object/object.go index ebccbecc3..0687bc546 100644 --- a/internal/object/object.go +++ b/internal/object/object.go @@ -24,12 +24,14 @@ var ( ) type Object struct { - ID types.Binary - SourceId int64 - Name string - Tags map[string]string - URL string - + ID types.Binary `db:"id"` + SourceID int64 `db:"source_id"` + Name string `db:"name"` + Host string `db:"host"` + Service types.String `db:"service"` + URL types.String `db:"url"` + + Tags map[string]string ExtraTags map[string]string db *icingadb.DB @@ -37,10 +39,10 @@ type Object struct { func NewObject(db *icingadb.DB, ev 
*event.Event) *Object { return &Object{ - SourceId: ev.SourceId, + SourceID: ev.SourceId, Name: ev.Name, db: db, - URL: ev.URL, + URL: utils.ToDBString(ev.URL), Tags: ev.Tags, ExtraTags: ev.ExtraTags, } @@ -68,20 +70,13 @@ func FromEvent(ctx context.Context, db *icingadb.DB, ev *event.Event) (*Object, } defer func() { _ = tx.Rollback() }() - dbObj := &ObjectRow{ - ID: object.ID, - SourceID: ev.SourceId, - Name: ev.Name, - Host: ev.Tags["host"], - URL: utils.ToDBString(ev.URL), - } - + object.Host = ev.Tags["host"] if service, ok := ev.Tags["service"]; ok { - dbObj.Service = utils.ToDBString(service) + object.Service = utils.ToDBString(service) } - stmt, _ := object.db.BuildUpsertStmt(&ObjectRow{}) - _, err = tx.NamedExecContext(ctx, stmt, dbObj) + stmt, _ := object.db.BuildUpsertStmt(&Object{}) + _, err = tx.NamedExecContext(ctx, stmt, object) if err != nil { return nil, fmt.Errorf("failed to insert object: %w", err) } @@ -104,11 +99,6 @@ func FromEvent(ctx context.Context, db *icingadb.DB, ev *event.Event) (*Object, return nil, fmt.Errorf("can't commit object database transaction: %w", err) } - object.ExtraTags = ev.ExtraTags - object.Tags = ev.Tags - object.Name = ev.Name - object.URL = ev.URL - return object, nil } @@ -136,9 +126,9 @@ func (o *Object) String() string { _, _ = fmt.Fprintf(&b, "\n") } - _, _ = fmt.Fprintf(&b, " Source %d:\n", o.SourceId) + _, _ = fmt.Fprintf(&b, " Source %d:\n", o.SourceID) _, _ = fmt.Fprintf(&b, " Name: %q\n", o.Name) - _, _ = fmt.Fprintf(&b, " URL: %q\n", o.URL) + _, _ = fmt.Fprintf(&b, " URL: %q\n", o.URL.String) _, _ = fmt.Fprintf(&b, " Extra Tags:\n") for tag, value := range o.ExtraTags {