From 01cbeea574ba685f27e140773bc25dc0c640f929 Mon Sep 17 00:00:00 2001 From: "annie.black" Date: Wed, 20 Nov 2024 11:10:16 -0500 Subject: [PATCH 1/2] DEVPROD-12452 account for when the host was last provisioned when checking system failures --- model/event/host_event_finder.go | 17 +++++--- rest/route/host_agent.go | 2 +- rest/route/host_agent_test.go | 74 ++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 8 deletions(-) diff --git a/model/event/host_event_finder.go b/model/event/host_event_finder.go index 1f3dc36eb77..d291ed10d03 100644 --- a/model/event/host_event_finder.go +++ b/model/event/host_event_finder.go @@ -2,6 +2,7 @@ package event import ( "context" + "time" "github.com/evergreen-ci/evergreen" mgobson "github.com/evergreen-ci/evergreen/db/mgo/bson" @@ -21,13 +22,14 @@ type hostStatusDistro struct { func (s *hostStatusDistro) MarshalBSON() ([]byte, error) { return mgobson.Marshal(s) } func (s *hostStatusDistro) UnmarshalBSON(in []byte) error { return mgobson.Unmarshal(in, s) } -func getRecentStatusesForHost(ctx context.Context, hostId string, n int) (int, []string) { - or := ResourceTypeKeyIs(ResourceTypeHost) - or[TypeKey] = EventHostTaskFinished - or[ResourceIdKey] = hostId +func getRecentStatusesForHost(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) (int, []string) { + query := ResourceTypeKeyIs(ResourceTypeHost) + query[TypeKey] = EventHostTaskFinished + query[ResourceIdKey] = hostId + query[TimestampKey] = bson.M{"$gte": hostProvisionTime} pipeline := []bson.M{ - {"$match": or}, + {"$match": query}, {"$sort": bson.M{TimestampKey: -1}}, {"$limit": n}, {"$group": bson.M{ @@ -64,12 +66,13 @@ func getRecentStatusesForHost(ctx context.Context, hostId string, n int) (int, [ } // AllRecentHostEventsAreSystemFailed returns true if all recent host events are system failures, and false if any are not. 
-func AllRecentHostEventsAreSystemFailed(ctx context.Context, hostId string, n int) bool { +// Only takes into account events that occurred since the last time the task started running. +func AllRecentHostEventsAreSystemFailed(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) bool { if n == 0 { return false } - count, statuses := getRecentStatusesForHost(ctx, hostId, n) + count, statuses := getRecentStatusesForHost(ctx, hostId, hostProvisionTime, n) if count == 0 { return false } diff --git a/rest/route/host_agent.go b/rest/route/host_agent.go index 45e238a6442..ab6b8852e1d 100644 --- a/rest/route/host_agent.go +++ b/rest/route/host_agent.go @@ -1267,7 +1267,7 @@ func (h *hostAgentEndTask) Run(ctx context.Context) gimlet.Responder { // Disable hosts and prevent them from performing more work if they have // system failed many tasks in a row. - if event.AllRecentHostEventsAreSystemFailed(ctx, currentHost.Id, consecutiveSystemFailureThreshold) { + if event.AllRecentHostEventsAreSystemFailed(ctx, currentHost.Id, currentHost.ProvisionTime, consecutiveSystemFailureThreshold) { msg := fmt.Sprintf("host encountered %d consecutive system failures", consecutiveSystemFailureThreshold) grip.Error(message.WrapError(units.HandlePoisonedHost(ctx, h.env, currentHost, msg), message.Fields{ "message": "unable to disable poisoned host", diff --git a/rest/route/host_agent_test.go b/rest/route/host_agent_test.go index e8c85a07ee7..cc79ed854f6 100644 --- a/rest/route/host_agent_test.go +++ b/rest/route/host_agent_test.go @@ -901,6 +901,39 @@ func TestHostEndTask(t *testing.T) { require.NotZero(t, foundTask) require.Equal(t, evergreen.TaskSystemUnresponse, foundTask.GetDisplayStatus()) }, + "SkipQuarantiningRecentlyProvisionedStaticHostWithFailures": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) { + h, err := host.FindOneId(ctx, hostId) + require.NoError(t, err) + require.NotZero(t, h) + + for i := 0; i < 10; i++ { + 
event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemFailed) + } + require.NoError(t, host.UpdateOne(ctx, host.ById(hostId), bson.M{ + "$set": bson.M{ + host.ProviderKey: evergreen.ProviderNameStatic, + host.ProvisionTimeKey: time.Now(), // i.e. this host was re-provisioned after all of these failures + }, + })) + + details := &apimodels.TaskEndDetail{ + Status: evergreen.TaskFailed, + Type: evergreen.CommandTypeSystem, + } + handler.details = *details + resp := handler.Run(ctx) + require.NotNil(t, resp) + require.Equal(t, http.StatusOK, resp.Status()) + h, err = host.FindOneId(ctx, hostId) + require.NoError(t, err) + require.NotZero(t, h) + assert.NotEqual(t, evergreen.HostQuarantined, h.Status) + + foundTask, err := task.FindOneId(handler.taskID) + require.NoError(t, err) + require.NotZero(t, foundTask) + require.Equal(t, evergreen.TaskSystemFailed, foundTask.GetDisplayStatus()) + }, "DecommissionsDynamicHostWithRepeatedSystemFailedTasks": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) { h, err := host.FindOneId(ctx, hostId) require.NoError(t, err) @@ -995,6 +1028,45 @@ func TestHostEndTask(t *testing.T) { require.NotZero(t, h) assert.Equal(t, evergreen.HostDecommissioned, h.Status, "dynamic host should be decommissioned for consecutive system failed tasks") + foundTask, err := task.FindOneId(handler.taskID) + require.NoError(t, err) + require.NotZero(t, foundTask) + require.Equal(t, evergreen.TaskSystemUnresponse, foundTask.GetDisplayStatus()) + }, + "SkipDecommissioningRecentlyProvisionedDynamicHostWithFailures": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) { + h, err := host.FindOneId(ctx, hostId) + require.NoError(t, err) + require.NotZero(t, h) + for i := 0; i < 8; i++ { + event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemUnresponse) + } + + require.NoError(t, 
host.UpdateOne(ctx, host.ById(hostId), bson.M{ + "$set": bson.M{ + host.ProviderKey: evergreen.ProviderNameEc2Fleet, + host.ProvisionTimeKey: time.Now(), // i.e. this host was re-provisioned before two of these failures + }, + })) + + for i := 8; i < 10; i++ { + event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemUnresponse) + } + + details := &apimodels.TaskEndDetail{ + Status: evergreen.TaskFailed, + Type: evergreen.CommandTypeSystem, + TimedOut: true, + Description: evergreen.TaskDescriptionHeartbeat, + } + handler.details = *details + resp := handler.Run(ctx) + require.NotNil(t, resp) + require.Equal(t, http.StatusOK, resp.Status()) + h, err = host.FindOneId(ctx, hostId) + require.NoError(t, err) + require.NotZero(t, h) + assert.NotEqual(t, evergreen.HostDecommissioned, h.Status) + foundTask, err := task.FindOneId(handler.taskID) require.NoError(t, err) require.NotZero(t, foundTask) @@ -1040,6 +1112,7 @@ func TestHostEndTask(t *testing.T) { } require.NoError(t, task1.Insert()) + now := time.Now() sampleHost := host.Host{ Id: hostId, Distro: distro.Distro{ @@ -1051,6 +1124,7 @@ func TestHostEndTask(t *testing.T) { Status: evergreen.HostRunning, AgentRevision: evergreen.AgentVersion, LastTaskCompletedTime: time.Now().Add(-20 * time.Minute).Round(time.Second), + ProvisionTime: now.Add(-time.Hour), // provisioned before any of the events } require.NoError(t, sampleHost.Insert(ctx)) From 8189eb8c6eaeba1b3a529864080044bd4dff34ba Mon Sep 17 00:00:00 2001 From: "annie.black" Date: Mon, 25 Nov 2024 15:48:48 -0500 Subject: [PATCH 2/2] cr --- model/event/host_event_finder.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/model/event/host_event_finder.go b/model/event/host_event_finder.go index d291ed10d03..4f06e2e843e 100644 --- a/model/event/host_event_finder.go +++ b/model/event/host_event_finder.go @@ -22,7 +22,7 @@ type hostStatusDistro struct { func (s *hostStatusDistro) MarshalBSON() 
([]byte, error) { return mgobson.Marshal(s) } func (s *hostStatusDistro) UnmarshalBSON(in []byte) error { return mgobson.Unmarshal(in, s) } -func getRecentStatusesForHost(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) (int, []string) { +func getRecentFinishedStatusesForHost(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) (int, []string) { query := ResourceTypeKeyIs(ResourceTypeHost) query[TypeKey] = EventHostTaskFinished query[ResourceIdKey] = hostId @@ -66,13 +66,13 @@ func getRecentStatusesForHost(ctx context.Context, hostId string, hostProvisionT } // AllRecentHostEventsAreSystemFailed returns true if all recent host events are system failures, and false if any are not. -// Only takes into account events that occurred since the last time the task started running. +// Only takes into account task finished events that occurred at or after the given host provision time. func AllRecentHostEventsAreSystemFailed(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) bool { if n == 0 { return false } - count, statuses := getRecentStatusesForHost(ctx, hostId, hostProvisionTime, n) + count, statuses := getRecentFinishedStatusesForHost(ctx, hostId, hostProvisionTime, n) if count == 0 { return false }