Skip to content

Commit

Permalink
DEVPROD-12452 account for when the host was last provisioned when che…
Browse files Browse the repository at this point in the history
…cking system failures (#8491)
  • Loading branch information
ablack12 authored Nov 27, 2024
1 parent 35de9a1 commit 5f4700e
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 8 deletions.
17 changes: 10 additions & 7 deletions model/event/host_event_finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package event

import (
"context"
"time"

"github.com/evergreen-ci/evergreen"
mgobson "github.com/evergreen-ci/evergreen/db/mgo/bson"
Expand All @@ -21,13 +22,14 @@ type hostStatusDistro struct {
// MarshalBSON encodes the receiver as BSON by delegating to mgobson.Marshal.
func (s *hostStatusDistro) MarshalBSON() ([]byte, error) {
	return mgobson.Marshal(s)
}
// UnmarshalBSON decodes the BSON bytes in into the receiver by delegating to
// mgobson.Unmarshal.
func (s *hostStatusDistro) UnmarshalBSON(in []byte) error {
	return mgobson.Unmarshal(in, s)
}

func getRecentStatusesForHost(ctx context.Context, hostId string, n int) (int, []string) {
or := ResourceTypeKeyIs(ResourceTypeHost)
or[TypeKey] = EventHostTaskFinished
or[ResourceIdKey] = hostId
func getRecentFinishedStatusesForHost(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) (int, []string) {
query := ResourceTypeKeyIs(ResourceTypeHost)
query[TypeKey] = EventHostTaskFinished
query[ResourceIdKey] = hostId
query[TimestampKey] = bson.M{"$gte": hostProvisionTime}

pipeline := []bson.M{
{"$match": or},
{"$match": query},
{"$sort": bson.M{TimestampKey: -1}},
{"$limit": n},
{"$group": bson.M{
Expand Down Expand Up @@ -64,12 +66,13 @@ func getRecentStatusesForHost(ctx context.Context, hostId string, n int) (int, [
}

// AllRecentHostEventsAreSystemFailed returns true if all recent host events are system failures, and false if any are not.
func AllRecentHostEventsAreSystemFailed(ctx context.Context, hostId string, n int) bool {
// Only takes into account task finished events that occurred at or after the host's provision time (hostProvisionTime).
func AllRecentHostEventsAreSystemFailed(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) bool {
if n == 0 {
return false
}

count, statuses := getRecentStatusesForHost(ctx, hostId, n)
count, statuses := getRecentFinishedStatusesForHost(ctx, hostId, hostProvisionTime, n)
if count == 0 {
return false
}
Expand Down
2 changes: 1 addition & 1 deletion rest/route/host_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1265,7 +1265,7 @@ func (h *hostAgentEndTask) Run(ctx context.Context) gimlet.Responder {

// Disable hosts and prevent them from performing more work if they have
// system failed many tasks in a row.
if event.AllRecentHostEventsAreSystemFailed(ctx, currentHost.Id, consecutiveSystemFailureThreshold) {
if event.AllRecentHostEventsAreSystemFailed(ctx, currentHost.Id, currentHost.ProvisionTime, consecutiveSystemFailureThreshold) {
msg := fmt.Sprintf("host encountered %d consecutive system failures", consecutiveSystemFailureThreshold)
grip.Error(message.WrapError(units.HandlePoisonedHost(ctx, h.env, currentHost, msg), message.Fields{
"message": "unable to disable poisoned host",
Expand Down
74 changes: 74 additions & 0 deletions rest/route/host_agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,39 @@ func TestHostEndTask(t *testing.T) {
require.NotZero(t, foundTask)
require.Equal(t, evergreen.TaskSystemUnresponse, foundTask.GetDisplayStatus())
},
// Checks that a static host is not quarantined when its logged system-failed
// task events predate the provision time recorded on the host.
"SkipQuarantiningRecentlyProvisionedStaticHostWithFailures": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) {
h, err := host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)

// Log ten task-finished events with a system-failed status for this host.
for i := 0; i < 10; i++ {
event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemFailed)
}
// Mark the host as static and set its provision time to now, after the events above were logged.
require.NoError(t, host.UpdateOne(ctx, host.ById(hostId), bson.M{
"$set": bson.M{
host.ProviderKey: evergreen.ProviderNameStatic,
host.ProvisionTimeKey: time.Now(), // i.e. this host was re-provisioned after all of these failures
},
}))

// End the handler's task with a failed status of system command type.
details := &apimodels.TaskEndDetail{
Status: evergreen.TaskFailed,
Type: evergreen.CommandTypeSystem,
}
handler.details = *details
resp := handler.Run(ctx)
require.NotNil(t, resp)
require.Equal(t, http.StatusOK, resp.Status())
// Re-read the host and confirm it was not quarantined.
h, err = host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)
assert.NotEqual(t, evergreen.HostQuarantined, h.Status)

// The ended task should report a system-failed display status.
foundTask, err := task.FindOneId(handler.taskID)
require.NoError(t, err)
require.NotZero(t, foundTask)
require.Equal(t, evergreen.TaskSystemFailed, foundTask.GetDisplayStatus())
},
"DecommissionsDynamicHostWithRepeatedSystemFailedTasks": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) {
h, err := host.FindOneId(ctx, hostId)
require.NoError(t, err)
Expand Down Expand Up @@ -995,6 +1028,45 @@ func TestHostEndTask(t *testing.T) {
require.NotZero(t, h)
assert.Equal(t, evergreen.HostDecommissioned, h.Status, "dynamic host should be decommissioned for consecutive system failed tasks")

foundTask, err := task.FindOneId(handler.taskID)
require.NoError(t, err)
require.NotZero(t, foundTask)
require.Equal(t, evergreen.TaskSystemUnresponse, foundTask.GetDisplayStatus())
},
"SkipDecommissioningRecentlyProvisionedDynamicHostWithFailures": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) {
h, err := host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)
for i := 0; i < 8; i++ {
event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemUnresponse)
}

require.NoError(t, host.UpdateOne(ctx, host.ById(hostId), bson.M{
"$set": bson.M{
host.ProviderKey: evergreen.ProviderNameEc2Fleet,
host.ProvisionTimeKey: time.Now(), // i.e. this host was re-provisioned before two of these failures
},
}))

for i := 8; i < 10; i++ {
event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemUnresponse)
}

details := &apimodels.TaskEndDetail{
Status: evergreen.TaskFailed,
Type: evergreen.CommandTypeSystem,
TimedOut: true,
Description: evergreen.TaskDescriptionHeartbeat,
}
handler.details = *details
resp := handler.Run(ctx)
require.NotNil(t, resp)
require.Equal(t, http.StatusOK, resp.Status())
h, err = host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)
assert.NotEqual(t, evergreen.HostDecommissioned, h.Status)

foundTask, err := task.FindOneId(handler.taskID)
require.NoError(t, err)
require.NotZero(t, foundTask)
Expand Down Expand Up @@ -1040,6 +1112,7 @@ func TestHostEndTask(t *testing.T) {
}
require.NoError(t, task1.Insert())

now := time.Now()
sampleHost := host.Host{
Id: hostId,
Distro: distro.Distro{
Expand All @@ -1051,6 +1124,7 @@ func TestHostEndTask(t *testing.T) {
Status: evergreen.HostRunning,
AgentRevision: evergreen.AgentVersion,
LastTaskCompletedTime: time.Now().Add(-20 * time.Minute).Round(time.Second),
ProvisionTime: now.Add(-time.Hour), // provisioned before any of the events
}
require.NoError(t, sampleHost.Insert(ctx))

Expand Down

0 comments on commit 5f4700e

Please sign in to comment.