Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEVPROD-12452 account for when the host was last provisioned when checking system failures #8491

Merged
merged 3 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 10 additions & 7 deletions model/event/host_event_finder.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package event

import (
"context"
"time"

"github.com/evergreen-ci/evergreen"
mgobson "github.com/evergreen-ci/evergreen/db/mgo/bson"
Expand All @@ -21,13 +22,14 @@ type hostStatusDistro struct {
func (s *hostStatusDistro) MarshalBSON() ([]byte, error) { return mgobson.Marshal(s) }
func (s *hostStatusDistro) UnmarshalBSON(in []byte) error { return mgobson.Unmarshal(in, s) }

func getRecentStatusesForHost(ctx context.Context, hostId string, n int) (int, []string) {
or := ResourceTypeKeyIs(ResourceTypeHost)
or[TypeKey] = EventHostTaskFinished
or[ResourceIdKey] = hostId
func getRecentStatusesForHost(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) (int, []string) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: can we rename this to `getRecentTaskFinishedStatusesForHost` or something similar, since this function ignores other host event logs?

query := ResourceTypeKeyIs(ResourceTypeHost)
query[TypeKey] = EventHostTaskFinished
query[ResourceIdKey] = hostId
query[TimestampKey] = bson.M{"$gte": hostProvisionTime}

pipeline := []bson.M{
{"$match": or},
{"$match": query},
{"$sort": bson.M{TimestampKey: -1}},
{"$limit": n},
{"$group": bson.M{
Expand Down Expand Up @@ -64,12 +66,13 @@ func getRecentStatusesForHost(ctx context.Context, hostId string, n int) (int, [
}

// AllRecentHostEventsAreSystemFailed returns true if all recent host events are system failures, and false if any are not.
func AllRecentHostEventsAreSystemFailed(ctx context.Context, hostId string, n int) bool {
// Only takes into account task finished events that occurred since the host was last provisioned.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: task finished events

func AllRecentHostEventsAreSystemFailed(ctx context.Context, hostId string, hostProvisionTime time.Time, n int) bool {
if n == 0 {
return false
}

count, statuses := getRecentStatusesForHost(ctx, hostId, n)
count, statuses := getRecentStatusesForHost(ctx, hostId, hostProvisionTime, n)
if count == 0 {
return false
}
Expand Down
2 changes: 1 addition & 1 deletion rest/route/host_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1267,7 +1267,7 @@ func (h *hostAgentEndTask) Run(ctx context.Context) gimlet.Responder {

// Disable hosts and prevent them from performing more work if they have
// system failed many tasks in a row.
if event.AllRecentHostEventsAreSystemFailed(ctx, currentHost.Id, consecutiveSystemFailureThreshold) {
if event.AllRecentHostEventsAreSystemFailed(ctx, currentHost.Id, currentHost.ProvisionTime, consecutiveSystemFailureThreshold) {
msg := fmt.Sprintf("host encountered %d consecutive system failures", consecutiveSystemFailureThreshold)
grip.Error(message.WrapError(units.HandlePoisonedHost(ctx, h.env, currentHost, msg), message.Fields{
"message": "unable to disable poisoned host",
Expand Down
74 changes: 74 additions & 0 deletions rest/route/host_agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,39 @@ func TestHostEndTask(t *testing.T) {
require.NotZero(t, foundTask)
require.Equal(t, evergreen.TaskSystemUnresponse, foundTask.GetDisplayStatus())
},
// Verifies that a static host is not quarantined when its system-failed
// task finished events were all logged before its provision time.
"SkipQuarantiningRecentlyProvisionedStaticHostWithFailures": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) {
h, err := host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)

// Log ten system-failed task finished events for the host.
for i := 0; i < 10; i++ {
event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemFailed)
}
// Mark the host as static and set its provision time to now, after the
// events logged above.
require.NoError(t, host.UpdateOne(ctx, host.ById(hostId), bson.M{
"$set": bson.M{
host.ProviderKey: evergreen.ProviderNameStatic,
host.ProvisionTimeKey: time.Now(), // i.e. this host was re-provisioned after all of these failures
},
}))

// End the current task as a failed task of system command type.
details := &apimodels.TaskEndDetail{
Status: evergreen.TaskFailed,
Type: evergreen.CommandTypeSystem,
}
handler.details = *details
resp := handler.Run(ctx)
require.NotNil(t, resp)
require.Equal(t, http.StatusOK, resp.Status())
// Re-fetch the host and check that it was not quarantined.
h, err = host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)
assert.NotEqual(t, evergreen.HostQuarantined, h.Status)

// The task itself should still display as system-failed.
foundTask, err := task.FindOneId(handler.taskID)
require.NoError(t, err)
require.NotZero(t, foundTask)
require.Equal(t, evergreen.TaskSystemFailed, foundTask.GetDisplayStatus())
},
"DecommissionsDynamicHostWithRepeatedSystemFailedTasks": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) {
h, err := host.FindOneId(ctx, hostId)
require.NoError(t, err)
Expand Down Expand Up @@ -995,6 +1028,45 @@ func TestHostEndTask(t *testing.T) {
require.NotZero(t, h)
assert.Equal(t, evergreen.HostDecommissioned, h.Status, "dynamic host should be decommissioned for consecutive system failed tasks")

foundTask, err := task.FindOneId(handler.taskID)
require.NoError(t, err)
require.NotZero(t, foundTask)
require.Equal(t, evergreen.TaskSystemUnresponse, foundTask.GetDisplayStatus())
},
"SkipDecommissioningRecentlyProvisionedDynamicHostWithFailures": func(ctx context.Context, t *testing.T, handler *hostAgentEndTask, env *mock.Environment) {
h, err := host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)
for i := 0; i < 8; i++ {
event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemUnresponse)
}

require.NoError(t, host.UpdateOne(ctx, host.ById(hostId), bson.M{
"$set": bson.M{
host.ProviderKey: evergreen.ProviderNameEc2Fleet,
host.ProvisionTimeKey: time.Now(), // i.e. this host was re-provisioned before two of these failures
},
}))

for i := 8; i < 10; i++ {
event.LogHostTaskFinished(fmt.Sprintf("some-system-failed-task-%d", i), 0, hostId, evergreen.TaskSystemUnresponse)
}

details := &apimodels.TaskEndDetail{
Status: evergreen.TaskFailed,
Type: evergreen.CommandTypeSystem,
TimedOut: true,
Description: evergreen.TaskDescriptionHeartbeat,
}
handler.details = *details
resp := handler.Run(ctx)
require.NotNil(t, resp)
require.Equal(t, http.StatusOK, resp.Status())
h, err = host.FindOneId(ctx, hostId)
require.NoError(t, err)
require.NotZero(t, h)
assert.NotEqual(t, evergreen.HostDecommissioned, h.Status)

foundTask, err := task.FindOneId(handler.taskID)
require.NoError(t, err)
require.NotZero(t, foundTask)
Expand Down Expand Up @@ -1040,6 +1112,7 @@ func TestHostEndTask(t *testing.T) {
}
require.NoError(t, task1.Insert())

now := time.Now()
sampleHost := host.Host{
Id: hostId,
Distro: distro.Distro{
Expand All @@ -1051,6 +1124,7 @@ func TestHostEndTask(t *testing.T) {
Status: evergreen.HostRunning,
AgentRevision: evergreen.AgentVersion,
LastTaskCompletedTime: time.Now().Add(-20 * time.Minute).Round(time.Second),
ProvisionTime: now.Add(-time.Hour), // provisioned before any of the events
}
require.NoError(t, sampleHost.Insert(ctx))

Expand Down
Loading