Skip to content

Commit

Permalink
DEVPROD-12482 Provide rank value breakdown stats (#8485)
Browse files Browse the repository at this point in the history
  • Loading branch information
hadjri authored Nov 27, 2024
1 parent d0a13db commit c7f9ffc
Show file tree
Hide file tree
Showing 11 changed files with 341 additions and 168 deletions.
119 changes: 109 additions & 10 deletions model/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ import (
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/trace"
"gonum.org/v1/gonum/graph"
"gonum.org/v1/gonum/graph/simple"
"gonum.org/v1/gonum/graph/topo"
Expand Down Expand Up @@ -91,16 +93,16 @@ type Task struct {
// Priority is a specifiable value that adds weight to the prioritization that task will be given in its
// corresponding distro task queue.
Priority int64 `bson:"priority" json:"priority"`
// PriorityRankValue is not persisted to the db, but stored in memory and passed to the task queue document.
// It is a mixture of Priority and the various task queue ranking factors that multiply on top of Priority.
PriorityRankValue int64 `bson:"priority_rank_value" json:"priority_rank_value"`
TaskGroup string `bson:"task_group" json:"task_group"`
TaskGroupMaxHosts int `bson:"task_group_max_hosts,omitempty" json:"task_group_max_hosts,omitempty"`
TaskGroupOrder int `bson:"task_group_order,omitempty" json:"task_group_order,omitempty"`
ResultsService string `bson:"results_service,omitempty" json:"results_service,omitempty"`
HasCedarResults bool `bson:"has_cedar_results,omitempty" json:"has_cedar_results,omitempty"`
ResultsFailed bool `bson:"results_failed,omitempty" json:"results_failed,omitempty"`
MustHaveResults bool `bson:"must_have_results,omitempty" json:"must_have_results,omitempty"`
// SortingValueBreakdown is not persisted to the db, but stored in memory and passed to the task queue document.
// It contains information on what factors led to the overall queue ranking value for the task.
SortingValueBreakdown SortingValueBreakdown `bson:"-" json:"sorting_value_breakdown"`
TaskGroup string `bson:"task_group" json:"task_group"`
TaskGroupMaxHosts int `bson:"task_group_max_hosts,omitempty" json:"task_group_max_hosts,omitempty"`
TaskGroupOrder int `bson:"task_group_order,omitempty" json:"task_group_order,omitempty"`
ResultsService string `bson:"results_service,omitempty" json:"results_service,omitempty"`
HasCedarResults bool `bson:"has_cedar_results,omitempty" json:"has_cedar_results,omitempty"`
ResultsFailed bool `bson:"results_failed,omitempty" json:"results_failed,omitempty"`
MustHaveResults bool `bson:"must_have_results,omitempty" json:"must_have_results,omitempty"`
// only relevant if the task is running. the time of the last heartbeat
// sent back by the agent
LastHeartbeat time.Time `bson:"last_heartbeat" json:"last_heartbeat"`
Expand Down Expand Up @@ -4037,3 +4039,100 @@ func (t *Task) FindAbortingAndResettingDependencies() ([]Task, error) {
})
return FindAll(q)
}

// SortingValueBreakdown is the full breakdown of the final value used to sort on in the queue,
// with accompanying breakdowns of priority and rank value.
type SortingValueBreakdown struct {
TaskGroupLength int64
TotalValue int64
PriorityBreakdown PriorityBreakdown
RankValueBreakdown RankValueBreakdown
}

// PriorityBreakdown contains information on how much various factors impacted the custom
// priority value that is used in the queue sorting value equation.
type PriorityBreakdown struct {
// InitialPriorityImpact represents how much of the total priority can be attributed to the
// original priority set on a task.
InitialPriorityImpact int64
// TaskGroupImpact represents how much of the total priority can be attributed to a
// task being in a task group.
TaskGroupImpact int64
// GeneratorTaskImpact represents how much of the total priority can be attributed to a
// task having a generate.tasks command in it.
GeneratorTaskImpact int64
// CommitQueueImpact represents how much of the total priority can be attributed to a task
// being in the commit queue.
CommitQueueImpact int64
}

// RankValueBreakdown contains information on how much various factors impacted the custom
// rank value that is used in the queue sorting value equation.
type RankValueBreakdown struct {
// CommitQueueImpact represents how much of the total rank value can be attributed to a task
// being in the commit queue.
CommitQueueImpact int64
// NumDependentsImpact represents how much of the total rank value can be attributed to the number
// of tasks that are dependents of a task.
NumDependentsImpact int64
// EstimatedRuntimeImpact represents how much of the total rank value can be attributed to the
// estimated runtime of a task.
EstimatedRuntimeImpact int64
// MainlineWaitTimeImpact represents how much of the total rank value can be attributed to
// how long a mainline task has been waiting for dispatch.
MainlineWaitTimeImpact int64
// StepbackImpact represents how much of the total rank value can be attributed to
// a task being activated by the stepback process.
StepbackImpact int64
// PatchImpact represents how much of the total rank value can be attributed to a task
// being part of a patch.
PatchImpact int64
// PatchWaitTimeImpact represents how much of the total rank value can be attributed to
// how long a patch task has been waiting for dispatch.
PatchWaitTimeImpact int64
}

const (
priorityBreakdownAttributePrefix = "evergreen.priority_breakdown"
rankBreakdownAttributePrefix = "evergreen.rank_breakdown"
priorityScaledRankAttribute = "evergreen.priority_scaled_rank"
)

// SetSortingValueBreakdownAttributes saves a full breakdown which compartmentalizes each factor that played a role in computing the
// overall value used to sort it in the queue, and creates a honeycomb trace with this data to enable dashboards/analysis.
func (t *Task) SetSortingValueBreakdownAttributes(ctx context.Context, breakdown SortingValueBreakdown) {
_, span := tracer.Start(ctx, "queue-factor-breakdown", trace.WithNewRoot())
defer span.End()
span.SetAttributes(
attribute.String(evergreen.DistroIDOtelAttribute, t.DistroId),
attribute.String(evergreen.TaskIDOtelAttribute, t.Id),
attribute.Int64(priorityScaledRankAttribute, breakdown.TotalValue),
// Priority values
attribute.Int64(fmt.Sprintf("%s.base_priority", priorityBreakdownAttributePrefix), breakdown.PriorityBreakdown.InitialPriorityImpact),
attribute.Int64(fmt.Sprintf("%s.task_group", priorityBreakdownAttributePrefix), breakdown.PriorityBreakdown.TaskGroupImpact),
attribute.Int64(fmt.Sprintf("%s.generate_task", priorityBreakdownAttributePrefix), breakdown.PriorityBreakdown.GeneratorTaskImpact),
attribute.Int64(fmt.Sprintf("%s.commit_queue", priorityBreakdownAttributePrefix), breakdown.PriorityBreakdown.CommitQueueImpact),
// Rank values
attribute.Int64(fmt.Sprintf("%s.commit_queue", rankBreakdownAttributePrefix), breakdown.RankValueBreakdown.CommitQueueImpact),
attribute.Int64(fmt.Sprintf("%s.patch", rankBreakdownAttributePrefix), breakdown.RankValueBreakdown.PatchImpact),
attribute.Int64(fmt.Sprintf("%s.patch_wait_time", rankBreakdownAttributePrefix), breakdown.RankValueBreakdown.PatchWaitTimeImpact),
attribute.Int64(fmt.Sprintf("%s.mainline_wait_time", rankBreakdownAttributePrefix), breakdown.RankValueBreakdown.MainlineWaitTimeImpact),
attribute.Int64(fmt.Sprintf("%s.stepback", rankBreakdownAttributePrefix), breakdown.RankValueBreakdown.StepbackImpact),
attribute.Int64(fmt.Sprintf("%s.num_dependents", rankBreakdownAttributePrefix), breakdown.RankValueBreakdown.NumDependentsImpact),
attribute.Int64(fmt.Sprintf("%s.estimated_runtime", rankBreakdownAttributePrefix), breakdown.RankValueBreakdown.EstimatedRuntimeImpact),
// Priority percentage values
attribute.Float64(fmt.Sprintf("%s.base_priority_pct", priorityBreakdownAttributePrefix), float64(breakdown.PriorityBreakdown.InitialPriorityImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.task_group_pct", priorityBreakdownAttributePrefix), float64(breakdown.PriorityBreakdown.TaskGroupImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.generate_task_pct", priorityBreakdownAttributePrefix), float64(breakdown.PriorityBreakdown.GeneratorTaskImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.commit_queue_pct", priorityBreakdownAttributePrefix), float64(breakdown.PriorityBreakdown.CommitQueueImpact/breakdown.TotalValue*100)),
// Rank value percentage values
attribute.Float64(fmt.Sprintf("%s.patch_pct", rankBreakdownAttributePrefix), float64(breakdown.RankValueBreakdown.PatchImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.patch_wait_time_pct", rankBreakdownAttributePrefix), float64(breakdown.RankValueBreakdown.PatchWaitTimeImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.commit_queue_pct", rankBreakdownAttributePrefix), float64(breakdown.RankValueBreakdown.CommitQueueImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.mainline_wait_time_pct", rankBreakdownAttributePrefix), float64(breakdown.RankValueBreakdown.MainlineWaitTimeImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.stepback_pct", rankBreakdownAttributePrefix), float64(breakdown.RankValueBreakdown.StepbackImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.num_dependents_pct", rankBreakdownAttributePrefix), float64(breakdown.RankValueBreakdown.NumDependentsImpact/breakdown.TotalValue*100)),
attribute.Float64(fmt.Sprintf("%s.estimated_runtime_pct", rankBreakdownAttributePrefix), float64(breakdown.RankValueBreakdown.EstimatedRuntimeImpact/breakdown.TotalValue*100)),
)
t.SortingValueBreakdown = breakdown
}
37 changes: 19 additions & 18 deletions model/task_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"time"

"github.com/evergreen-ci/evergreen/db"
"github.com/evergreen-ci/evergreen/model/task"
"github.com/mongodb/anser/bsonutil"
adb "github.com/mongodb/anser/db"
"github.com/mongodb/grip"
Expand Down Expand Up @@ -99,24 +100,24 @@ type TaskDep struct {
}

type TaskQueueItem struct {
Id string `bson:"_id" json:"_id"`
IsDispatched bool `bson:"dispatched" json:"dispatched"`
DisplayName string `bson:"display_name" json:"display_name"`
Group string `bson:"group_name" json:"group_name"`
GroupMaxHosts int `bson:"group_max_hosts,omitempty" json:"group_max_hosts,omitempty"`
GroupIndex int `bson:"group_index,omitempty" json:"group_index,omitempty"`
Version string `bson:"version" json:"version"`
BuildVariant string `bson:"build_variant" json:"build_variant"`
RevisionOrderNumber int `bson:"order" json:"order"`
Requester string `bson:"requester" json:"requester"`
Revision string `bson:"gitspec" json:"gitspec"`
Project string `bson:"project" json:"project"`
ExpectedDuration time.Duration `bson:"exp_dur" json:"exp_dur"`
Priority int64 `bson:"priority" json:"priority"`
PriorityRankValue int64 `bson:"priority_rank_value" json:"priority_rank_value"`
Dependencies []string `bson:"dependencies" json:"dependencies"`
DependenciesMet bool `bson:"dependencies_met" json:"dependencies_met"`
ActivatedBy string `bson:"activated_by" json:"activated_by"`
Id string `bson:"_id" json:"_id"`
IsDispatched bool `bson:"dispatched" json:"dispatched"`
DisplayName string `bson:"display_name" json:"display_name"`
Group string `bson:"group_name" json:"group_name"`
GroupMaxHosts int `bson:"group_max_hosts,omitempty" json:"group_max_hosts,omitempty"`
GroupIndex int `bson:"group_index,omitempty" json:"group_index,omitempty"`
Version string `bson:"version" json:"version"`
BuildVariant string `bson:"build_variant" json:"build_variant"`
RevisionOrderNumber int `bson:"order" json:"order"`
Requester string `bson:"requester" json:"requester"`
Revision string `bson:"gitspec" json:"gitspec"`
Project string `bson:"project" json:"project"`
ExpectedDuration time.Duration `bson:"exp_dur" json:"exp_dur"`
Priority int64 `bson:"priority" json:"priority"`
SortingValueBreakdown task.SortingValueBreakdown `bson:"sorting_value_breakdown" json:"sorting_value_breakdown"`
Dependencies []string `bson:"dependencies" json:"dependencies"`
DependenciesMet bool `bson:"dependencies_met" json:"dependencies_met"`
ActivatedBy string `bson:"activated_by" json:"activated_by"`
}

// must not no-lint these values
Expand Down
34 changes: 17 additions & 17 deletions model/task_queue_service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1962,23 +1962,23 @@ func (s *taskDAGDispatchServiceSuite) refreshTaskQueue(service *basicCachedDAGDi
dependencies = append(dependencies, d.TaskId)
}
taskQueue = append(taskQueue, TaskQueueItem{
Id: t.Id,
DisplayName: t.DisplayName,
BuildVariant: t.BuildVariant,
RevisionOrderNumber: t.RevisionOrderNumber,
Requester: t.Requester,
Revision: t.Revision,
Project: t.Project,
ExpectedDuration: t.ExpectedDuration,
Priority: t.Priority,
PriorityRankValue: t.PriorityRankValue,
Group: t.TaskGroup,
GroupMaxHosts: t.TaskGroupMaxHosts,
GroupIndex: t.TaskGroupOrder,
Version: t.Version,
ActivatedBy: t.ActivatedBy,
Dependencies: dependencies,
DependenciesMet: t.HasDependenciesMet(),
Id: t.Id,
DisplayName: t.DisplayName,
BuildVariant: t.BuildVariant,
RevisionOrderNumber: t.RevisionOrderNumber,
Requester: t.Requester,
Revision: t.Revision,
Project: t.Project,
ExpectedDuration: t.ExpectedDuration,
Priority: t.Priority,
SortingValueBreakdown: t.SortingValueBreakdown,
Group: t.TaskGroup,
GroupMaxHosts: t.TaskGroupMaxHosts,
GroupIndex: t.TaskGroupOrder,
Version: t.Version,
ActivatedBy: t.ActivatedBy,
Dependencies: dependencies,
DependenciesMet: t.HasDependenciesMet(),
})
}
err = service.rebuild(taskQueue)
Expand Down
2 changes: 1 addition & 1 deletion rest/data/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func CompareTasks(ctx context.Context, taskIds []string, useLegacy bool) ([]stri
return nil, nil, errors.Errorf("distro '%s' not found", distroId)
}
taskPlan := scheduler.PrepareTasksForPlanning(d, tasks)
tasks = taskPlan.Export()
tasks = taskPlan.Export(ctx)
}
prioritizedIds := []string{}
for _, t := range tasks {
Expand Down
12 changes: 8 additions & 4 deletions scheduler/distro_alias_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package scheduler

import (
"context"
"testing"

"github.com/evergreen-ci/evergreen"
Expand Down Expand Up @@ -41,6 +42,9 @@ func TestDistroAliases(t *testing.T) {
require.NoError(t, db.Clear(model.VersionCollection))
require.NoError(t, (&model.Version{Id: "foo"}).Insert())

ctx, cancel := context.WithCancel(context.Background())
defer cancel()

t.Run("VerifyPrimaryQueue", func(t *testing.T) {
distroOne := &distro.Distro{
Id: "one",
Expand All @@ -50,7 +54,7 @@ func TestDistroAliases(t *testing.T) {
require.NoError(t, db.Clear(model.TaskQueuesCollection))

distroOne.PlannerSettings.Version = evergreen.PlannerVersionTunable
output, err := PrioritizeTasks(distroOne, tasks, TaskPlannerOptions{ID: "tunable-0"})
output, err := PrioritizeTasks(ctx, distroOne, tasks, TaskPlannerOptions{ID: "tunable-0"})
require.NoError(t, err)
require.Len(t, output, 2)
require.Equal(t, "one", output[0].Id)
Expand All @@ -68,7 +72,7 @@ func TestDistroAliases(t *testing.T) {
require.NoError(t, db.Clear(model.TaskQueuesCollection))

distroOne.PlannerSettings.Version = evergreen.PlannerVersionLegacy
output, err := PrioritizeTasks(distroOne, tasks, TaskPlannerOptions{ID: "legacy-1"})
output, err := PrioritizeTasks(ctx, distroOne, tasks, TaskPlannerOptions{ID: "legacy-1"})
require.NoError(t, err)
require.Len(t, output, 2)
require.Equal(t, "one", output[0].Id)
Expand Down Expand Up @@ -96,7 +100,7 @@ func TestDistroAliases(t *testing.T) {
require.NoError(t, db.Clear(model.TaskSecondaryQueuesCollection))

distroTwo.PlannerSettings.Version = evergreen.PlannerVersionTunable
output, err := PrioritizeTasks(distroTwo, tasks, TaskPlannerOptions{ID: "tunable-0", IsSecondaryQueue: true})
output, err := PrioritizeTasks(ctx, distroTwo, tasks, TaskPlannerOptions{ID: "tunable-0", IsSecondaryQueue: true})
require.NoError(t, err)
require.Len(t, output, 2)
require.Equal(t, "one", output[0].Id)
Expand All @@ -115,7 +119,7 @@ func TestDistroAliases(t *testing.T) {
require.NoError(t, db.Clear(model.TaskSecondaryQueuesCollection))

distroTwo.PlannerSettings.Version = evergreen.PlannerVersionLegacy
output, err := PrioritizeTasks(distroTwo, tasks, TaskPlannerOptions{ID: "legacy-0", IsSecondaryQueue: true})
output, err := PrioritizeTasks(ctx, distroTwo, tasks, TaskPlannerOptions{ID: "legacy-0", IsSecondaryQueue: true})
require.NoError(t, err)
require.Len(t, output, 2)
require.Equal(t, "one", output[0].Id)
Expand Down
Loading

0 comments on commit c7f9ffc

Please sign in to comment.