diff --git a/internal/scheduler/metrics/state_metrics.go b/internal/scheduler/metrics/state_metrics.go index 3e25deb892b..fcb54ea8a5f 100644 --- a/internal/scheduler/metrics/state_metrics.go +++ b/internal/scheduler/metrics/state_metrics.go @@ -90,7 +90,7 @@ func newJobStateMetrics(errorRegexes []*regexp.Regexp, trackedResourceNames []v1 ) jobErrorsByNode := prometheus.NewCounterVec( prometheus.CounterOpts{ - Name: prefix + "error_classification_by_node", + Name: prefix + "job_error_classification_by_node", Help: "Failed jobs by error classification at the node level", }, []string{nodeLabel, poolLabel, clusterLabel, errorCategoryLabel, errorSubcategoryLabel}, @@ -188,7 +188,8 @@ func (m *jobStateMetrics) ReportStateTransitions( m.completedRunDurations.WithLabelValues(job.Queue(), run.Pool()).Observe(duration) jobRunError := jobRunErrorsByRunId[run.Id()] category, subCategory := m.failedCategoryAndSubCategoryFromJob(jobRunError) - m.jobErrorsByQueue.WithLabelValues(job.Queue(), run.Executor(), category, subCategory).Inc() + m.jobErrorsByQueue.WithLabelValues(job.Queue(), run.Pool(), category, subCategory).Inc() + m.jobErrorsByNode.WithLabelValues(run.NodeName(), run.Pool(), run.Executor(), category, subCategory).Inc() } if jst.Succeeded { duration, priorState := stateDuration(job, run, run.TerminatedTime()) diff --git a/internal/scheduler/metrics/state_metrics_test.go b/internal/scheduler/metrics/state_metrics_test.go index aed4d510892..820d1fe41fe 100644 --- a/internal/scheduler/metrics/state_metrics_test.go +++ b/internal/scheduler/metrics/state_metrics_test.go @@ -10,6 +10,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" "github.com/armadaproject/armada/internal/scheduler/jobdb" @@ -343,6 +344,45 @@ func TestReportJobStateTransitions(t *testing.T) { } } +func TestCategoriseErrors(t *testing.T) 
{ + run := baseRun. + WithExecutor(testCluster). + WithNodeName(testNode). + WithPool(testPool) + + job := baseJob.WithUpdatedRun(run) + + r, err := regexp.Compile("generic pod error") + require.NoError(t, err) + + jobRunErrorsByRunId := map[string]*armadaevents.Error{ + run.Id(): { + Terminal: true, + Reason: &armadaevents.Error_PodError{ + PodError: &armadaevents.PodError{ + Message: "generic pod error", + }, + }, + }, + } + + jsts := []jobdb.JobStateTransitions{ + { + Job: job, + Failed: true, + }, + } + + metrics := newJobStateMetrics([]*regexp.Regexp{r}, []v1.ResourceName{"cpu"}, 12*time.Hour) + metrics.ReportStateTransitions(jsts, jobRunErrorsByRunId) + + actualjobErrorsByQueue := testutil.ToFloat64(metrics.jobErrorsByQueue.WithLabelValues(testQueue, testPool, "podError", "generic pod error")) + assert.InDelta(t, 1, actualjobErrorsByQueue, epsilon) + + actualjobErrorsByNode := testutil.ToFloat64(metrics.jobErrorsByNode.WithLabelValues(testNode, testPool, testCluster, "podError", "generic pod error")) + assert.InDelta(t, 1, actualjobErrorsByNode, epsilon) +} + func TestReset(t *testing.T) { byQueueLabels := []string{testQueue, testPool, "running", "pending"} byNodeLabels := []string{testNode, testPool, testCluster, "running", "pending"}