From c537fcba7b82962c65642a3596d7bd5e07ddf5bd Mon Sep 17 00:00:00 2001 From: Haoming Meng Date: Mon, 25 Mar 2024 21:09:20 +0000 Subject: [PATCH 1/2] Conditional reporting warning of director test timeout --- metrics/health.go | 22 +++++++++++++++++++--- origin_ui/origin_api.go | 11 ++++++++--- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/metrics/health.go b/metrics/health.go index 7fac47766..420536452 100644 --- a/metrics/health.go +++ b/metrics/health.go @@ -19,6 +19,7 @@ package metrics import ( + "fmt" "sync" "time" @@ -50,6 +51,7 @@ type ( HealthStatusComponent string ) +// HealthStatusEnum are stored as Prometheus values and internal struct const ( StatusCritical HealthStatusEnum = iota + 1 StatusWarning @@ -75,7 +77,7 @@ const ( ) var ( - healthStatus = sync.Map{} + healthStatus = sync.Map{} // In-memory map of component health status, key is HealthStatusComponent, value is componentStatusInternal PelicanHealthStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "pelican_component_health_status", @@ -110,7 +112,7 @@ func (component HealthStatusComponent) String() string { // use only, please try to avoid setting this as your component status func SetComponentHealthStatus(name HealthStatusComponent, state HealthStatusEnum, msg string) { now := time.Now() - healthStatus.Store(name.String(), componentStatusInternal{state, msg, now}) + healthStatus.Store(name, componentStatusInternal{state, msg, now}) PelicanHealthStatus.With( prometheus.Labels{"component": name.String()}). @@ -121,7 +123,7 @@ func SetComponentHealthStatus(name HealthStatusComponent, state HealthStatusEnum } func DeleteComponentHealthStatus(name HealthStatusComponent) { - healthStatus.Delete(name.String()) + healthStatus.Delete(name) } func GetHealthStatus() HealthStatus { @@ -153,3 +155,17 @@ func GetHealthStatus() HealthStatus { status.OverallStatus = overallStatus.String() return status } + +// Get the current health status of a component. +// Status can be critical|warning|ok|unknown +func GetComponentStatus(comp HealthStatusComponent) (status string, err error) { + component, ok := healthStatus.Load(comp) + if !ok { + return "", fmt.Errorf("component %s does not exist", comp.String()) + } + statusInt, ok := component.(componentStatusInternal) + if !ok { + return "", fmt.Errorf("wrong format of component status for component %s", comp.String()) + } + return statusInt.Status.String(), nil +} diff --git a/origin_ui/origin_api.go b/origin_ui/origin_api.go index 4bdccf84b..e9d279a46 100644 --- a/origin_ui/origin_api.go +++ b/origin_ui/origin_api.go @@ -73,10 +73,15 @@ func LaunchPeriodicDirectorTimeout(ctx context.Context, egrp *errgroup.Group) { select { case <-directorTimeoutTicker.C: // Timer fired because no message was received in time. - log.Warningln("No director test report received within the time limit") - metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "No director test report received within the time limit") + status, err := metrics.GetComponentStatus(metrics.OriginCache_Federation) + if err == nil && status == "critical" { + metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "Failed to advertise to the director. Tests are not expected") + } else { + log.Warningln("No director test report received within the time limit") + metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "No director test report received within the time limit") + } case <-nChan: - log.Debugln("Got notification from director") + log.Debugln("Received director report of health test result") directorTimeoutTicker.Reset(directorTimeoutDuration) case <-ctx.Done(): log.Infoln("Director health test timeout loop has been terminated") From 7b4c37e4b55289a710e1d4ddb0bde0c6785a7898 Mon Sep 17 00:00:00 2001 From: Haoming Meng Date: Tue, 26 Mar 2024 15:21:37 +0000 Subject: [PATCH 2/2] Fix wrong type for map key --- metrics/health.go | 8 ++++---- origin_ui/origin_api.go | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/metrics/health.go b/metrics/health.go index 420536452..a9cdcde2a 100644 --- a/metrics/health.go +++ b/metrics/health.go @@ -42,8 +42,8 @@ type ( } HealthStatus struct { - OverallStatus string `json:"status"` - ComponentStatus map[string]ComponentStatus `json:"components"` + OverallStatus string `json:"status"` + ComponentStatus map[HealthStatusComponent]ComponentStatus `json:"components"` } HealthStatusEnum int @@ -135,12 +135,12 @@ func GetHealthStatus() HealthStatus { if !ok { return true } - componentString, ok := component.(string) + componentString, ok := component.(HealthStatusComponent) if !ok { return true } if status.ComponentStatus == nil { - status.ComponentStatus = make(map[string]ComponentStatus) + status.ComponentStatus = make(map[HealthStatusComponent]ComponentStatus) } status.ComponentStatus[componentString] = ComponentStatus{ componentStatus.Status.String(), diff --git a/origin_ui/origin_api.go b/origin_ui/origin_api.go index e9d279a46..249187b3e 100644 --- a/origin_ui/origin_api.go +++ b/origin_ui/origin_api.go @@ -72,11 +72,12 @@ func LaunchPeriodicDirectorTimeout(ctx context.Context, egrp *errgroup.Group) { for { select { case <-directorTimeoutTicker.C: - // Timer fired because no message was received in time. + // If origin can't contact the director, record the error without warning status, err := metrics.GetComponentStatus(metrics.OriginCache_Federation) if err == nil && status == "critical" { metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "Failed to advertise to the director. Tests are not expected") } else { + // Timer fired because no message was received in time. log.Warningln("No director test report received within the time limit") metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "No director test report received within the time limit") }