diff --git a/metrics/health.go b/metrics/health.go index 7fac47766..a9cdcde2a 100644 --- a/metrics/health.go +++ b/metrics/health.go @@ -19,6 +19,7 @@ package metrics import ( + "fmt" "sync" "time" @@ -41,8 +42,8 @@ type ( } HealthStatus struct { - OverallStatus string `json:"status"` - ComponentStatus map[string]ComponentStatus `json:"components"` + OverallStatus string `json:"status"` + ComponentStatus map[HealthStatusComponent]ComponentStatus `json:"components"` } HealthStatusEnum int @@ -50,6 +51,7 @@ type ( HealthStatusComponent string ) +// HealthStatusEnum are stored as Prometheus values and internal struct const ( StatusCritical HealthStatusEnum = iota + 1 StatusWarning @@ -75,7 +77,7 @@ const ( ) var ( - healthStatus = sync.Map{} + healthStatus = sync.Map{} // In-memory map of component health status, key is HealthStatusComponent, value is componentStatusInternal PelicanHealthStatus = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "pelican_component_health_status", @@ -110,7 +112,7 @@ func (component HealthStatusComponent) String() string { // use only, please try to avoid setting this as your component status func SetComponentHealthStatus(name HealthStatusComponent, state HealthStatusEnum, msg string) { now := time.Now() - healthStatus.Store(name.String(), componentStatusInternal{state, msg, now}) + healthStatus.Store(name, componentStatusInternal{state, msg, now}) PelicanHealthStatus.With( prometheus.Labels{"component": name.String()}). @@ -121,7 +123,7 @@ func SetComponentHealthStatus(name HealthStatusComponent, state HealthStatusEnum } func DeleteComponentHealthStatus(name HealthStatusComponent) { - healthStatus.Delete(name.String()) + healthStatus.Delete(name) } func GetHealthStatus() HealthStatus { @@ -133,12 +135,12 @@ func GetHealthStatus() HealthStatus { if !ok { return true } - componentString, ok := component.(string) + componentString, ok := component.(HealthStatusComponent) if !ok { return true } if status.ComponentStatus == nil { - status.ComponentStatus = make(map[string]ComponentStatus) + status.ComponentStatus = make(map[HealthStatusComponent]ComponentStatus) } status.ComponentStatus[componentString] = ComponentStatus{ componentStatus.Status.String(), @@ -153,3 +155,17 @@ func GetHealthStatus() HealthStatus { status.OverallStatus = overallStatus.String() return status } + +// Get the current health status of a component. +// Status can be critical|warning|ok|unknown +func GetComponentStatus(comp HealthStatusComponent) (status string, err error) { + component, ok := healthStatus.Load(comp) + if !ok { + return "", fmt.Errorf("component %s does not exist", comp.String()) + } + statusInt, ok := component.(componentStatusInternal) + if !ok { + return "", fmt.Errorf("wrong format of component status for component %s", comp.String()) + } + return statusInt.Status.String(), nil +} diff --git a/server_utils/monitor.go b/server_utils/monitor.go index 81986733c..0e45f1a81 100644 --- a/server_utils/monitor.go +++ b/server_utils/monitor.go @@ -58,11 +58,17 @@ func LaunchPeriodicDirectorTimeout(ctx context.Context, egrp *errgroup.Group, nC for { select { case <-directorTimeoutTicker.C: - // Timer fired because no message was received in time. - log.Warningln("No director test report received within the time limit") - metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "No director test report received within the time limit") + // If origin can't contact the director, record the error without warning + status, err := metrics.GetComponentStatus(metrics.OriginCache_Federation) + if err == nil && status == "critical" { + metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "Failed to advertise to the director. Tests are not expected") + } else { + // Timer fired because no message was received in time. + log.Warningln("No director test report received within the time limit") + metrics.SetComponentHealthStatus(metrics.OriginCache_Director, metrics.StatusCritical, "No director test report received within the time limit") + } case <-nChan: - log.Debugln("Got notification from director") + log.Debugln("Received director report of health test result") directorTimeoutTicker.Reset(directorTimeoutDuration) case <-ctx.Done(): log.Infoln("Director health test timeout loop has been terminated")