diff --git a/jobs/loghost_alerts/spec b/jobs/loghost_alerts/spec index 2e9aae3..9e1adb3 100644 --- a/jobs/loghost_alerts/spec +++ b/jobs/loghost_alerts/spec @@ -13,6 +13,9 @@ properties: loghost_alerts.nologs.directors: description: "List of director origin to check in nolog alert" default: [] + loghost_alerts.dropped.evaluation_time: + description: "Dropped logs alerts evaluation time" + default: 15m loghost_alerts.security.enabled: decription: "Enable security alerts" default: false diff --git a/jobs/loghost_alerts/templates/loghost.alerts.yml b/jobs/loghost_alerts/templates/loghost.alerts.yml index 5e065f3..eb16d27 100644 --- a/jobs/loghost_alerts/templates/loghost.alerts.yml +++ b/jobs/loghost_alerts/templates/loghost.alerts.yml @@ -43,6 +43,22 @@ groups: Resolution: - contact Cloud Foundry administrator team + - alert: LoghostDroppedMessages + expr: | + increase(loghost_logservice_dropped[15m]) != 0 + for: <%= p('loghost_alerts.dropped.evaluation_time') %> + labels: + service: loghost + severity: warning + annotations: + summary: "Logs sent to `{{$labels.target}}` have been dropped in the last 15 minutes" + description: |- + Impact: + - some logs sent to `{{$labels.target}}` are being lost + + Possible causes: + - network issue between the components + - the target is not able to process the messages properly <% if p('loghost_alerts.security.enabled') %> - alert: SecurityTooManyAuthFailures diff --git a/manifests/operations/loghost-exporter-enable.yml b/manifests/operations/loghost-exporter-enable.yml index 62a7845..3c4fb12 100644 --- a/manifests/operations/loghost-exporter-enable.yml +++ b/manifests/operations/loghost-exporter-enable.yml @@ -65,6 +65,23 @@ domain: "{{.domain}}" user: "{{.user}}" status: "{{.status}}" + # This is intended to expose failing transactions to log services until + # https://github.com/cloudfoundry/loggregator-agent-release/issues/64 is resolved.
+ # + # The matched log line appears on diego-cells at: + # /var/vcap/sys/log/loggr-syslog-agent/loggr-syslog-agent.stderr.log + # and on the loghosts at: + # /var/vcap/store/loghost/*/cf/router*?.log + - name: loghost_logservice_dropped + type: counter + help: number of dropped envelopes + match: >- + ^%{NOTSPACE} %{NOTSPACE} loggr-syslog-agent\[rs2\] %{NOTSPACE} %{NOTSPACE} + failed to write to %{HOSTPORT:target}, retrying in [^,]+, err: + scope: local + labels: + <<: *labels + target: "{{.target}}" local-directors: - ((director_name)) remote-directors: []