diff --git a/jobs/loghost_alerts/spec b/jobs/loghost_alerts/spec
index 0a0e2de..2e9aae3 100644
--- a/jobs/loghost_alerts/spec
+++ b/jobs/loghost_alerts/spec
@@ -10,7 +10,9 @@ properties:
   loghost_alerts.nologs.evaluation_time:
     description: "No received logs alerts evaluation time"
     default: 15m
-
+  loghost_alerts.nologs.directors:
+    description: "List of director names expected to send logs (checked by the no-logs alerts)"
+    default: []
   loghost_alerts.security.enabled:
-    decription: "Enable security alerts"
+    description: "Enable security alerts"
     default: false
diff --git a/jobs/loghost_alerts/templates/loghost.alerts.yml b/jobs/loghost_alerts/templates/loghost.alerts.yml
index 296e755..445cd21 100644
--- a/jobs/loghost_alerts/templates/loghost.alerts.yml
+++ b/jobs/loghost_alerts/templates/loghost.alerts.yml
@@ -3,10 +3,7 @@ groups:
     rules:
       - alert: LoghostNoLogReceived
         expr: |
-          (
-            sum(increase(loghost_total[15m]))
-            or vector(0)
-          ) == 0
+          sum(increase(loghost_total[15m])) by (director) == 0
         for: <%= p('loghost_alerts.nologs.evaluation_time') %>
         labels:
           service: loghost
@@ -25,6 +22,29 @@ groups:
           Resolution:
           - contact Cloud Foundry administrator team
 
+      - alert: LoghostNotEnoughSources
+        # count() yields no sample at all when no director reports, hence the "or vector(0)" fallback
+        expr: |
+          (count(sum(loghost_total) by (director)) or vector(0)) != <%= p('loghost_alerts.nologs.directors').length %>
+        for: <%= p('loghost_alerts.nologs.evaluation_time') %>
+        labels:
+          service: loghost
+          severity: warning
+        annotations:
+          summary: "Logs were received from {{ $value }} director(s) while <%= p('loghost_alerts.nologs.directors').length %> are expected"
+          description: |
+            Logs of at least one expected director are not concentrated and locally stored on the loghost instance
+
+            Impact:
+            - moderate impact and is not a production emergency
+
+            Possible causes:
+            - misconfiguration of rsyslog and loghost_exporter on loghost instance
+            - prometheus is not scraping loghost_exporter as it should
+
+            Resolution:
+            - contact Cloud Foundry administrator team
+
       <% if p('loghost_alerts.security.enabled') %>
       - alert: SecurityTooManyAuthFailures
         expr: |
diff --git a/jobs/loghost_dashboards/templates/loghost_system.json b/jobs/loghost_dashboards/templates/loghost_system.json
index 12e6865..a99db86 100644
--- a/jobs/loghost_dashboards/templates/loghost_system.json
+++ b/jobs/loghost_dashboards/templates/loghost_system.json
@@ -108,7 +108,7 @@
       "tableColumn": "",
       "targets": [
         {
-          "expr": "count(sum(increase(loghost_total[15m]))) or vector(0)",
+          "expr": "count(sum(increase(loghost_total{director=~\"$director\", deployment=~\"$deployment\", group=~\"$group\"}[15m]))) or vector(0)",
           "format": "table",
           "instant": true,
           "refId": "A"
@@ -291,7 +291,7 @@
       "tableColumn": "",
       "targets": [
         {
-          "expr": "sum(irate(loghost_total[10m]))",
+          "expr": "sum(irate(loghost_total{director=~\"$director\", deployment=~\"$deployment\", group=~\"$group\"}[10m]))",
           "format": "table",
           "instant": false,
           "legendFormat": "",
diff --git a/jobs/loghost_exporter/spec b/jobs/loghost_exporter/spec
index e3e1846..564d6b9 100644
--- a/jobs/loghost_exporter/spec
+++ b/jobs/loghost_exporter/spec
@@ -27,13 +27,18 @@ properties:
     description: "path to the SSL key file for protocol https. It is optional. If omitted, a hard-coded default key will be used."
   loghost_exporter.metrics:
     description: "list of metrics to generate, given in grok_exporter config metrics format"
+    default: []
     example:
-      name: loghost_total
-      type: counter
-      help: number of logs
-      match: '.*'
-      labels:
-        my-label: value
+      - name: loghost_total
+        type: counter
+        help: number of logs
+        match: '.*'
+        # local: only site-local logs are processed by the metric
+        # remote: only remote-site logs are processed by the metric
+        # global: all logs are processed by the metric
+        scope: "local|remote|global"  # optional, defaults to local
+        labels:
+          my-label: value
   loghost_exporter.base:
     description: "Root directory to mount on loghost_exporter container to access logs"
     default: /var/vcap/store/loghost
@@ -45,5 +50,8 @@ properties:
       - cf
       - prometheus
       - concourse
-  loghost_exporter.directors:
-    description: "list of directors to watch"
+  loghost_exporter.local-directors:
+    description: "list of directors to watch as local-scoped logs"
+  loghost_exporter.remote-directors:
+    description: "list of directors to watch as remote-scoped logs"
+    default: []
diff --git a/jobs/loghost_exporter/templates/config.yml.erb b/jobs/loghost_exporter/templates/config.yml.erb
index 054813c..0dfc5ad 100644
--- a/jobs/loghost_exporter/templates/config.yml.erb
+++ b/jobs/loghost_exporter/templates/config.yml.erb
@@ -3,10 +3,20 @@ require 'yaml'
 
 # compute inputs
 base = p('loghost_exporter.base')
-paths = []
-p('loghost_exporter.directors').each do |director|
-  p('loghost_exporter.deployments').each do |deployment|
-    paths << File.join(base, director, deployment, "*.log")
+scopes = {
+  "local" => [],
+  "remote" => [],
+  "global" => [],
+}
+
+p('loghost_exporter.deployments').each do |deployment|
+  p('loghost_exporter.local-directors').each do |director|
+    scopes["local"] << File.join(base, director, deployment, "*.log")
+    scopes["global"] << File.join(base, director, deployment, "*.log")
+  end
+  p('loghost_exporter.remote-directors').each do |director|
+    scopes["remote"] << File.join(base, director, deployment, "*.log")
+    scopes["global"] << File.join(base, director, deployment, "*.log")
   end
 end
 
@@ -37,13 +47,24 @@ end
 config = {
   "global" => { "config_version" => 3 },
   "imports" => imports,
-  "metrics" => p('loghost_exporter.metrics'),
+  "metrics" => [],
   "input" => {
     "type" => "file",
     "fail_on_missing_logfile" => false,
-    "paths" => paths
+    "paths" => scopes["global"]
   },
   "server" => server
 }
+
+p('loghost_exporter.metrics').each do |metric|
+  scope = metric.fetch("scope", "local")
+  # fail rendering on an unknown scope instead of silently emitting `paths: null`
+  metric["paths"] = scopes.fetch(scope) do
+    raise ArgumentError, "loghost_exporter.metrics: unknown scope '#{scope}' (expected one of: #{scopes.keys.join(', ')})"
+  end
+  metric.delete("scope")
+  config["metrics"].push(metric)
+end
+
 %>
 <%= config.to_yaml %>
diff --git a/jobs/loghost_exporter/templates/pre-start.erb b/jobs/loghost_exporter/templates/pre-start.erb
index dc19971..76ae695 100644
--- a/jobs/loghost_exporter/templates/pre-start.erb
+++ b/jobs/loghost_exporter/templates/pre-start.erb
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-<% p('loghost_exporter.directors').each do |director| %>
+<% (p('loghost_exporter.local-directors') + p('loghost_exporter.remote-directors')).uniq.each do |director| %>
 <% p('loghost_exporter.deployments').each do |deployment| %>
 su vcap -c "mkdir -p <%= File.join(p('loghost_exporter.base'), director, deployment) %>"
 <% end %>
diff --git a/manifests/operations/loghost-exporter-enable-security.yml b/manifests/operations/loghost-exporter-enable-security.yml
index 94e6d32..47f7eb7 100644
--- a/manifests/operations/loghost-exporter-enable-security.yml
+++ b/manifests/operations/loghost-exporter-enable-security.yml
@@ -5,6 +5,7 @@
     type: counter
     help: number of successful uaa login for users
     match: "INFO --- Audit: IdentityProviderAuthenticationSuccess \\('%{DATA:login}'\\): principal=%{DATA:guid}, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\], authenticationType=\\[ldap\\]"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
@@ -19,6 +20,7 @@
     type: counter
     help: number of uaa login failures for users
     match: "INFO --- Audit: IdentityProviderAuthenticationFailure \\('%{DATA:login}'\\): principal=null, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\], authenticationType=\\[ldap\\]"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
@@ -34,6 +36,7 @@
     type: counter
     help: number of uaa login failures for clients
     match: "INFO --- Audit: ClientAuthenticationFailure \\('Bad credentials'\\): principal=%{DATA:login}, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\]"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
@@ -48,6 +51,7 @@
     type: counter
     help: number of uaa login success for clients
     match: "INFO --- Audit: ClientAuthenticationSuccess \\('Client authentication success'\\): principal=%{DATA:login}, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\]"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
@@ -62,6 +66,7 @@
     type: counter
     help: number of system auth failures
     match: "type=USER_(AUTH|ERR) msg=audit\\(%{DATA}\\): pid=%{NUMBER} uid=%{NUMBER} auid=%{NUMBER} ses=%{NUMBER} msg='op=PAM:(bad_ident|authentication) acct=\"%{DATA}\" exe=\"%{DATA:source}\" hostname=%{DATA} addr=%{DATA:ip} terminal=%{DATA} res=failed'"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
@@ -76,6 +81,7 @@
     type: counter
     help: number of system authentication success
     match: "type=USER_ACCT msg=audit\\(%{DATA}\\): pid=%{NUMBER} uid=%{NUMBER} auid=%{NUMBER} ses=%{NUMBER} msg='op=PAM:accounting acct=\"%{DATA:username}\" exe=\"%{DATA:source}\" hostname=%{DATA} addr=%{DATA:ip} terminal=%{DATA} res=success'"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
@@ -91,6 +97,7 @@
     type: counter
     help: number of diego ssh authentication success
     match: "ssh_proxy\\[rs2\\] {\"timestamp\":\"%{DATA}\",\"level\":\"info\",\"source\":\"ssh-proxy\",\"message\":\"ssh-proxy.cf-authenticate.app-access-success\",\"data\":{\"app\":\"%{DATA:appname}/%{NUMBER:appindex}\",\"principal\":\"%{DATA}\",\"session\":\"%{DATA},\"username\":\"%{DATA:username}\"}}"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
@@ -107,10 +114,9 @@
     type: counter
     help: number of diego ssh authentication failures
     match: "ssh_proxy\\[rs2\\] {\"timestamp\":\"%{DATA}\",\"level\":\"error\",\"source\":\"ssh-proxy\",\"message\":\"ssh-proxy.authentication-failed\",\"data\":{\"error\":\"%{DATA}\",\"user\":\"%{DATA:user}\"}}"
+    scope: local
     labels:
       director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
       deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
       group: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\3"}}'
       user: '{{.user}}'
-
-
diff --git a/manifests/operations/loghost-exporter-enable.yml b/manifests/operations/loghost-exporter-enable.yml
index 3e7874a..30de600 100644
--- a/manifests/operations/loghost-exporter-enable.yml
+++ b/manifests/operations/loghost-exporter-enable.yml
@@ -17,6 +17,7 @@
             type: counter
             help: number of errors logs
             match: '(level=error|level":"error|Subscribe error|lvl=err|lvl=eror)'
+            scope: local
             labels: &labels
               # base dir dep grp
               director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
@@ -26,10 +27,12 @@
             type: counter
             help: number of logs
             match: '.*'
+            scope: global
             labels:
               <<: *labels
-        directors:
+        local-directors:
           - ((director_name))
+        remote-directors: []
 
 - type: replace
   path: /releases/name=bpm?
diff --git a/manifests/operations/prometheus/loghost-enable.yml b/manifests/operations/prometheus/loghost-enable.yml
index 9a1c1fb..90558ca 100644
--- a/manifests/operations/prometheus/loghost-enable.yml
+++ b/manifests/operations/prometheus/loghost-enable.yml
@@ -42,6 +42,10 @@
         loghost_alerts:
           security:
             enabled: false
+          nologs:
+            directors:
+              - ((site))-((name))
+
 - type: replace
   path: /instance_groups/name=prometheus2/jobs/name=prometheus2/properties/prometheus/rule_files/-
   value: