Skip to content

Commit

Permalink
handle per-source metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
psycofdj committed Jun 8, 2020
1 parent 6dfde81 commit 0f58527
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 25 deletions.
4 changes: 3 additions & 1 deletion jobs/loghost_alerts/spec
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ properties:
loghost_alerts.nologs.evaluation_time:
description: "No received logs alerts evaluation time"
default: 15m

# Director names expected to appear as sources of loghost_total.
# The LoghostNotEnoughSources alert compares the number of entries in this
# list with the number of distinct `director` label values it observes.
loghost_alerts.nologs.directors:
description: "List of director origin to check in nolog alert"
default: []
# Toggles rendering of the security alert rules in loghost.alerts.yml.
# The key was misspelled `decription`, leaving the property undocumented.
loghost_alerts.security.enabled:
  description: "Enable security alerts"
  default: false
Expand Down
27 changes: 23 additions & 4 deletions jobs/loghost_alerts/templates/loghost.alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,7 @@ groups:
rules:
- alert: LoghostNoLogReceived
expr: |
(
sum(increase(loghost_total[15m]))
or vector(0)
) == 0
sum(increase(loghost_total[15m])) by (director) == 0
for: <%= p('loghost_alerts.nologs.evaluation_time') %>
labels:
service: loghost
Expand All @@ -25,6 +22,28 @@ groups:
Resolution:
- contact Cloud Foundry administrator team
# Fires when the number of distinct `director` label values carried by
# loghost_total differs from the number of directors configured in
# loghost_alerts.nologs.directors.
# `or vector(0)` substitutes 0 when no loghost_total series exists at all;
# without it count() yields an empty result and the comparison never fires,
# hiding the case where every expected source is missing.
# NOTE(review): the aggregation drops all labels, so `{{$labels.instance}}`
# in the annotations below renders empty for this rule - confirm whether the
# instance should be kept (e.g. aggregate `by (instance)`) or the text changed.
- alert: LoghostNotEnoughSources
  expr: |
    (
      count(sum(loghost_total) by (director))
      or vector(0)
    ) != <%= p('loghost_alerts.nologs.directors').length() %>
  for: <%= p('loghost_alerts.nologs.evaluation_time') %>
  labels:
    service: loghost
    severity: warning
  annotations:
    summary: "No logs were received from some directors at instance `{{$labels.instance}}` in the last 15 minutes"
    description: |
      Logs are not concentrated and locally stored on loghost instance `{{$labels.instance}}`

      Impact:
        - moderate impact and is not a production emergency

      Possible causes:
        - misconfiguration of rsyslog and loghost_exporter on loghost instance
        - prometheus is not scraping loghost_exporter as it should

      Resolution:
        - contact Cloud Foundry administrator team
<% if p('loghost_alerts.security.enabled') %>
- alert: SecurityTooManyAuthFailures
expr: |
Expand Down
4 changes: 2 additions & 2 deletions jobs/loghost_dashboards/templates/loghost_system.json
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@
"tableColumn": "",
"targets": [
{
"expr": "count(sum(increase(loghost_total[15m]))) or vector(0)",
"expr": "count(sum(increase(loghost_total{director=~\"$director\", deployment=~\"$deployment\", group=~\"$group\"}[15m]))) or vector(0)",
"format": "table",
"instant": true,
"refId": "A"
Expand Down Expand Up @@ -291,7 +291,7 @@
"tableColumn": "",
"targets": [
{
"expr": "sum(irate(loghost_total[10m]))",
"expr": "sum(irate(loghost_total{director=~\"$director\", deployment=~\"$deployment\", group=~\"$group\"}[10m]))",
"format": "table",
"instant": false,
"legendFormat": "",
Expand Down
24 changes: 16 additions & 8 deletions jobs/loghost_exporter/spec
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,18 @@ properties:
description: "path to the SSL key file for protocol https. It is optional. If omitted, a hard-coded default key will be used."
loghost_exporter.metrics:
description: "list of metrics to generate, given in grok_exporter config metrics format"
default: []
example:
name: loghost_total
type: counter
help: number of logs
match: '.*'
labels:
my-label: value
- name: loghost_total
type: counter
help: number of logs
match: '.*'
# local: only site-local logs are processed by the metric
# remote: only remote-site logs are processed by the metric
# global: all logs are processed by the metric
scope: "local|remote|global"
labels:
my-label: value
loghost_exporter.base:
description: "Root directory to mount on loghost_exporter container to access logs"
default: /var/vcap/store/loghost
Expand All @@ -45,5 +50,8 @@ properties:
- cf
- prometheus
- concourse
loghost_exporter.directors:
description: "list of directors to watch"
loghost_exporter.local-directors:
description: "list of directors to watch as local-scoped logs"
loghost_exporter.remote-directors:
description: "list of directors to watch as remote-scoped logs"
default: []
30 changes: 24 additions & 6 deletions jobs/loghost_exporter/templates/config.yml.erb
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,20 @@ require 'yaml'

# compute inputs
base = p('loghost_exporter.base')
paths = []
p('loghost_exporter.directors').each do |director|
p('loghost_exporter.deployments').each do |deployment|
paths << File.join(base, director, deployment, "*.log")
# Log-file globs grouped by metric scope. Every glob added to "local" or
# "remote" is also added to "global".
scopes = Hash[["local", "remote", "global"].map { |scope_name| [scope_name, []] }]

# Scope name paired with the job property listing that scope's directors.
# Kept in this order so that, for each deployment, local globs are appended
# before remote ones.
director_properties = [
  ["local", 'loghost_exporter.local-directors'],
  ["remote", 'loghost_exporter.remote-directors'],
]

p('loghost_exporter.deployments').each do |deployment|
  director_properties.each do |scope_name, property|
    p(property).each do |director|
      # A separate File.join per target list, so the lists never share one
      # String object.
      [scope_name, "global"].each do |target|
        scopes[target] << File.join(base, director, deployment, "*.log")
      end
    end
  end
end

Expand Down Expand Up @@ -37,13 +47,21 @@ end
config = {
"global" => { "config_version" => 3 },
"imports" => imports,
"metrics" => p('loghost_exporter.metrics'),
"metrics" => [],
"input" => {
"type" => "file",
"fail_on_missing_logfile" => false,
"paths" => paths
"paths" => scopes["global"]
},
"server" => server
}

# Attach to every configured metric the log paths selected by its "scope"
# ("local" when omitted), and strip the "scope" key from the emitted entry.
# An unknown scope aborts rendering: previously scopes[scope] returned nil
# and the metric was emitted with a nil "paths" value without any warning.
p('loghost_exporter.metrics').each do |metric|
  scope = metric.fetch("scope", "local")
  unless scopes.key?(scope)
    raise ArgumentError,
          "loghost_exporter.metrics: unknown scope #{scope.inspect} " \
          "(expected one of: #{scopes.keys.join(', ')})"
  end

  # Build a new hash instead of editing the property value in place.
  entry = metric.reject { |key, _value| key == "scope" }
  entry["paths"] = scopes[scope]
  config["metrics"].push(entry)
end

%>
<%= config.to_yaml %>
8 changes: 7 additions & 1 deletion jobs/loghost_exporter/templates/pre-start.erb
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
#!/bin/bash

<% p('loghost_exporter.directors').each do |director| %>
<% p('loghost_exporter.local-directors').each do |director| %>
<% p('loghost_exporter.deployments').each do |deployment| %>
su vcap -c "mkdir -p <%= File.join(p('loghost_exporter.base'), director, deployment) %>"
<% end %>
<% end %>

<% p('loghost_exporter.remote-directors').each do |director| %>
<% p('loghost_exporter.deployments').each do |deployment| %>
su vcap -c "mkdir -p <%= File.join(p('loghost_exporter.base'), director, deployment) %>"
<% end %>
Expand Down
10 changes: 8 additions & 2 deletions manifests/operations/loghost-exporter-enable-security.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
type: counter
help: number of successful uaa login for users
match: "INFO --- Audit: IdentityProviderAuthenticationSuccess \\('%{DATA:login}'\\): principal=%{DATA:guid}, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\], authenticationType=\\[ldap\\]"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
Expand All @@ -19,6 +20,7 @@
type: counter
help: number of uaa login failures for users
match: "INFO --- Audit: IdentityProviderAuthenticationFailure \\('%{DATA:login}'\\): principal=null, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\], authenticationType=\\[ldap\\]"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
Expand All @@ -34,6 +36,7 @@
type: counter
help: number of uaa login failures for clients
match: "INFO --- Audit: ClientAuthenticationFailure \\('Bad credentials'\\): principal=%{DATA:login}, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\]"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
Expand All @@ -48,6 +51,7 @@
type: counter
help: number of uaa login success for clients
match: "INFO --- Audit: ClientAuthenticationSuccess \\('Client authentication success'\\): principal=%{DATA:login}, origin=\\[remoteAddress=%{DATA:ip}, clientId=%{DATA:principal}\\], identityZoneId=\\[uaa\\]"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
Expand All @@ -62,6 +66,7 @@
type: counter
help: number of system auth failures
match: "type=USER_(AUTH|ERR) msg=audit\\(%{DATA}\\): pid=%{NUMBER} uid=%{NUMBER} auid=%{NUMBER} ses=%{NUMBER} msg='op=PAM:(bad_ident|authentication) acct=\"%{DATA}\" exe=\"%{DATA:source}\" hostname=%{DATA} addr=%{DATA:ip} terminal=%{DATA} res=failed'"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
Expand All @@ -76,6 +81,7 @@
type: counter
help: number of system authentication success
match: "type=USER_ACCT msg=audit\\(%{DATA}\\): pid=%{NUMBER} uid=%{NUMBER} auid=%{NUMBER} ses=%{NUMBER} msg='op=PAM:accounting acct=\"%{DATA:username}\" exe=\"%{DATA:source}\" hostname=%{DATA} addr=%{DATA:ip} terminal=%{DATA} res=success'"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
Expand All @@ -91,6 +97,7 @@
type: counter
help: number of diego ssh authentication success
match: "ssh_proxy\\[rs2\\] {\"timestamp\":\"%{DATA}\",\"level\":\"info\",\"source\":\"ssh-proxy\",\"message\":\"ssh-proxy.cf-authenticate.app-access-success\",\"data\":{\"app\":\"%{DATA:appname}/%{NUMBER:appindex}\",\"principal\":\"%{DATA}\",\"session\":\"%{DATA},\"username\":\"%{DATA:username}\"}}"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
Expand All @@ -107,10 +114,9 @@
type: counter
help: number of diego ssh authentication failures
match: "ssh_proxy\\[rs2\\] {\"timestamp\":\"%{DATA}\",\"level\":\"error\",\"source\":\"ssh-proxy\",\"message\":\"ssh-proxy.authentication-failed\",\"data\":{\"error\":\"%{DATA}\",\"user\":\"%{DATA:user}\"}}"
scope: local
labels:
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
deployment: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\2"}}'
group: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\3"}}'
user: '{{.user}}'


5 changes: 4 additions & 1 deletion manifests/operations/loghost-exporter-enable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
type: counter
help: number of errors logs
match: '(level=error|level":"error|Subscribe error|lvl=err|lvl=eror)'
scope: local
labels: &labels
# base dir dep grp
director: '{{gsub .logfile "/var/vcap/store/loghost/(.*?)/(.*?)/(.*?)\\.log" "\\1"}}'
Expand All @@ -26,10 +27,12 @@
type: counter
help: number of logs
match: '.*'
scope: global
labels:
<<: *labels
directors:
local-directors:
- ((director_name))
remote-directors: []

- type: replace
path: /releases/name=bpm?
Expand Down
4 changes: 4 additions & 0 deletions manifests/operations/prometheus/loghost-enable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
loghost_alerts:
security:
enabled: false
nologs:
directors:
- ((site))-((name))

- type: replace
path: /instance_groups/name=prometheus2/jobs/name=prometheus2/properties/prometheus/rule_files/-
value:
Expand Down

0 comments on commit 0f58527

Please sign in to comment.