From d78ef35671fbcff7426a89eba19c324ceebc4ac3 Mon Sep 17 00:00:00 2001
From: Bruce Becker
Date: Sun, 11 Dec 2022 00:00:36 +0100
Subject: [PATCH] feat(loki): standalone local loki configuration (#15)

feat(prometheus): add alerting rules and upgrade prometheus
fix(promtail): add a healthcheck to promtail
ci: add loki local instance template
fix(prometheus): append protocol to prometheus endpoint
feat(promtail): add journal log scraping target
fix(promtail): use dynamic port for service and add grpc check
fix(fabio): make fabio version a variable
chore: add loki version variable
fix(loki): add a urlprefix with strip to use fabio
feat(loki): add grpc service to loki job
fix(loki): use a separate spaces key for writing files

Signed-off-by: Bruce Becker
---
 fabio.nomad             | 11 ++++--
 loki/loki-local.yml.tpl | 37 ++++++++++++++++++
 loki/loki.nomad         | 22 +++++------
 loki/loki.yml.tpl       | 58 ++++++++++++++++++----------
 loki/main.tf            | 19 ++++------
 loki/variables.tf       |  6 +++
 prometheus.nomad        | 83 ++++++++++++++++++++++++++++++++++++++---
 promtail.nomad          | 67 +++++++++++++++++++++------------
 promtail.yml.tpl        | 13 ++++++-
 9 files changed, 240 insertions(+), 76 deletions(-)
 create mode 100644 loki/loki-local.yml.tpl

diff --git a/fabio.nomad b/fabio.nomad
index 71dbaac..7f360f7 100644
--- a/fabio.nomad
+++ b/fabio.nomad
@@ -1,3 +1,8 @@
+variable "fabio_version" {
+  type        = string
+  default     = "1.6.3"
+  description = "Version of Fabio to use"
+}
 job "fabio" {
   datacenters = ["dc1"]
   type        = "system"
@@ -17,15 +22,15 @@ job "fabio" {
     }
   }
   restart {
-    attempts = 3
-    interval = "10m"
+    attempts = 1
+    interval = "2m"
     delay    = "15s"
     mode     = "delay"
   }
   task "fabio" {
     artifact {
-      source      = "https://github.com/fabiolb/fabio/releases/download/v1.6.0/fabio-1.6.0-linux_${attr.cpu.arch}"
+      source      = "https://github.com/fabiolb/fabio/releases/download/v${var.fabio_version}/fabio-${var.fabio_version}-linux_${attr.cpu.arch}"
       destination = "local/fabio"
       mode        = "file"
     }
diff --git a/loki/loki-local.yml.tpl b/loki/loki-local.yml.tpl
new file mode 100644
index 0000000..16942b9
--- /dev/null
+++ b/loki/loki-local.yml.tpl
@@ -0,0 +1,37 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+
+ingester:
+  lifecycler:
+    address: 127.0.0.1
+    ring:
+      kvstore:
+        store: inmemory
+      replication_factor: 1
+    final_sleep: 0s
+  chunk_idle_period: 5m
+  chunk_retain_period: 30s
+
+schema_config:
+  configs:
+  - from: 2020-05-15
+    store: boltdb
+    object_store: filesystem
+    schema: v11
+    index:
+      prefix: index_
+      period: 168h
+
+storage_config:
+  boltdb:
+    directory: /tmp/loki/index
+
+  filesystem:
+    directory: /tmp/loki/chunks
+
+limits_config:
+  enforce_metric_name: false
+  reject_old_samples: true
+  reject_old_samples_max_age: 168h
diff --git a/loki/loki.nomad b/loki/loki.nomad
index 934c6dc..8dbefc0 100644
--- a/loki/loki.nomad
+++ b/loki/loki.nomad
@@ -8,21 +8,21 @@ variable "secret_key" {

 variable "loki_version" {
   type    = string
-  default = "v2.6.0"
+  default = "v2.7.1"
 }

 job "loki" {
   datacenters = ["dc1"]
   type        = "service"
   name        = "loki"
-  // migrate {}
+
   meta {
     auto-backup      = true
     backup-schedule  = "@hourly"
     backup-target-db = "postgres"
   }
   update {
-    max_parallel     = 1
+    max_parallel     = 2
     health_check     = "checks"
     min_healthy_time = "5s"
     healthy_deadline = "300s"
@@ -45,18 +45,10 @@ job "loki" {
     }
     service {
       name      = "loki-http-server"
-      tags      = ["logs", "loki", "observability", "urlprefix-/loki"]
+      tags      = ["urlprefix-/loki strip=/loki"]
       port      = "http"
       on_update = "require_healthy"

-      check {
-        name     = "loki_alive"
-        type     = "grpc"
-        port     = "grpc"
-        interval = "10s"
"10s" - timeout = "3s" - } - check { name = "loki_ready" type = "http" @@ -66,6 +58,12 @@ job "loki" { timeout = "3s" } } + + service { + name = "loki-grpc" + port = "grpc" + } + task "server" { driver = "exec" env { diff --git a/loki/loki.yml.tpl b/loki/loki.yml.tpl index 3cae90d..d012855 100644 --- a/loki/loki.yml.tpl +++ b/loki/loki.yml.tpl @@ -3,31 +3,49 @@ auth_enabled: false server: http_listen_port: {{ env "NOMAD_PORT_http" }} grpc_listen_port: {{ env "NOMAD_PORT_grpc" }} -memberlist: - join_members: - - loki-http-server + register_instrumentation: true + http_server_read_timeout: "40s" + http_server_write_timeout: "50s" +distributor: + ring: + kvstore: + store: consul + prefix: loki/collectors +ingester: + lifecycler: + address: loki-grpc.service.consul + ring: + kvstore: + store: consul + prefix: loki/collectors + replication_factor: 1 + final_sleep: 0s + chunk_idle_period: 1m + chunk_retain_period: 30s schema_config: configs: - - from: 2022-01-01 - store: boltdb-shipper + - from: 2020-01-01 + store: aws object_store: s3 schema: v11 index: - prefix: index_ - period: 24h -common: - path_prefix: local/ - replication_factor: 1 - storage: - s3: - endpoint: {{ key "jobs/loki/s3_endpoint" }} - bucketnames: {{ key "jobs/loki/logs_bucket" }} - access_key_id: {{ env "access_key" }} - secret_access_key: {{ env "secret_key" }} - s3forcepathstyle: true - ring: - kvstore: - store: consul + prefix: loki_ + +storage_config: + aws: + region: ams3 + endpoint: https://{{ key "jobs/loki/s3_endpoint" }} + bucketnames: {{ key "jobs/loki/logs_bucket" }} + access_key_id: {{ env "access_key" }} + secret_access_key: {{ env "secret_key" }} + s3forcepathstyle: true + insecure: false + dynamodb: + dynamodb_url: inmemory:///index + boltdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + shared_store: s3 ruler: storage: s3: diff --git a/loki/main.tf b/loki/main.tf index b9f3772..ab0868b 100644 --- a/loki/main.tf +++ b/loki/main.tf @@ -38,13 +38,9 @@ provider "digitalocean" { spaces_secret_key = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["spaces_secret"] } -provider "nomad" { +provider "nomad" {} -} - -provider "consul" { - -} +provider "consul" {} resource "digitalocean_spaces_bucket" "logs" { region = var.doregion @@ -79,7 +75,7 @@ resource "consul_keys" "endpoint" { key { path = "jobs/loki/s3_endpoint" - value = "https://${digitalocean_spaces_bucket.logs.region}.digitaloceanspaces.com" + value = "${digitalocean_spaces_bucket.logs.region}.digitaloceanspaces.com" } } @@ -90,10 +86,11 @@ resource "nomad_job" "loki" { enabled = true allow_fs = true vars = { - "access_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["spaces_key"] - "secret_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["spaces_secret"] + "access_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["loki_spaces_key"] + "secret_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["loki_spaces_secret"] } } - purge_on_destroy = true - detach = false + purge_on_destroy = false + detach = true + deregister_on_destroy = false } diff --git a/loki/variables.tf b/loki/variables.tf index 3466e43..1264a35 100644 --- a/loki/variables.tf +++ b/loki/variables.tf @@ -3,3 +3,9 @@ variable "doregion" { default = "ams3" type = string } + +variable "loki_version" { + description = "Version of Grafana Loki to deploy. 
See " + type = string + default = "v2.7.1" +} diff --git a/prometheus.nomad b/prometheus.nomad index 0b05c11..5edc12d 100644 --- a/prometheus.nomad +++ b/prometheus.nomad @@ -20,7 +20,7 @@ job "prometheus" { } group "monitoring" { - count = 2 + count = 1 network { port "prometheus_ui" { @@ -41,11 +41,11 @@ job "prometheus" { task "prometheus" { artifact { - source = "https://github.com/prometheus/prometheus/releases/download/v2.36.2/prometheus-2.36.2.linux-arm64.tar.gz" + source = "https://github.com/prometheus/prometheus/releases/download/v2.40.2/prometheus-2.40.2.linux-arm64.tar.gz" destination = "local" options { - checksum = "sha256:302abfe197f40572b42c7b765f1a37beb7272f985165e5769519fe0a789dcc98" + checksum = "sha256:9f39cf29756106ee4c43fe31d346dcfca58fc275c751dce9f6b50eb3ee31356c" } } template { @@ -56,7 +56,8 @@ job "prometheus" { global: scrape_interval: 20s evaluation_interval: 60s - +rule_files: + - 'node-rules.yml' scrape_configs: - job_name: 'instance_metrics' static_configs: @@ -87,13 +88,85 @@ scrape_configs: metrics_path: /v1/metrics params: format: ['prometheus'] + - job_name: 'nomad_metrics' + nomad_sd_configs: + - server: http://nomad.service.consul:4646 EOH } + template { + change_mode = "restart" + destination = "local/node-rules.yml" + left_delimiter = "[[" + right_delimiter = "]]" + data = < 1000 + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure (instance {{ $labels.instance }}) + description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostUnusualNetworkThroughputIn + expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual network throughput in (instance {{ $labels.instance }}) + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: prom.rules + rules: + - alert: PrometheusJobMissing + expr: absent(up{job="prometheus"}) + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: PrometheusTargetMissing + expr: up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. 
+- name: consul.rules
+  rules:
+  - alert: ConsulServiceHealthcheckFailed
+    expr: consul_catalog_service_node_healthy == 0
+    for: 1m
+    labels:
+      severity: critical
+    annotations:
+      summary: Consul service healthcheck failed (instance {{ $labels.instance }})
+      description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+  - alert: ConsulAgentUnhealthy
+    expr: consul_health_node_status{status="critical"} == 1
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Consul agent unhealthy (instance {{ $labels.instance }})
+      description: "A Consul agent is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+EOH
+      }
       driver = "exec"

       config {
-        command = "local/prometheus-2.36.2.linux-arm64/prometheus"
+        command = "local/prometheus-2.40.2.linux-arm64/prometheus"
         args = [
           "--config.file=local/prometheus.yml",
           "--web.external-url=http://0.0.0.0:9090/prometheus"
diff --git a/promtail.nomad b/promtail.nomad
index fec1dc9..c613638 100644
--- a/promtail.nomad
+++ b/promtail.nomad
@@ -1,16 +1,9 @@
-# There can only be a single job definition per file. This job is named
-# "example" so it will create a job with the ID and Name "example".
-
-# The "job" stanza is the top-most configuration option in the job
-# specification. A job is a declarative specification of tasks that Nomad
-# should run. Jobs have a globally unique name, one or many task groups, which
-# are themselves collections of one or many tasks.
-#
-# For more information and examples on the "job" stanza, please see
-# the online documentation at:
-#
-#     https://www.nomadproject.io/docs/job-specification/job
-#
+variable "promtail_version" {
+  description = "Version of Promtail to deploy"
+  type        = string
+  default     = "v2.5.0"
+}
+
 job "promtail" {

   meta {
@@ -32,13 +25,17 @@
     network {
       port "http" {
-        static = 9080
+        to = 9080
+      }
+
+      port "grpc" {
+        to = 9050
       }
     }

     service {
-      name = "promtail"
-      tags = ["logs", "promtail", "observability"]
+      name = "http"
+      tags = ["logs", "promtail", "observability", "http"]
       port = "http"

       check {
         type     = "http"
         path     = "/ready"
         interval = "10s"
         timeout  = "2s"
       }

+      check {
+        name     = "Promtail HTTP"
+        type     = "http"
+        path     = "/targets"
+        interval = "10s"
+        timeout  = "5s"
+
+        check_restart {
+          limit           = 2
+          grace           = "60s"
+          ignore_warnings = false
+        }
+      }
+    }
+
+    service {
+      name = "grpc"
+      tags = ["logs", "promtail", "observability", "grpc"]
+      port = "grpc"
+
+      check {
+        name            = "promtail-grpc"
+        grpc_service    = ""
+        type            = "grpc"
+        interval        = "15s"
+        timeout         = "5s"
+        grpc_use_tls    = false
+        tls_skip_verify = true
+      }
     }

     restart {
-      attempts = 3
+      attempts = 2
       interval = "10m"
       delay    = "15s"
       mode     = "delay"
     }

       args = ["-config.file=local/promtail.yml"]
     }

-    // artifact {
-    //   source      = "http://minio-api.service.consul:9000/loki-bin/promtail-linux-${attr.cpu.arch}.zip"
-    //   destination = "local/promtail"
-    //   mode        = "file"
-    // }
     artifact {
-      source      = "https://github.com/grafana/loki/releases/download/v2.5.0/promtail-linux-arm64.zip"
+      source      = "https://github.com/grafana/loki/releases/download/${var.promtail_version}/promtail-linux-${attr.cpu.arch}.zip"
       destination = "local/promtail"
       mode        = "file"
     }
diff --git a/promtail.yml.tpl b/promtail.yml.tpl
index a9b169c..fc1e127 100644
--- a/promtail.yml.tpl
+++ b/promtail.yml.tpl
@@ -1,9 +1,10 @@
 server:
+  log_level: info
   http_listen_port: 9080
-  grpc_listen_port: 0
+  grpc_listen_port: 9050

 positions:
-  filename: /tmp/positions.yaml
+  filename: /data/positions.yaml

 clients:
   - url: http://loki-http-server.service.consul:3100/loki/api/v1/push
@@ -23,3 +24,11 @@ scrape_configs:
       labels:
         job: nomad
         __path__: /var/log/nomad*.log
+- job_name: journal
+  journal:
+    max_age: 12h
+    labels:
+      job: systemd-journal
+  relabel_configs:
+  - source_labels: ['__journal__systemd_unit']
+    target_label: 'unit'
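
--
A quick way to sanity-check these changes before rolling them out. The nomad,
terraform, and promtail commands below are standard CLI entry points; the
file paths are illustrative for this repo layout, and promtail must be given
a rendered copy of promtail.yml.tpl (the .tpl still contains Nomad template
directives):

    # validate and plan the job specs touched by this patch
    nomad job validate promtail.nomad
    nomad job plan prometheus.nomad

    # exercise the promtail scrape/relabel pipeline without pushing to Loki
    promtail -config.file=promtail.yml -dry-run

    # preview the Terraform-managed Loki deployment (providers, bucket, job)
    terraform -chdir=loki plan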