feat(loki): standalone local loki configuration (#15)
feat(prometheus): add alerting rules and upgrade prometheus
fix(promtail): add a healthcheck to promtail
ci: add loki local instance template
fix(prometheus): append protocol to prometheus endpoint
feat(promtail): add journal log scraping target
fix(promtail): use dynamic port for service and add grpc check
fix(fabio): make fabio version a variable
chore: add loki version variable
fix(loki): add a urlprefix with strip to use fabio
feat(loki): add grpc service to loki job
fix(loki): use a separate spaces key for writing files

Signed-off-by: Bruce Becker <[email protected]>
brucellino authored Dec 10, 2022
1 parent 38c9cbd commit d78ef35
Showing 9 changed files with 240 additions and 76 deletions.
11 changes: 8 additions & 3 deletions fabio.nomad
@@ -1,3 +1,8 @@
variable "fabio_version" {
  type        = string
  default     = "1.6.3"
  description = "Version of Fabio to use"
}
job "fabio" {
  datacenters = ["dc1"]
  type        = "system"
@@ -17,15 +22,15 @@ job "fabio" {
      }
    }
    restart {
      attempts = 3
      interval = "10m"
      attempts = 1
      interval = "2m"
      delay    = "15s"
      mode     = "delay"
    }

    task "fabio" {
      artifact {
        source      = "https://github.com/fabiolb/fabio/releases/download/v1.6.0/fabio-1.6.0-linux_${attr.cpu.arch}"
        source      = "https://github.com/fabiolb/fabio/releases/download/v${var.fabio_version}/fabio-${var.fabio_version}-linux_${attr.cpu.arch}"
        destination = "local/fabio"
        mode        = "file"
      }
37 changes: 37 additions & 0 deletions loki/loki-local.yml.tpl
@@ -0,0 +1,37 @@
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2020-05-15
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 168h

storage_config:
  boltdb:
    directory: /tmp/loki/index

  filesystem:
    directory: /tmp/loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h
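The template above contains no consul-template directives, so it renders verbatim. One way such a static config could be consumed — a sketch only, since the job that uses it is not part of this diff; the task name, exec driver and binary location are assumptions:

task "loki-local" {
  driver = "exec"

  # Ship the static config into the task's local directory.
  # file() is resolved when the jobspec is parsed; the path shown is an assumption.
  template {
    data        = file("loki/loki-local.yml.tpl")
    destination = "local/loki-local.yml"
    change_mode = "restart"
  }

  config {
    # Assumes a loki binary fetched separately (e.g. via an artifact stanza).
    command = "loki"
    args    = ["-config.file=local/loki-local.yml"]
  }
}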
22 changes: 10 additions & 12 deletions loki/loki.nomad
@@ -8,21 +8,21 @@ variable "secret_key" {

variable "loki_version" {
type = string
default = "v2.6.0"
default = "v2.7.1"
}

job "loki" {
datacenters = ["dc1"]
type = "service"
name = "loki"
// migrate {}

meta {
auto-backup = true
backup-schedule = "@hourly"
backup-target-db = "postgres"
}
update {
max_parallel = 1
max_parallel = 2
health_check = "checks"
min_healthy_time = "5s"
healthy_deadline = "300s"
@@ -45,18 +45,10 @@ job "loki" {
    }
    service {
      name      = "loki-http-server"
      tags      = ["logs", "loki", "observability", "urlprefix-/loki"]
      tags      = ["urlprefix-/loki strip=/loki"]
      port      = "http"
      on_update = "require_healthy"

      check {
        name     = "loki_alive"
        type     = "grpc"
        port     = "grpc"
        interval = "10s"
        timeout  = "3s"
      }

      check {
        name = "loki_ready"
        type = "http"
@@ -66,6 +58,12 @@ job "loki" {
timeout = "3s"
}
}

service {
name = "loki-grpc"
port = "grpc"
}

task "server" {
driver = "exec"
env {
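The new loki-grpc service is registered without a health check. If one is wanted later, Nomad's gRPC check type can target the same dynamic port — a sketch only, not part of this commit, and it assumes Loki's gRPC health endpoint is reachable without TLS:

service {
  name = "loki-grpc"
  port = "grpc"

  check {
    name         = "loki_grpc_alive"
    type         = "grpc"
    port         = "grpc"
    grpc_use_tls = false # assumption: plaintext gRPC inside the cluster
    interval     = "10s"
    timeout      = "3s"
  }
}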
58 changes: 38 additions & 20 deletions loki/loki.yml.tpl
@@ -3,31 +3,49 @@ auth_enabled: false
server:
http_listen_port: {{ env "NOMAD_PORT_http" }}
grpc_listen_port: {{ env "NOMAD_PORT_grpc" }}
memberlist:
join_members:
- loki-http-server
register_instrumentation: true
http_server_read_timeout: "40s"
http_server_write_timeout: "50s"
distributor:
ring:
kvstore:
store: consul
prefix: loki/collectors
ingester:
lifecycler:
address: loki-grpc.service.consul
ring:
kvstore:
store: consul
prefix: loki/collectors
replication_factor: 1
final_sleep: 0s
chunk_idle_period: 1m
chunk_retain_period: 30s
schema_config:
configs:
- from: 2022-01-01
store: boltdb-shipper
- from: 2020-01-01
store: aws
object_store: s3
schema: v11
index:
prefix: index_
period: 24h
common:
path_prefix: local/
replication_factor: 1
storage:
s3:
endpoint: {{ key "jobs/loki/s3_endpoint" }}
bucketnames: {{ key "jobs/loki/logs_bucket" }}
access_key_id: {{ env "access_key" }}
secret_access_key: {{ env "secret_key" }}
s3forcepathstyle: true
ring:
kvstore:
store: consul
prefix: loki_

storage_config:
aws:
region: ams3
endpoint: https://{{ key "jobs/loki/s3_endpoint" }}
bucketnames: {{ key "jobs/loki/logs_bucket" }}
access_key_id: {{ env "access_key" }}
secret_access_key: {{ env "secret_key" }}
s3forcepathstyle: true
insecure: false
dynamodb:
dynamodb_url: inmemory:///index
boltdb_shipper:
active_index_directory: /loki/index
cache_location: /loki/index_cache
shared_store: s3
ruler:
storage:
s3:
19 changes: 8 additions & 11 deletions loki/main.tf
@@ -38,13 +38,9 @@ provider "digitalocean" {
  spaces_secret_key = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["spaces_secret"]
}

provider "nomad" {
provider "nomad" {}

}

provider "consul" {

}
provider "consul" {}

resource "digitalocean_spaces_bucket" "logs" {
  region = var.doregion
@@ -79,7 +75,7 @@ resource "consul_keys" "endpoint" {

  key {
    path  = "jobs/loki/s3_endpoint"
    value = "https://${digitalocean_spaces_bucket.logs.region}.digitaloceanspaces.com"
    value = "${digitalocean_spaces_bucket.logs.region}.digitaloceanspaces.com"
  }
}

@@ -90,10 +86,11 @@ resource "nomad_job" "loki" {
    enabled  = true
    allow_fs = true
    vars = {
      "access_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["spaces_key"]
      "secret_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["spaces_secret"]
      "access_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["loki_spaces_key"]
      "secret_key" = jsondecode(data.vault_kv_secret_v2.digitalocean.data_json)["loki_spaces_secret"]
    }
  }
  purge_on_destroy      = true
  detach                = false
  purge_on_destroy      = false
  detach                = true
  deregister_on_destroy = false
}
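With the switch to loki_spaces_key / loki_spaces_secret, the Vault KV v2 secret behind data.vault_kv_secret_v2.digitalocean has to carry those fields alongside the spaces_key / spaces_secret still used by the DigitalOcean provider. The data source itself is not shown in this diff; a sketch with assumed mount and secret names:

data "vault_kv_secret_v2" "digitalocean" {
  mount = "secret"       # assumption: KV v2 mount path
  name  = "digitalocean" # assumption: secret holding spaces_key, spaces_secret,
                         # loki_spaces_key and loki_spaces_secret
}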
6 changes: 6 additions & 0 deletions loki/variables.tf
@@ -3,3 +3,9 @@ variable "doregion" {
default = "ams3"
type = string
}

variable "loki_version" {
description = "Version of Grafana Loki to deploy. See "
type = string
default = "v2.7.1"
}
83 changes: 78 additions & 5 deletions prometheus.nomad
@@ -20,7 +20,7 @@ job "prometheus" {
  }

  group "monitoring" {
    count = 2
    count = 1

    network {
      port "prometheus_ui" {
@@ -41,11 +41,11 @@

task "prometheus" {
artifact {
source = "https://github.com/prometheus/prometheus/releases/download/v2.36.2/prometheus-2.36.2.linux-arm64.tar.gz"
source = "https://github.com/prometheus/prometheus/releases/download/v2.40.2/prometheus-2.40.2.linux-arm64.tar.gz"
destination = "local"

options {
checksum = "sha256:302abfe197f40572b42c7b765f1a37beb7272f985165e5769519fe0a789dcc98"
checksum = "sha256:9f39cf29756106ee4c43fe31d346dcfca58fc275c751dce9f6b50eb3ee31356c"
}
}
template {
@@ -56,7 +56,8 @@ job "prometheus" {
global:
  scrape_interval: 20s
  evaluation_interval: 60s
rule_files:
  - 'node-rules.yml'
scrape_configs:
  - job_name: 'instance_metrics'
    static_configs:
@@ -87,13 +88,85 @@ scrape_configs:
    metrics_path: /v1/metrics
    params:
      format: ['prometheus']
  - job_name: 'nomad_metrics'
    nomad_sd_configs:
      - server: http://nomad.service.consul:4646
EOH
      }

      template {
        change_mode     = "restart"
        destination     = "local/node-rules.yml"
        left_delimiter  = "[["
        right_delimiter = "]]"
        data            = <<EOH
---
groups:
  - name: node.rules
    rules:
      - alert: InstanceDown
        expr: up{job="instance_metrics"} == 0
        for: 10m
      - alert: InstancesDown
        expr: avg(up{job="instance_metrics"}) BY (job)
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - name: prom.rules
    rules:
      - alert: PrometheusJobMissing
        expr: absent(up{job="prometheus"})
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: PrometheusTargetMissing
        expr: up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
  - name: consul.rules
    rules:
      - alert: ConsulServiceHealthcheckFailed
        expr: consul_catalog_service_node_healthy == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Consul service healthcheck failed (instance {{ $labels.instance }})
          description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ConsulAgentUnhealthy
        expr: consul_health_node_status{status="critical"} == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Consul agent unhealthy (instance {{ $labels.instance }})
          description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
EOH
      }
      driver = "exec"

      config {
        command = "local/prometheus-2.36.2.linux-arm64/prometheus"
        command = "local/prometheus-2.40.2.linux-arm64/prometheus"
        args = [
          "--config.file=local/prometheus.yml",
          "--web.external-url=http://0.0.0.0:9090/prometheus"
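The new nomad_metrics job discovers targets through Prometheus' Nomad service discovery and, as written, scrapes every service Nomad registers. A relabelling sketch that narrows it to services advertising a metrics tag — the tag convention and regex are assumptions, not part of this commit:

  - job_name: 'nomad_metrics'
    nomad_sd_configs:
      - server: http://nomad.service.consul:4646
    relabel_configs:
      # keep only services whose Nomad tags mention "metrics" (assumed convention)
      - source_labels: ['__meta_nomad_tags']
        regex: '.*metrics.*'
        action: keep
      # carry the service name through as a regular label
      - source_labels: ['__meta_nomad_service']
        target_label: service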
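The remaining changed files are not expanded in this view. For the promtail entries in the commit message (journal scraping target, health check, dynamic service port with a gRPC check), a minimal journald scrape target in the shape Promtail documents — the max_age, labels and relabelling below are assumptions rather than the committed values:

scrape_configs:
  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: systemd-journal
    relabel_configs:
      - source_labels: ['__journal__systemd_unit']
        target_label: unit

The corresponding Nomad service and check changes (dynamic port, gRPC check, readiness probe) presumably live in the promtail jobspec, which is one of the files not loaded here.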