Skip to content

Commit

Permalink
feat(monitoring): grafana agent (#125)
Browse files Browse the repository at this point in the history
feat(grafana-agent): add initial configuration of grafana agent
fix(monitoring): add vault and update template
fix(monitoring): define http and grpc listen ports for agent
fix(monitoring): add update and restart stanzas to grafana agent
chore(monitoring): set identity ttl to 1h
chore: add resource limits to grafana agent

---------

Signed-off-by: Bruce Becker <[email protected]>
  • Loading branch information
brucellino authored Apr 7, 2024
1 parent af8643c commit 20304fb
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 0 deletions.
102 changes: 102 additions & 0 deletions grafana-agent/grafana-agent.nomad
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Root of the download location for Grafana Agent release archives. The job's
# artifact source appends "/v<version>/<archive name>" to this value.
variable "graf_agent_rel_url" {
  type        = string
  default     = "https://github.com/grafana/agent/releases/download"
  description = "Base URL for grafana release packages."
}
# Release to download, written without the leading "v": the artifact URL in
# the job adds that prefix itself.
variable "graf_agent_version" {
  type        = string
  default     = "0.40.3"
  description = "Grafana Agent version to be used."
}

# Intended default scrape interval for the agent.
# NOTE(review): nothing in the job visible here references this variable, and
# grafana-agent.yml.tmpl hard-codes "60s" -- confirm whether it should be wired
# into the rendered configuration.
variable "scrape_interval" {
  type        = string
  default     = "60s"
  description = "Default scrape interval"
}

# Runs Grafana Agent as a Nomad system job. The agent binary is downloaded as
# a release artifact and started with the raw_exec driver, using a
# configuration file rendered from grafana-agent.yml.tmpl.
job "grafana-agent" {
  type = "system"

  # No explicit Vault settings are given; the agent template reads its
  # credentials through the template `secret` function.
  vault {}

  group "nodes" {
    # Retry a failed task twice within five minutes, then delay before the
    # next attempt; templates are re-rendered when the task restarts.
    restart {
      attempts         = 2
      interval         = "5m"
      mode             = "delay"
      render_templates = true
    }

    # Roll out changes to at most three allocations at a time, gated on the
    # service checks below, and revert automatically on a failed deployment.
    # NOTE(review): confirm that `canary` and `auto_promote` are honoured for
    # system jobs on the Nomad version in use; older releases only enforce
    # `max_parallel` and `stagger` for this job type.
    update {
      max_parallel      = 3
      health_check      = "checks"
      min_healthy_time  = "10s"
      healthy_deadline  = "5m"
      progress_deadline = "10m"
      auto_revert       = true
      auto_promote      = true
      canary            = 1
    }

    # Dynamically allocated ports for the agent's HTTP and gRPC listeners.
    network {
      port "http" {}
      port "grpc" {}
    }

    task "agent" {
      driver = "raw_exec"

      resources {
        cpu    = 25
        memory = 25
      }

      # Workload identity token with audience "vault.io", exposed both as an
      # environment variable and as a file; the task restarts when it changes.
      identity {
        name        = "vault"
        aud         = ["vault.io"]
        env         = true
        file        = true
        change_mode = "restart"
        ttl         = "1h"
      }

      # HTTP listener, health-checked on the agent's /-/healthy endpoint.
      service {
        name = "grafana-agent-http"
        port = "http"

        check {
          name     = "agent_health"
          type     = "http"
          path     = "/-/healthy"
          interval = "20s"
          timeout  = "5s"
        }
      }

      # gRPC listener, checked with a plain TCP connect.
      service {
        name = "grafana-agent-grpc"
        port = "grpc"

        check {
          type     = "tcp"
          interval = "20s"
          timeout  = "5s"
        }
      }

      # Read by the agent template through `env "HOSTNAME"` to label metrics
      # and logs. Written as a quoted interpolation, like every other node
      # attribute and runtime variable reference in this file.
      env {
        HOSTNAME = "${attr.unique.hostname}"
      }

      # The rendered configuration embeds Grafana Cloud credentials read from
      # Vault, so it is written to the task's secrets directory and readable
      # only by its owner, rather than to local/ with default permissions.
      template {
        data        = file("grafana-agent.yml.tmpl")
        destination = "${NOMAD_SECRETS_DIR}/agent.yml"
        perms       = "0600"
      }

      # Release archive for this node's CPU architecture. `mode = "file"`
      # places the download at exactly the path the command below runs.
      artifact {
        source      = "${var.graf_agent_rel_url}/v${var.graf_agent_version}/grafana-agent-linux-${attr.cpu.arch}.zip"
        destination = "local/grafana-agent"
        mode        = "file"
      }

      config {
        command = "local/grafana-agent"
        args = [
          "-config.file", "${NOMAD_SECRETS_DIR}/agent.yml",
          "-server.http.address", "${NOMAD_ADDR_http}",
          "-server.grpc.address", "${NOMAD_ADDR_grpc}",
          "-disable-reporting"
        ]
      }
    }
  }
}
81 changes: 81 additions & 0 deletions grafana-agent/grafana-agent.yml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Grafana Agent configuration, rendered by the Nomad template block.
# Every credential and endpoint below comes from the Vault secret opened by
# the surrounding `with secret` action; HOSTNAME comes from the task
# environment.
{{ with secret "hashiatho.me-v2/grafana_cloud" }}
server:
  log_level: info

metrics:
  wal_directory: /tmp/wal
  global:
    scrape_interval: 60s
    remote_write:
      # Quoted like the integrations remote-write URL further down, so the
      # substituted value is always read as a string.
      - url: '{{ .Data.data.metrics_url }}/api/prom/push'
        basic_auth:
          password: '{{ .Data.data.api_key }}'
          username: '{{ .Data.data.metrics_id }}'

logs:
  configs:
    - name: basic
      clients:
        - basic_auth:
            username: '{{ .Data.data.logs_id }}'
            password: '{{ .Data.data.api_key }}'
          url: '{{ .Data.data.logs_url }}/loki/api/v1/push'
      positions:
        filename: /tmp/positions.yaml
      scrape_configs:
        # systemd journal entries, with selected journal fields promoted to labels
        - job_name: integrations/node_exporter_journal_scrape
          journal:
            max_age: 24h
            labels:
              instance: '{{ env "HOSTNAME" }}'
              job: integrations/node_exporter
          relabel_configs:
            - source_labels: ['__journal__systemd_unit']
              target_label: 'unit'
            - source_labels: ['__journal__boot_id']
              target_label: 'boot_id'
            - source_labels: ['__journal__transport']
              target_label: 'transport'
            - source_labels: ['__journal_priority_keyword']
              target_label: 'level'
        # plain log files under /var/log
        - job_name: integrations/node_exporter_direct_scrape
          static_configs:
            - targets:
                - localhost
              labels:
                instance: '{{ env "HOSTNAME" }}'
                __path__: /var/log/{syslog,messages,*.log}
                job: integrations/node_exporter

integrations:
  prometheus_remote_write:
    - basic_auth:
        password: '{{ .Data.data.api_key }}'
        username: '{{ .Data.data.metrics_id }}'
      url: '{{ .Data.data.metrics_url }}/api/prom/push'
  node_exporter:
    enabled: true
    # disable unused collectors
    disable_collectors:
      - ipvs  # high cardinality on kubelet
      - btrfs
      - infiniband
      - xfs
      - zfs
    # exclude dynamic interfaces
    netclass_ignored_devices: "^(veth.*|cali.*|[a-f0-9]{15})$"
    netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$"
    # disable tmpfs
    filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
    # report the node's hostname as the instance label
    relabel_configs:
      - replacement: '{{ env "HOSTNAME" }}'
        target_label: instance
    metric_relabel_configs:
      # drop extensive scrape statistics
      - action: drop
        regex: node_scrape_collector_.+
        source_labels:
          - __name__
      # allow-list of forwarded metrics; the expression must stay on a single
      # line, because a break inside a metric name changes what it matches
      - action: keep
        regex: node_arp_entries|node_boot_time_seconds|node_context_switches_total|node_cpu_seconds_total|node_disk_io_time_seconds_total|node_disk_io_time_weighted_seconds_total|node_disk_read_bytes_total|node_disk_read_time_seconds_total|node_disk_reads_completed_total|node_disk_write_time_seconds_total|node_disk_writes_completed_total|node_disk_written_bytes_total|node_filefd_allocated|node_filefd_maximum|node_filesystem_avail_bytes|node_filesystem_device_error|node_filesystem_files|node_filesystem_files_free|node_filesystem_readonly|node_filesystem_size_bytes|node_intr_total|node_load1|node_load15|node_load5|node_md_disks|node_md_disks_required|node_memory_Active_anon_bytes|node_memory_Active_bytes|node_memory_Active_file_bytes|node_memory_AnonHugePages_bytes|node_memory_AnonPages_bytes|node_memory_Bounce_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_CommitLimit_bytes|node_memory_Committed_AS_bytes|node_memory_DirectMap1G_bytes|node_memory_DirectMap2M_bytes|node_memory_DirectMap4k_bytes|node_memory_Dirty_bytes|node_memory_HugePages_Free|node_memory_HugePages_Rsvd|node_memory_HugePages_Surp|node_memory_HugePages_Total|node_memory_Hugepagesize_bytes|node_memory_Inactive_anon_bytes|node_memory_Inactive_bytes|node_memory_Inactive_file_bytes|node_memory_Mapped_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_memory_SReclaimable_bytes|node_memory_SUnreclaim_bytes|node_memory_ShmemHugePages_bytes|node_memory_ShmemPmdMapped_bytes|node_memory_Shmem_bytes|node_memory_Slab_bytes|node_memory_SwapTotal_bytes|node_memory_VmallocChunk_bytes|node_memory_VmallocTotal_bytes|node_memory_VmallocUsed_bytes|node_memory_WritebackTmp_bytes|node_memory_Writeback_bytes|node_netstat_Icmp6_InErrors|node_netstat_Icmp6_InMsgs|node_netstat_Icmp6_OutMsgs|node_netstat_Icmp_InErrors|node_netstat_Icmp_InMsgs|node_netstat_Icmp_OutMsgs|node_netstat_IpExt_InOctets|node_netstat_IpExt_OutOctets|node_netstat_TcpExt_ListenDrops|node_netstat_TcpExt_ListenOverflows|node_netstat_TcpExt_TCPSynRetrans|node_netstat_Tcp_InErrs|node_netstat_Tcp_InSegs|node_netstat_Tcp_OutRsts|node_netstat_Tcp_OutSegs|node_netstat_Tcp_RetransSegs|node_netstat_Udp6_InDatagrams|node_netstat_Udp6_InErrors|node_netstat_Udp6_NoPorts|node_netstat_Udp6_OutDatagrams|node_netstat_Udp6_RcvbufErrors|node_netstat_Udp6_SndbufErrors|node_netstat_UdpLite_InErrors|node_netstat_Udp_InDatagrams|node_netstat_Udp_InErrors|node_netstat_Udp_NoPorts|node_netstat_Udp_OutDatagrams|node_netstat_Udp_RcvbufErrors|node_netstat_Udp_SndbufErrors|node_network_carrier|node_network_info|node_network_mtu_bytes|node_network_receive_bytes_total|node_network_receive_compressed_total|node_network_receive_drop_total|node_network_receive_errs_total|node_network_receive_fifo_total|node_network_receive_multicast_total|node_network_receive_packets_total|node_network_speed_bytes|node_network_transmit_bytes_total|node_network_transmit_compressed_total|node_network_transmit_drop_total|node_network_transmit_errs_total|node_network_transmit_fifo_total|node_network_transmit_multicast_total|node_network_transmit_packets_total|node_network_transmit_queue_length|node_network_up|node_nf_conntrack_entries|node_nf_conntrack_entries_limit|node_os_info|node_sockstat_FRAG6_inuse|node_sockstat_FRAG_inuse|node_sockstat_RAW6_inuse|node_sockstat_RAW_inuse|node_sockstat_TCP6_inuse|node_sockstat_TCP_alloc|node_sockstat_TCP_inuse|node_sockstat_TCP_mem|node_sockstat_TCP_mem_bytes|node_sockstat_TCP_orphan|node_sockstat_TCP_tw|node_sockstat_UDP6_inuse|node_sockstat_UDPLITE6_inuse|node_sockstat_UDPLITE_inuse|node_sockstat_UDP_inuse|node_sockstat_UDP_mem|node_sockstat_UDP_mem_bytes|node_sockstat_sockets_used|node_softnet_dropped_total|node_softnet_processed_total|node_softnet_times_squeezed_total|node_systemd_unit_state|node_textfile_scrape_error|node_time_zone_offset_seconds|node_timex_estimated_error_seconds|node_timex_maxerror_seconds|node_timex_offset_seconds|node_timex_sync_status|node_uname_info|node_vmstat_oom_kill|node_vmstat_pgfault|node_vmstat_pgmajfault|node_vmstat_pgpgin|node_vmstat_pgpgout|node_vmstat_pswpin|node_vmstat_pswpout|process_max_fds|process_open_fds
        source_labels:
          - __name__
{{ end }}

0 comments on commit 20304fb

Please sign in to comment.