Skip to content

Commit

Permalink
add more collectors and analyzers (#1398)
Browse files Browse the repository at this point in the history
  • Loading branch information
adamancini authored Oct 31, 2024
1 parent f9eae1d commit 7024616
Showing 1 changed file with 195 additions and 15 deletions.
210 changes: 195 additions & 15 deletions pkg/goods/support/host-support-bundle.tmpl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ spec:
collectorName: top
command: top
args: ['-b', '-n', '1']
# Full `uname -a` output for the host.
- run:
    collectorName: "uname"
    command: "uname"
    args:
      - "-a"
# Prints the host name as reported by three different sources, one per line,
# so that disagreements between them are visible in the bundle.
- run:
    collectorName: hostnames
    command: sh
    args:
      - '-c'
      - |
        echo "hostname = $(hostname)"
        echo "/proc/sys/kernel/hostname = $(cat /proc/sys/kernel/hostname)"
        echo "uname -n = $(uname -n)"
- run:
collectorName: df
command: df
Expand All @@ -88,6 +101,82 @@ spec:
# Output of `uptime`, followed by the SELinux (`sestatus`) and AppArmor
# (`apparmor_status`) status commands, each run without arguments.
- run:
    collectorName: "uptime"
    command: "uptime"
- run:
    collectorName: "sestatus"
    command: "sestatus"
    args: []
- run:
    collectorName: "apparmor-status"
    command: "apparmor_status"
    args: []
# Packet-filter and IPVS state: iptables rules and version, the nftables
# "filter" table, and the IPVS table in numeric form.
- run:
    collectorName: "iptables"
    command: "iptables"
    args:
      - '-L'
      - '-v'
- run:
    collectorName: "iptables-version"
    command: "iptables"
    args:
      - '--version'
- run:
    collectorName: "nftables-list"
    command: "nft"
    args:
      - 'list'
      - 'table'
      - 'filter'
- run:
    collectorName: ipvsadm
    command: ipvsadm
    args:
      - '-l'
      - '-n'
# Block devices together with their filesystem details (`lsblk --fs`).
- run:
    collectorName: lsblk
    command: lsblk
    args:
      - '--fs'
# LVM physical volumes, volume groups and logical volumes.
#
# Every run collector gets its own collectorName. Run-collector output is
# addressed by that name (analyzers in this spec read
# host-collectors/run-host/<collectorName>.txt), so three collectors that are
# all called "lvm" would compete for the same output file and only one
# command's output could be found afterwards.
- run:
    collectorName: lvm-pvdisplay
    command: pvdisplay
    args: []
- run:
    collectorName: lvm-vgdisplay
    command: vgdisplay
    args: []
- run:
    collectorName: lvm-lvdisplay
    command: lvdisplay
    args: []
# Listening sockets and the routing table as seen by netstat.
- run:
    collectorName: netstat-ports
    command: netstat
    args:
      - '-t'
      - '-u'
      - '-l'
      - '-p'
      - '-n'
- run:
    collectorName: netstat-route-table
    command: netstat
    args:
      - '-r'
      - '-n'
# Name-resolution configuration: resolver status plus the raw config files.
- run:
    collectorName: resolvectl-status
    command: resolvectl
    args:
      - 'status'
- run:
    collectorName: resolv-conf
    command: cat
    args:
      - '/etc/resolv.conf'
- run:
    collectorName: systemd-resolved-conf
    command: cat
    args:
      - '/etc/systemd/resolved.conf'
- run:
    collectorName: nsswitch-conf
    command: cat
    args:
      - '/etc/nsswitch.conf'
- run:
    collectorName: hosts
    command: cat
    args:
      - '/etc/hosts'
# Routing table as seen by iproute2.
- run:
    collectorName: ip-route-table
    command: ip
    args:
      - 'route'
# Dump of all kernel parameters (`sysctl -a`).
- run:
    collectorName: sysctl
    command: sysctl
    args:
      - '-a'
- run:
collectorName: k0s-version
command: /usr/local/bin/k0s
Expand Down Expand Up @@ -116,6 +205,41 @@ spec:
# Copies everything matching the glob under /etc/embedded-cluster.
- copy:
    collectorName: "runtime-config"
    path: "/etc/embedded-cluster/*"
# `systemctl status` for the firewalld and systemd-resolved units.
- run:
    collectorName: systemctl-firewalld-status
    command: systemctl
    args:
      - 'status'
      - 'firewalld'
- run:
    collectorName: systemctl-resolved-status
    command: systemctl
    args:
      - 'status'
      - 'systemd-resolved'
# Unit definitions (`systemctl cat`) for journald, systemd-resolved and the
# k0s controller and worker services.
- run:
    collectorName: systemctl-cat-journald
    command: systemctl
    args:
      - 'cat'
      - 'systemd-journald'
- run:
    collectorName: systemctl-cat-resolved
    command: systemctl
    args:
      - 'cat'
      - 'systemd-resolved'
- run:
    collectorName: systemctl-cat-k0scontroller
    command: systemctl
    args:
      - 'cat'
      - 'k0scontroller.service'
- run:
    collectorName: systemctl-cat-k0sworker
    command: systemctl
    args:
      - 'cat'
      - 'k0sworker.service'
# Kernel messages from the journal, limited to the last 7 days.
- run:
    collectorName: journalctl-dmesg
    command: journalctl
    args:
      - '--dmesg'
      - '--no-pager'
      - '-S'
      - '7 days ago'
# Current syslog file.
- copy:
    collectorName: syslog
    path: "/var/log/syslog"
# Copy the previous syslog file as well in case the current one is rotated.
- copy:
    collectorName: syslog
    path: "/var/log/syslog.1"
- run:
collectorName: network-manager-logs
command: journalctl
Expand Down Expand Up @@ -156,9 +280,33 @@ spec:
collectorName: 'check-umount'
command: 'sh'
args: ['-c', 'command -v umount']
# Output of `mount -l`.
- run:
    collectorName: mount
    command: mount
    args:
      - '-l'
# Systemd drop-in files for the local-artifact-mirror service.
- copy:
    collectorName: "installer/lam-service-config"
    path: "/etc/systemd/system/local-artifact-mirror.service.d/*"
# Counts processes whose state column starts with R or D, grouped by identical
# "state user cmd" lines, and keeps the 20 most frequent.
- run:
    collectorName: "ps-high-load"
    command: "sh"
    args:
      - "-c"
      # The grep pattern is single-quoted so the shell hands ^[RD] to grep
      # verbatim: unquoted, the bracket expression is subject to pathname
      # expansion and "^" is special in some legacy shells.
      - "ps -eo s,user,cmd | grep '^[RD]' | sort | uniq -c | sort -nbr | head -20"
# Lists processes whose `ps -ef` line mentions one of the named security
# products; the trailing `grep -v grep` drops the grep process itself.
- run:
    collectorName: ps-detect-antivirus-and-security-tools
    command: sh
    args:
      - '-c'
      - "ps -ef | grep -E 'clamav|sophos|esets_daemon|fsav|symantec|mfend|ds_agent|kav|bdagent|s1agent|falcon|illumio|xagt' | grep -v grep"
# Filesystem performance probe for the etcd directory. Its result is read by
# the "Filesystem Write Latency" analyzer, which refers to this collectorName.
# {{ .K0sDataDir }} is a template placeholder; presumably it is filled in with
# the k0s data directory when this spec is rendered — confirm against the
# code that executes the template.
- filesystemPerformance:
collectorName: filesystem-write-latency-etcd
timeout: 5m
directory: {{ .K0sDataDir }}/etcd
fileSize: 22Mi
operationSize: 2300
datasync: true
runTime: "0" # let it run to completion
# Resolves the name "localhost" with `host`. The output is matched by the
# "Check if localhost resolves to 127.0.0.1" analyzer.
# NOTE(review): `host` is a DNS lookup tool and may not consult /etc/hosts,
# while that analyzer's failure message tells the user to fix /etc/hosts —
# confirm this is the intended probe.
- run:
collectorName: "localhost-ips"
command: "sh"
args: ["-c", "host localhost"]
hostAnalyzers:
- ipv4Interfaces:
outcomes:
Expand All @@ -173,9 +321,9 @@ spec:
outcomes:
- fail:
when: "< 2G"
message: At least 2G of memory is recommended
message: At least 2GB of memory is required, but less is present
- pass:
message: The system has at least 2G of memory
message: At least 2GB of memory is present
- diskUsage:
checkName: Root disk usage
collectorName: root-disk-usage
Expand Down Expand Up @@ -243,21 +391,21 @@ spec:
outcomes:
- fail:
when: "false"
message: Kubernetes API probing is reporting a failure
message: Kubernetes API probing reported a failure
- pass:
when: "true"
message: Kubernetes API probing is reporting success
message: Kubernetes API probing reported success
# Fails when the collected NetworkManager log contains a "state change: config"
# line for a device whose name contains "cali".
# Each outcome below carries two message lines; this looks like the removed
# wording followed by its replacement from the rendered diff — TODO confirm.
- textAnalyze:
checkName: NetworkManager managing calico interfaces
fileName: host-collectors/run-host/network-manager-logs.txt
regex: 'device .*cali.+: state change: config'
outcomes:
- fail:
when: "true"
message: NetworkManager seems to be managing calico interfaces
message: NetworkManager is managing Calico interfaces
- pass:
when: "false"
message: NetworkManager isn't managing calico interfaces
message: NetworkManager isn't managing Calico interfaces
- hostServices:
checkName: "Local Artifact Mirror"
outcomes:
Expand All @@ -272,13 +420,13 @@ spec:
outcomes:
- fail:
when: 'ntp == unsynchronized+inactive'
message: 'System clock is not synchronized'
message: NTP is inactive and the system clock is not synchronized. Enable NTP and synchronize the system clock to continue.
- fail:
when: 'ntp == unsynchronized+active'
message: System clock is not yet synchronized
message: NTP is enabled but the system clock is not synchronized. Synchronize the system clock to continue.
- pass:
when: 'ntp == synchronized+active'
message: 'System clock is synchronized'
message: NTP is enabled and the system clock is synchronized
- fail:
message: 'Unable to determine system clock status'
- jsonCompare:
Expand Down Expand Up @@ -395,7 +543,7 @@ spec:
message: "/proc filesystem is mounted"
- fail:
when: "false"
message: "/proc filesystem is not mounted"
message: /proc filesystem must be mounted, but it currently is not
- textAnalyze:
checkName: Check if 'modprobe' command exists in PATH
fileName: host-collectors/run-host/check-modprobe.txt
Expand All @@ -406,7 +554,7 @@ spec:
message: "'modprobe' command exists in PATH"
- fail:
when: "false"
message: "'modprobe' command does not exist in PATH"
message: "'modprobe' command must exist in PATH"
- textAnalyze:
checkName: Check if 'mount' command exists in PATH
fileName: host-collectors/run-host/check-mount.txt
Expand All @@ -417,7 +565,7 @@ spec:
message: "'mount' command exists in PATH"
- fail:
when: "false"
message: "'mount' command does not exist in PATH"
message: "'mount' command must exist in PATH"
- textAnalyze:
checkName: Check if 'umount' command exists in PATH
fileName: host-collectors/run-host/check-umount.txt
Expand All @@ -428,15 +576,15 @@ spec:
message: "'umount' command exists in PATH"
- fail:
when: "false"
message: "'umount' command does not exist in PATH"
message: "'umount' command must exist in PATH"
# Passes when the host kernel is 3.10 or newer; any other result fails.
#
# The reworded messages had been attached to the wrong outcomes: the pass
# outcome said the version "must be" at least 3.10 and the fail outcome said
# it "is" at least 3.10. They are swapped here so each outcome reports what
# actually happened. The earlier wording is kept as a comment for reference.
- hostOS:
    checkName: Check minimum kernel version
    outcomes:
      - pass:
          when: "kernelVersion >= 3.10"
          # previous wording: "Minimum kernel version of 3.10 has been met"
          message: Kernel version is at least 3.10
      - fail:
          # previous wording: "Minimum kernel version of 3.10 has not been met"
          message: Kernel version must be at least 3.10
- textAnalyze:
checkName: Hostname Mismatch
fileName: host-collectors/run-host/k0scontroller-logs.txt
Expand All @@ -448,3 +596,35 @@ spec:
- pass:
when: "false"
message: "No signs of hostname changes found"
# Passes when the collected `host localhost` output reports 127.0.0.1 as an
# address for localhost.
- textAnalyze:
    checkName: Check if localhost resolves to 127.0.0.1
    fileName: host-collectors/run-host/localhost-ips.txt
    # Dots are escaped so they match only a literal ".", and the trailing \b
    # stops a longer address such as 127.0.0.10 from satisfying the check.
    regex: 'localhost has address 127\.0\.0\.1\b'
    outcomes:
      - fail:
          when: "false"
          message: "'localhost' does not resolve to 127.0.0.1. Ensure your /etc/hosts file contains an entry for 'localhost' with a loopback address of 127.0.0.1."
      - pass:
          when: "true"
          message: "'localhost' resolves to 127.0.0.1"
# Fails when the collected process listing names one of the listed security
# products as a whole word. ignoreIfNoFiles is set, so the check is skipped
# rather than failed when the collector produced no file.
- textAnalyze:
    checkName: Detect Threat Management and Network Security Tools
    fileName: "host-collectors/run-host/ps-detect-antivirus-and-security-tools.txt"
    regex: '\b(clamav|sophos|esets_daemon|fsav|symantec|mfend|ds_agent|kav|bdagent|s1agent|falcon|illumio|xagt)\b'
    ignoreIfNoFiles: true
    outcomes:
      - fail:
          when: 'true'
          message: "Antivirus or network security tools detected. These tools are known to interfere with Kubernetes operation in various ways. If problems persist, disable these tools, or consult with your organization's system administrator to ensure that exceptions are made for Kubernetes operation."
      - pass:
          when: 'false'
          message: 'No antivirus or network security tools detected.'
# Evaluates the result of the filesystem-write-latency-etcd collector: passes
# when the p99 latency is under 10ms, fails otherwise.
# {{ "{{" }} and {{ "}}" }} emit literal braces when this template is rendered,
# leaving a {{ .P99 }} placeholder in the resulting message text.
- filesystemPerformance:
    checkName: "Filesystem Write Latency"
    collectorName: "filesystem-write-latency-etcd"
    outcomes:
      - pass:
          when: 'p99 < 10ms'
          message: 'P99 write latency for the disk at {{ .K0sDataDir }}/etcd is {{ "{{" }} .P99 {{ "}}" }}, which is better than the 10 ms requirement.'
      - fail:
          message: 'P99 write latency for the disk at {{ .K0sDataDir }}/etcd is {{ "{{" }} .P99 {{ "}}" }}, but it must be less than 10 ms. A higher-performance disk is required.'

0 comments on commit 7024616

Please sign in to comment.