diff --git a/pkg/supportbundle/staticspecs/defaultspec.yaml b/pkg/supportbundle/staticspecs/defaultspec.yaml
index db10366ed8..64b6c43e5f 100644
--- a/pkg/supportbundle/staticspecs/defaultspec.yaml
+++ b/pkg/supportbundle/staticspecs/defaultspec.yaml
@@ -54,11 +54,6 @@ spec:
         name: kots/admin_console
         selector:
           - app=kotsadm
-    - logs:
-        collectorName: kurl-proxy-kotsadm
-        name: kots/admin_console
-        selector:
-          - app=kurl-proxy-kotsadm
     - logs:
         collectorName: kotsadm-dex
         name: kots/admin_console
@@ -79,12 +74,6 @@ spec:
         name: kotsadm-replicated-registry # NOTE: this will not live under the kots/ directory like other collectors
         includeValue: false
         key: .dockerconfigjson
-    - logs:
-        collectorName: kube-flannel
-        selector:
-          - app=flannel
-        namespace: kube-flannel
-        name: kots/kurl/flannel
     - exec:
         args:
           - "http://goldpinger.kurl.svc.cluster.local:80/check_all"
@@ -97,7 +86,6 @@ spec:
           - app=kotsadm
         timeout: 10s
     - nodeMetrics: {}
-
   analyzers:
     - clusterVersion:
         outcomes:
@@ -122,18 +110,6 @@ spec:
     - deploymentStatus: {}
     - jobStatus: {}
     - replicasetStatus: {}
-    - textAnalyze:
-        checkName: Inter-pod Networking
-        exclude: ""
-        ignoreIfNoFiles: true
-        fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt
-        outcomes:
-          - fail:
-              when: "OK = false"
-              message: Some nodes have pod communication issues
-          - pass:
-              message: Goldpinger can communicate properly
-        regexGroups: '"OK": ?(?P<OK>\w+)'
     - nodeResources:
         checkName: Node status check
         outcomes:
@@ -145,10 +121,6 @@ spec:
               message: "Not all nodes are online."
           - pass:
               message: "All nodes are online."
-    - clusterPodStatuses:
-        checkName: contour pods unhealthy
-        namespaces:
-          - projectcontour
         outcomes:
           - fail:
               when: "!= Healthy" # Catch all unhealthy pods. A pod is considered healthy if it has a status of Completed, or Running and all of its containers are ready.
@@ -174,4 +146,4 @@ spec:
               when: "pvcUsedPercentage >= 80%"
               message: "kotsadm-minio PVC using more than 80% of storage"
           - pass:
-              message: "kotsadm-minio PVC is using less than 80% of storage"
\ No newline at end of file
+              message: "kotsadm-minio PVC is using less than 80% of storage"
diff --git a/pkg/supportbundle/staticspecs/kurlspec.yaml b/pkg/supportbundle/staticspecs/kurlspec.yaml
index c2a1e72316..af20427859 100644
--- a/pkg/supportbundle/staticspecs/kurlspec.yaml
+++ b/pkg/supportbundle/staticspecs/kurlspec.yaml
@@ -3,13 +3,87 @@ kind: SupportBundle
 metadata:
   name: kurl-supportbundle
 spec:
+  uri: "https://raw.githubusercontent.com/replicatedhq/kots/main/pkg/supportbundle/staticspecs/kurlspec.yaml"
   collectors:
-    - ceph: {}
-    - longhorn: {}
+    - http:
+        collectorName: replicated.app-health-check
+        get:
+          url: https://replicated.app/healthz
+    - logs:
+        collectorName: weave-net
+        selector:
+          - name=weave-net
+        namespace: kube-system
+        name: kots/kurl/weave
+    - logs:
+        collectorName: registry
+        name: kots/kurl
+        selector:
+          - app=registry
+        namespace: kurl
+    - logs:
+        collectorName: ekc-operator
+        name: kots/kurl
+        selector:
+          - app=ekc-operator
+        namespace: kurl
+    - logs:
+        collectorName: velero-logs
+        namespace: velero
+        name: kots/velero
     - logs:
         collectorName: rook-ceph-logs
         namespace: rook-ceph
         name: kots/rook
+    - logs:
+        collectorName: kurl-control-plane
+        name: kots/kurl/control-plane
+        selector:
+          - tier=control-plane
+    - logs:
+        collectorName: kurl-proxy-kotsadm
+        name: kots/admin_console
+        selector:
+          - app=kurl-proxy-kotsadm
+    - logs:
+        collectorName: coredns-logs
+        namespace: kube-system
+        name: logs/coredns
+        selector:
+          - k8s-app=kube-dns
+    - logs:
+        collectorName: minio
+        selector:
+          - app=minio
+        namespace: minio
+        name: kots/kurl/minio
+    - logs:
+        collectorName: ha-minio
+        selector:
+          - app=ha-minio
+        namespace: minio
+        name: kots/kurl/ha-minio
+    - logs:
+        collectorName: projectcontour-logs
+        namespace: projectcontour
+        name: projectcontour/logs
+    - logs:
+        collectorName: kube-flannel
+        selector:
+          - app=flannel
+        namespace: kube-flannel
+        name: kots/kurl/flannel
+    - exec:
+        args:
+          - "http://goldpinger.kurl.svc.cluster.local:80/check_all"
+        collectorName: goldpinger-statistics
+        command:
+          - curl
+        containerName: kotsadm
+        name: kots/goldpinger
+        selector:
+          - app=kotsadm
+        timeout: 10s
     - exec:
         collectorName: weave-status
         command:
@@ -38,24 +112,80 @@ spec:
         selector:
           - name=weave-net
         timeout: 10s
-    - logs:
+    - runPod:
+        name: rqlite-status
+        namespace: default
+        podSpec:
+          containers:
+            - name: rqlite-status
+              image: busybox:1
+              command: ["wget"]
+              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/status?pretty", "-O-"]
+    - runPod:
+        name: rqlite-nodes
+        namespace: default
+        podSpec:
+          containers:
+            - name: rqlite-nodes
+              image: busybox:1
+              command: ["wget"]
+              args: ["-q", "-T", "5", "http://kotsadm-rqlite:4001/nodes?pretty&ver=2", "-O-"]
+    - copyFromHost:
+        collectorName: kurl-host-preflights
+        name: kots/kurl/host-preflights
+        hostPath: /var/lib/kurl/host-preflights
+        extractArchive: true
+        image: alpine
+        imagePullPolicy: IfNotPresent
+        timeout: 1m
+        serviceAccount: ekco
+    - copyFromHost:
+        collectorName: "copy apiserver audit logs"
+        image: alpine
+        hostPath: "/var/log/apiserver/"
+        name: "logs"
+        extractArchive: true
+    - copyFromHost:
+        collectorName: "copy kURL logs"
+        image: alpine
+        hostPath: "/var/log/kurl/"
+        name: "logs"
+        extractArchive: true
+    - configMap:
+        collectorName: coredns
+        name: coredns
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kube-proxy
+        name: kube-proxy
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kubeadm-config
+        name: kubeadm-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kubelet-config
+        name: kubelet-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
+        collectorName: kurl-config
+        name: kurl-config
+        namespace: kube-system
+        includeAllData: true
+    - configMap:
         collectorName: weave-net
-        selector:
-          - name=weave-net
+        name: weave-net
         namespace: kube-system
-        name: kots/kurl/weave
-    - logs:
-        collectorName: registry
-        name: kots/kurl
-        selector:
-          - app=registry
-        namespace: kurl
-    - logs:
-        collectorName: ekc-operator
-        name: kots/kurl
-        selector:
-          - app=ekc-operator
+        includeAllData: true
+    - configMap:
+        collectorName: ekco-config
+        name: ekco-config
         namespace: kurl
+        includeAllData: true
     - configMap:
         collectorName: kurl-current-config
         name: kurl-current-config # NOTE: this will not live under the kots/ directory like other collectors
@@ -66,15 +196,9 @@ spec:
         name: kurl-last-config # NOTE: this will not live under the kots/ directory like other collectors
         namespace: kurl
         includeAllData: true
-    - copyFromHost:
-        collectorName: kurl-host-preflights
-        name: kots/kurl/host-preflights
-        hostPath: /var/lib/kurl/host-preflights
-        extractArchive: true
-        image: alpine
-        imagePullPolicy: IfNotPresent
-        timeout: 1m
-
+    - ceph: {}
+    - longhorn: {}
+    - nodeMetrics: {}
   analyzers:
     - cephStatus: {}
     - longhorn: {}
@@ -138,3 +262,203 @@ spec:
               when: "false"
               message: "has access"
         regex: 'the server does not allow access to the requested resource'
+    - deploymentStatus:
+        checkName: Check EKCO is operational
+        name: ekc-operator
+        namespace: kurl
+        outcomes:
+          - fail:
+              when: absent
+              message: EKCO is not installed - please add the EKCO component to your kURL spec and re-run the installer script
+          - fail:
+              when: "< 1"
+              message: EKCO does not have any Ready pods
+          - pass:
+              message: EKCO is installed and running
+    - textAnalyze:
+        checkName: Check installed EKCO version for critical fixes
+        ignoreIfNoFiles: true
+        fileName: cluster-resources/deployments/kurl.json
+        regexGroups: '"image": "replicated/ekco:v(?P<Major>\d+)\.(?P<Minor>\d+)\.(?P<Patch>\d+)"'
+        outcomes:
+          - warn:
+              when: "Minor < 4"
+              message: A critical update for cluster certificate rotation has been released in EKCO 0.4.0. Please upgrade to the latest available version.
+          - warn:
+              when: "Minor < 19"
+              message: A critical fix for registry certificate rotation has been released in EKCO 0.19.3. Please upgrade to the latest available version.
+          - pass:
+              when: "Minor > 20"
+              message: EKCO version is recent
+    - clusterPodStatuses:
+        outcomes:
+          - fail:
+              when: "!= Healthy"
+              message: "Status: {{ .Status.Reason }}"
+    - statefulsetStatus: {}
+    - deploymentStatus: {}
+    - jobStatus: {}
+    - replicasetStatus: {}
+    - weaveReport:
+        reportFileGlob: kots/kurl/weave/kube-system/*/weave-report-stdout.txt
+    - textAnalyze:
+        checkName: Weave Status
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-status-stdout.txt
+        outcomes:
+          - fail:
+              message: Weave is not ready
+          - pass:
+              message: Weave is ready
+        regex: 'Status: ready'
+    - textAnalyze:
+        checkName: Weave Report
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/kurl/weave/kube-system/weave-net-*/weave-report-stdout.txt
+        outcomes:
+          - fail:
+              message: Weave is not ready
+          - pass:
+              message: Weave is ready
+        regex: '"Ready": true'
+    - textAnalyze:
+        checkName: Weave IP Allocation
+        exclude: ""
+        ignoreIfNoFiles: true
+        regex: 'IP allocation was seeded by different peers'
+        fileName: kots/kurl/weave/weave-net-*/weave.log
+        outcomes:
+          - fail:
+              when: "true"
+              message: IP Allocation issues detected. Please run `rm /var/lib/weave/weave-netdata.db && reboot` on each node to resolve this.
+          - pass:
+              when: "false"
+              message: Weave is ready, there are no IP allocation issues.
+    - textAnalyze:
+        checkName: Inter-pod Networking
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: kots/goldpinger/*/kotsadm-*/goldpinger-statistics-stdout.txt
+        outcomes:
+          - fail:
+              when: "OK = false"
+              message: Some nodes have pod communication issues
+          - pass:
+              message: Goldpinger can communicate properly
+        regexGroups: '"OK": ?(?P<OK>\w+)'
+    - textAnalyze:
+        checkName: Etcdserver Database Size Exceeded
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: "*"
+        regex: '(etcdserver)?.*mvcc.*database space exceeded'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "etcdserver database has grown too large. See https://community.replicated.com/t/kubernetes-cluster-is-down-and-reporting-etcdserver-mvcc-database-size-exceeded/1428"
+          - pass:
+              when: "false"
+              message: etcdserver database is not too large
+    - nodeResources:
+        checkName: Node status check
+        outcomes:
+          - fail:
+              when: "nodeCondition(Ready) == False"
+              message: "Not all nodes are online."
+          - fail:
+              when: "nodeCondition(Ready) == Unknown"
+              message: "Not all nodes are online."
+          - pass:
+              message: "All nodes are online."
+    - textAnalyze:
+        checkName: longhorn multipath conflict
+        ignoreIfNoFiles: true
+        fileName: longhorn/longhorn-system/logs/longhorn-csi-plugin-*/longhorn-csi-plugin.log
+        outcomes:
+          - fail:
+              when: "true"
+              uri: "https://longhorn.io/kb/troubleshooting-volume-with-multipath/"
+              message: "Longhorn volumes may be in use by system multipath."
+          - pass:
+              when: "false"
+              message: "No block-device conflicts detected"
+        regex: '.*is apparently in use by the system;.*'
+    - textAnalyze:
+        checkName: Minio disk full
+        fileName: cluster-resources/pods/logs/kurl/registry-*/registry.log
+        ignoreIfNoFiles: true
+        regex: '.*XMinioStorageFull: Storage backend has reached its minimum free disk threshold.*'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "Minio Disk Full"
+          - pass:
+              when: "false"
+              message: "Minio Disk Ok"
+    - textAnalyze:
+        checkName: Known issue with Rook < 1.4
+        exclude: ""
+        ignoreIfNoFiles: true
+        fileName: /ceph/status.json
+        regex: '\"ceph_release\": \"nautilus\"|\"status\": \"HEALTH_WARN\"'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "If you have been removing and adding nodes, you might want to ensure that you are not facing the scenario described in the community topic: https://community.replicated.com/t/1099"
+          - pass:
+              when: "false"
+              message: "You are not using a Rook version < 1.4 and/or your Ceph status is OK"
+    - textAnalyze:
+        checkName: Rook rbd filesystem consistency
+        fileName: /kots/rook/rook-ceph-agent-*.log
+        ignoreIfNoFiles: true
+        regex: 'UNEXPECTED INCONSISTENCY; RUN fsck MANUALLY.'
+        outcomes:
+          - fail:
+              when: "true"
+              message: "One or more rook rbd(s) were detected to have filesystem inconsistencies and require manual intervention"
+          - pass:
+              when: "false"
+              message: "Rook filesystem consistency ok"
+    - jsonCompare:
+        checkName: https://replicated.app host health check
+        fileName: replicated.app-health-check.json
+        path: "response.status"
+        value: "200"
+        outcomes:
+          - fail:
+              when: "false"
+              message: https://replicated.app is unhealthy. License and software update checks from replicated will fail. If this is a locked down environment, please check your proxy settings.
+              uri: https://kurl.sh/docs/install-with-kurl/proxy-installs
+          - pass:
+              when: "true"
+              message: https://replicated.app host is healthy
+    - storageClass:
+        checkName: Check for default storage class
+        outcomes:
+          - fail:
+              message: No default storage class found
+          - pass:
+              message: Default storage class found
+    - nodeMetrics:
+        checkName: Check for PVCs using more than 80% storage in the entire cluster
+        outcomes:
+          - fail:
+              when: "pvcUsedPercentage >= 80"
+              message: "There are PVCs using more than 80% of storage: {{ .PVC.ConcatenatedNames }}"
+          - pass:
+              message: "No PVCs are using more than 80% of storage"
+    - event:
+        checkName: event-oom-check
+        namespace: default
+        reason: "OOMKilling"
+        kind: Node
+        outcomes:
+          - fail:
+              when: "true"
+              message: Event {{ .Reason }} by object {{ .InvolvedObject.Name }} kind {{ .InvolvedObject.Kind }} has message {{ .Message }}
+          - pass:
+              when: "false"
+              message: No OOMKilling event detected
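
Reviewer note: the one line in kurlspec.yaml whose purpose is not obvious from the diff itself is the new top-level spec.uri field. As I understand the troubleshoot.sh behavior, when a SupportBundle spec carries a uri, the support-bundle runner first tries to fetch the spec from that URL and only falls back to the local copy if the fetch fails, so published fixes to the spec can take effect without shipping a new KOTS release. A minimal sketch of the shape, with illustrative names and URLs that are not part of this diff:

apiVersion: troubleshoot.sh/v1beta2
kind: SupportBundle
metadata:
  name: example-supportbundle
spec:
  # If this URL is reachable at collection time, the spec hosted there is used
  # instead of this local copy (assumption about troubleshoot.sh's uri handling).
  uri: "https://example.com/specs/example-supportbundle.yaml"
  collectors:
    - clusterInfo: {}
    - clusterResources: {}
  analyzers:
    - clusterVersion:
        outcomes:
          - pass:
              message: Cluster version information was collected

The diff applies the same pattern to kurlspec.yaml, pointing at the raw GitHub copy of that spec so updates land without a release.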