From f85b941b60d502658787e73717ef8bf6981734de Mon Sep 17 00:00:00 2001 From: Sohamdg081992 <31517098+Sohamdg081992@users.noreply.github.com> Date: Mon, 13 May 2024 16:08:51 -0700 Subject: [PATCH] Add tests for pod getting scheduled on all nodes (#875) [comment]: # (Note that your PR title should follow the conventional commit format: https://conventionalcommits.org/en/v1.0.0/#summary) # PR Description Add tests that check the daemonset pods are scheduled on all nodes, including FIPS and ARM64 nodes. [comment]: # (The below checklist is for PRs adding new features. If a box is not checked, add a reason why it's not needed.) # New Feature Checklist - [ ] List telemetry added about the feature. - [ ] Link to the one-pager about the feature. - [ ] List any tasks necessary for release (3P docs, AKS RP chart changes, etc.) after merging the PR. - [ ] Attach results of scale and perf testing. [comment]: # (The below checklist is for code changes. Not all boxes necessarily need to be checked. Build, doc, and template changes do not need to fill out the checklist.) # Tests Checklist - [x] Have end-to-end Ginkgo tests been run on your cluster and passed? To bootstrap your cluster to run the tests, follow [these instructions](/otelcollector/test/README.md#bootstrap-a-dev-cluster-to-run-ginkgo-tests). Yes - Labels used when running the tests on your cluster: - [ ] `operator` - [ ] `windows` - [ ] `arm64` - [ ] `arc-extension` - [x] Have new tests been added? Yes. For features, have tests been added for this feature? For fixes, is there a test that could have caught this issue and could validate that the fix works? - [ ] Is a new scrape job needed? - [ ] The scrape job was added to the folder [test-cluster-yamls](/otelcollector/test/test-cluster-yamls/) in the correct configmap or as a CR. - [x] Was a new test label added? Yes. - [x] A string constant for the label was added to [constants.go](/otelcollector/test/utils/constants.go). 
- [x] The label and description was added to the [test README](/otelcollector/test/README.md). - [x] The label was added to this [PR checklist](/.github/pull_request_template.md). - [ ] The label was added as needed to [testkube-test-crs.yaml](/otelcollector/test/testkube/testkube-test-crs.yaml). - [x] Are additional API server permissions needed for the new tests? No. - [ ] These permissions have been added to [api-server-permissions.yaml](/otelcollector/test/testkube/api-server-permissions.yaml). - [ ] Was a new test suite (a new folder under `/tests`) added? - [ ] The new test suite is included in [testkube-test-crs.yaml](/otelcollector/test/testkube/testkube-test-crs.yaml). --- .github/pull_request_template.md | 1 + otelcollector/test/README.md | 7 ++ .../containerstatus/container_status_test.go | 30 +++++++ otelcollector/test/utils/constants.go | 16 ++-- .../test/utils/kubernetes_api_utils.go | 84 +++++++++++++++++++ 5 files changed, 131 insertions(+), 7 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 900711982..4c7e904a4 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -19,6 +19,7 @@ - [ ] `windows` - [ ] `arm64` - [ ] `arc-extension` + - [ ] `fips` - [ ] Have new tests been added? For features, have tests been added for this feature? For fixes, is there a test that could have caught this issue and could validate that the fix works? - [ ] Is a new scrape job needed? - [ ] The scrape job was added to the folder [test-cluster-yamls](/otelcollector/test/test-cluster-yamls/) in the correct configmap or as a CR. 
diff --git a/otelcollector/test/README.md b/otelcollector/test/README.md index 8308aabd6..e722e1ccb 100644 --- a/otelcollector/test/README.md +++ b/otelcollector/test/README.md @@ -21,6 +21,12 @@ - ama-metrics-ksm replicaset - ama-metrics-targets-operator replicaset `label=operator` - prometheus-node-exporter daemonset `label=arc-extension` + - All Daemonset pods are scheduled on all the nodes. Nodes include: + - FIPS + - ARM64 + - All Daemonset pods have all containers running on all nodes. Nodes include: + - FIPS + - ARM64 - Liveness Probe - When processes aren't running on the `prometheus-collector` replicaset container, the container should restart. Processes include: - otelcollector @@ -53,6 +59,7 @@ - `windows`: Tests that should only run on clusters that have Windows nodes. - `arm64`: Tests that should only run on clusters taht have ARM64 nodes. - `linux-daemonset-custom-config`: Tests that should only run on clusters that have the ama-metrics-config-node configmap. +- `fips`: Tests that should only run on clusters that have FIPS nodes. # File Directory Structure ``` diff --git a/otelcollector/test/containerstatus/container_status_test.go b/otelcollector/test/containerstatus/container_status_test.go index e069153b6..da2a8bc5b 100644 --- a/otelcollector/test/containerstatus/container_status_test.go +++ b/otelcollector/test/containerstatus/container_status_test.go @@ -26,6 +26,34 @@ var _ = DescribeTable("The containers should be running", Entry("when checking the prometheus-node-exporter pod", "kube-system", "app", "prometheus-node-exporter", Label(utils.ArcExtensionLabel)), ) +/* + * For each of the DS pods that we deploy in our chart, ensure that all nodes have been used to schedule these pods. + * The label and values are provided to get a list of pods only with that label. + * The osLabel selects the nodes to check, based on each node's OS label. 
+ */
+var _ = DescribeTable("The pods should be scheduled in all nodes",
+	func(namespace string, controllerLabelName string, controllerLabelValue string, osLabel string) {
+		err := utils.CheckIfAllPodsScheduleOnNodes(K8sClient, namespace, controllerLabelName, controllerLabelValue, osLabel)
+		Expect(err).NotTo(HaveOccurred())
+	},
+	Entry("when checking the ama-metrics-node", "kube-system", "dsName", "ama-metrics-node", "linux"),
+	Entry("when checking the ama-metrics-win-node pod", "kube-system", "dsName", "ama-metrics-win-node", "windows", Label(utils.WindowsLabel)),
+)
+
+/*
+ * For each of the DS pods that we deploy in our chart, ensure that every node carrying a specific node label (for example FIPS or ARM64 nodes) has these pods scheduled on it.
+ * The controller label name and value select the pods to look for; the node label key and value select the nodes to check. Entry descriptions name the node type so each spec is unique in reports.
+ */
+var _ = DescribeTable("The pods should be scheduled in all Fips and ARM64 nodes",
+	func(namespace string, controllerLabelName string, controllerLabelValue string, nodeLabelKey string, nodeLabelValue string) {
+		err := utils.CheckIfAllPodsScheduleOnSpecificNodesLabels(K8sClient, namespace, controllerLabelName, controllerLabelValue, nodeLabelKey, nodeLabelValue)
+		Expect(err).NotTo(HaveOccurred())
+	},
+	Entry("when checking the ama-metrics-node pod on FIPS nodes", "kube-system", "dsName", "ama-metrics-node", "kubernetes.azure.com/fips_enabled", "true", Label(utils.FIPSLabel)),
+	Entry("when checking the ama-metrics-win-node pod on FIPS nodes", "kube-system", "dsName", "ama-metrics-win-node", "kubernetes.azure.com/fips_enabled", "true", Label(utils.WindowsLabel), Label(utils.FIPSLabel)),
+	Entry("when checking the ama-metrics-node pod on ARM64 nodes", "kube-system", "dsName", "ama-metrics-node", "kubernetes.io/arch", "arm64", Label(utils.ARM64Label)),
+)
+
 /* * For each of the pods that have the prometheus-collector container, check all expected processes are running. * The linux replicaset and daemonset will should have the same processes running.
@@ -95,6 +123,8 @@ var _ = DescribeTable("The container logs should not contain errors", }, Entry("when checking the ama-metrics replica pods", "kube-system", "rsName", "ama-metrics"), Entry("when checking the ama-metrics-node", "kube-system", "dsName", "ama-metrics-node"), + Entry("when checking the ama-metrics replica pods", "kube-system", "rsName", "ama-metrics", Label(utils.ARM64Label)), + Entry("when checking the ama-metrics-node", "kube-system", "dsName", "ama-metrics-node", Label(utils.ARM64Label)), Entry("when checking the ama-metrics-win-node", "kube-system", "dsName", "ama-metrics-win-node", Label(utils.WindowsLabel)), Entry("when checking the ama-metrics-ksm pod", "kube-system", "app.kubernetes.io/name", "ama-metrics-ksm"), Entry("when checking the ama-metrics-operator-targets pod", "kube-system", "rsName", "ama-metrics-operator-targets", Label(utils.OperatorLabel)), diff --git a/otelcollector/test/utils/constants.go b/otelcollector/test/utils/constants.go index 476a042d3..f6723cfc7 100644 --- a/otelcollector/test/utils/constants.go +++ b/otelcollector/test/utils/constants.go @@ -1,5 +1,6 @@ package utils + var( // Slices can't be constants LogLineErrorsToExclude = [...]string{ @@ -16,10 +17,11 @@ var( } ) -const( - OperatorLabel = "operator" - ArcExtensionLabel = "arc-extension" - WindowsLabel = "windows" - ARM64Label = "arm64" - LinuxDaemonsetCustomConfig = "linux-daemonset-custom-config" -) \ No newline at end of file
+const (
+	OperatorLabel              = "operator"                      // Test label used on specs for the ama-metrics-operator-targets pod.
+	ArcExtensionLabel          = "arc-extension"                 // Test label used on specs for the prometheus-node-exporter pod.
+	WindowsLabel               = "windows"                       // Test label for specs that should only run on clusters that have Windows nodes.
+	ARM64Label                 = "arm64"                         // Test label for specs that should only run on clusters that have ARM64 nodes.
+	FIPSLabel                  = "fips"                          // Test label for specs that should only run on clusters that have FIPS nodes.
+	LinuxDaemonsetCustomConfig = "linux-daemonset-custom-config" // Test label for specs that need the ama-metrics-config-node configmap.
+)
diff --git a/otelcollector/test/utils/kubernetes_api_utils.go b/otelcollector/test/utils/kubernetes_api_utils.go index 4dff0fee7..8d4922dba 100644 --- a/otelcollector/test/utils/kubernetes_api_utils.go +++ b/otelcollector/test/utils/kubernetes_api_utils.go @@ -306,6 +306,90 @@ func CheckIfAllContainersAreRunning(clientset 
*kubernetes.Clientset, namespace, return nil }
+/*
+ * CheckIfAllPodsScheduleOnNodes checks that every node whose OS label equals osLabel has at least one pod in
+ * namespace with the label labelKey=labelValue, and that each of those pods is Running with all containers running.
+ * It returns nil when every matching node passes, otherwise an error naming the first node, pod or container that fails.
+ */
+func CheckIfAllPodsScheduleOnNodes(clientset *kubernetes.Clientset, namespace, labelKey string, labelValue string, osLabel string) error {
+	nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		return fmt.Errorf("listing nodes: %w", err)
+	}
+
+	for _, node := range nodes.Items {
+		// Prefer the stable kubernetes.io/os label; fall back to the deprecated beta label when it is absent.
+		nodeOS, ok := node.Labels["kubernetes.io/os"]
+		if !ok {
+			nodeOS = node.Labels["beta.kubernetes.io/os"]
+		}
+		if nodeOS != osLabel {
+			continue
+		}
+		if err := checkPodsRunningOnNode(clientset, namespace, labelKey, labelValue, node.Name); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+/*
+ * CheckIfAllPodsScheduleOnSpecificNodesLabels checks that every node carrying the label nodeLabelKey=nodeLabelValue
+ * (for example the FIPS or ARM64 node labels) has at least one pod in namespace with the label labelKey=labelValue,
+ * and that each of those pods is Running with all containers running. NOTE(review): nodes are not filtered by OS here.
+ */
+func CheckIfAllPodsScheduleOnSpecificNodesLabels(clientset *kubernetes.Clientset, namespace, labelKey string, labelValue string, nodeLabelKey string, nodeLabelValue string) error {
+	nodes, err := clientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		return fmt.Errorf("listing nodes: %w", err)
+	}
+
+	for _, node := range nodes.Items {
+		// Skip nodes that lack the label or carry a different value.
+		if value, ok := node.Labels[nodeLabelKey]; !ok || value != nodeLabelValue {
+			continue
+		}
+		if err := checkPodsRunningOnNode(clientset, namespace, labelKey, labelValue, node.Name); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+/*
+ * checkPodsRunningOnNode lists the pods in namespace that match labelKey=labelValue and are bound to nodeName.
+ * It returns an error when the list call fails, when no such pod exists on the node, when a pod is not in the
+ * Running phase, or when any reported container status of a pod has no Running state; otherwise it returns nil.
+ */
+func checkPodsRunningOnNode(clientset *kubernetes.Clientset, namespace, labelKey, labelValue, nodeName string) error {
+	pods, err := clientset.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{
+		FieldSelector: "spec.nodeName=" + nodeName,
+		LabelSelector: labelKey + "=" + labelValue,
+	})
+	if err != nil {
+		// Report API failures separately from "nothing scheduled" so the underlying cause is not lost.
+		return fmt.Errorf("listing pods with label %s=%s on node %s: %w", labelKey, labelValue, nodeName, err)
+	}
+	if pods == nil || len(pods.Items) == 0 {
+		return fmt.Errorf("no pod with label %s=%s is scheduled on node %s", labelKey, labelValue, nodeName)
+	}
+
+	for _, pod := range pods.Items {
+		if pod.Status.Phase != corev1.PodRunning {
+			return fmt.Errorf("pod %s on node %s is not running, phase is %v", pod.Name, nodeName, pod.Status.Phase)
+		}
+		for _, containerStatus := range pod.Status.ContainerStatuses {
+			if containerStatus.State.Running == nil {
+				return fmt.Errorf("container %s of pod %s on node %s is not running", containerStatus.Name, pod.Name, nodeName)
+			}
+		}
+	}
+
+	return nil
+}
+
 /* * Update an unused field in configmap with a random value to cause a configmap update event. */