Merge pull request #1 from grdryn/prom-test-things

chore: remove need for extract_alerts.sh
opendatahub-io · Nov 15, 2024 · 5a453e9 · 5a453e9
2 parents 291f0a4 + e8509c9
commit 5a453e9
Show file tree

Hide file tree

Showing 17 changed files with 62 additions and 85 deletions.
diff --git a/.github/workflows/prometheus-unit-tests.yaml b/.github/workflows/prometheus-unit-tests.yaml
@@ -10,6 +10,10 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
+    - name: Setup Go
+      uses: actions/setup-go@v4
+      with:
+        go-version-file: go.mod
     - name: Install Promtool
       run: |
         sudo apt-get update && sudo apt-get install -y prometheus

diff --git a/.gitignore b/.gitignore
@@ -62,6 +62,3 @@ local.mk
 
 # Ignore temporary files created by the Makefile
 *.mktmp.*
-
-#Ignore temporary alert yaml files created by the Makefile
-*_alerts.yaml
diff --git a/Makefile b/Makefile
@@ -97,8 +97,10 @@ IMAGE_BUILD_FLAGS ?= --build-arg USE_LOCAL=false
 
 # Prometheus-Unit Tests Parameters
 PROMETHEUS_CONFIG_YAML = ./config/monitoring/prometheus/apps/prometheus-configs.yaml
-PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps/
-GENERATED_ALERT_DIR = ./tests/prometheus_unit_tests/
+PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps
+PROMETHEUS_TEST_DIR = ./tests/prometheus_unit_tests
+PROMETHEUS_ALERT_TESTS = $(shell find $(PROMETHEUS_TEST_DIR) -name "*.unit-tests.yaml")
+
 ALERT_SEVERITY = critical
 
 # Read any custom variables overrides from a local.mk file.  This will only be read if it exists in the
@@ -256,11 +258,13 @@ controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessar
 $(CONTROLLER_GEN): $(LOCALBIN)
 	test -s $(CONTROLLER_GEN) || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION)
 
-.PHONY: yq
-yq: $(YQ) ## Download yq locally if necessary.
+## Download yq locally if necessary.
 $(YQ): $(LOCALBIN)
 	test -s $(YQ) || GOBIN=$(LOCALBIN) go install github.com/mikefarah/yq/v4@$(YQ_VERSION)
 
+.PHONY: yq
+yq: $(YQ)
+
 OPERATOR_SDK_DL_URL ?= https://github.com/operator-framework/operator-sdk/releases/download/$(OPERATOR_SDK_VERSION)
 .PHONY: operator-sdk
 operator-sdk: $(OPERATOR_SDK) ## Download and install operator-sdk
@@ -380,19 +384,18 @@ unit-test: envtest
 	OPERATOR_NAMESPACE=$(OPERATOR_NAMESPACE) KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(TEST_SRC) -v  -coverprofile cover.out
 CLEANFILES += cover.out
 
-.PHONY: extract-alert-rules
-extract-alert-rules: $(PROMETHEUS_CONFIG_YAML) 
-	./tests/prometheus_unit_tests/scripts/extract_alerts.sh $(PROMETHEUS_CONFIG_YAML) $(GENERATED_ALERT_DIR)
+$(PROMETHEUS_TEST_DIR)/%.rules.yaml: $(PROMETHEUS_TEST_DIR)/%.unit-tests.yaml $(PROMETHEUS_CONFIG_YAML) $(YQ)
+	$(YQ) eval --exit-status ".data.\"$(@F:.rules.yaml=.rules)\"" $(PROMETHEUS_CONFIG_YAML) > $@ || { rm -f $@; exit 1; }
 
 # Run prometheus-alert-unit-tests
 .PHONY: test-alerts
-test-alerts: extract-alert-rules
-	promtool test rules $(GENERATED_ALERT_DIR)/*_unit_tests.yaml 
+test-alerts: $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml)
+	promtool test rules $(PROMETHEUS_ALERT_TESTS)
 
 #Check for alerts without unit-tests
 .PHONY: check-prometheus-alert-unit-tests
-check-prometheus-alert-unit-tests: extract-alert-rules
-	./tests/prometheus_unit_tests/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(GENERATED_ALERT_DIR) $(ALERT_SEVERITY)
+check-prometheus-alert-unit-tests: $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml)
+	./tests/prometheus_unit_tests/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(PROMETHEUS_TEST_DIR) $(ALERT_SEVERITY)
 
 .PHONY: e2e-test
 e2e-test: ## Run e2e tests for the controller
@@ -403,3 +406,4 @@ clean: $(GOLANGCI_LINT)
 	$(GOLANGCI_LINT) cache clean
 	chmod u+w -R $(LOCALBIN) # envtest makes its dir RO
 	rm -rf $(CLEANFILES)
+	rm -f $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml)
diff --git a/README.md b/README.md
@@ -1,4 +1,3 @@
-
 This operator is the primary operator for Open Data Hub. It is responsible for enabling Data science applications like
 Jupyter Notebooks, Modelmesh serving, Datascience pipelines etc. The operator makes use of `DataScienceCluster` CRD to deploy
 and configure these applications.
@@ -185,7 +184,7 @@ e.g `make image-build -e IMAGE_BUILD_FLAGS="--build-arg USE_LOCAL=true"`
 **Deploying operator using OLM**
 
 - To create a new bundle in defined operator namespace, run following command:
-  
+
   ```commandline
   export OPERATOR_NAMESPACE=<namespace-to-install-operator>
   make bundle
@@ -216,7 +215,7 @@ There are 2 ways to test your changes with modification:
 
 Whenever a new api is added or a new field is added to the CRD, please make sure to run the command:
   ```commandline
-  make api-docs 
+  make api-docs
   ```
 This will ensure that the doc for the apis are updated accordingly.
 
@@ -404,7 +403,7 @@ for DataScienceCluster deletion.
 make e2e-test -e OPERATOR_NAMESPACE=<namespace> -e E2E_TEST_FLAGS="--skip-deletion=true"
 ```
 
-## Run Prometheus Unit Tests for Alerts 
+## Run Prometheus Unit Tests for Alerts
 
 Unit tests for Prometheus alerts are included in the repository. You can run them using the following command:
 
@@ -418,7 +417,7 @@ To check for alerts that don't have unit tests, run the below command:
 make check-prometheus-alert-unit-tests
 ```
 
-To add a new unit test, add the alert YAML file and the corresponding rule files to the [extract_alerts](tests/prometheus_unit_tests/scripts/extract_alerts.sh) script.
+To add a new unit test file, name it after the corresponding rules key in the [prometheus ConfigMap](./config/monitoring/prometheus/apps/prometheus-configs.yaml), replacing the `.rules` suffix with `.unit-tests.yaml`.
 
 ### API Overview
 

diff --git a/tests/prometheus_unit_tests/.gitignore b/tests/prometheus_unit_tests/.gitignore
@@ -0,0 +1,2 @@
+# Ignore temporary alert yaml files created by the Makefile
+*.rules.yaml
diff --git a/...it_tests/codeflare_alerts_unit_tests.yaml → ..._tests/codeflare-alerting.unit-tests.yaml b/...it_tests/codeflare_alerts_unit_tests.yaml → ..._tests/codeflare-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "codeflare_alerts.yaml"
+  - codeflare-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -93,7 +93,7 @@ tests:
       - eval_time: 1m
         alertname: CodeFlare Operator is not running
         exp_alerts: []
-  
+
   - interval: 1m
     input_series:
       - series: up{job="CodeFlare Operator"}
@@ -111,7 +111,7 @@ tests:
               description: This alert fires when the CodeFlare Operator is not running.
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md'
               summary: Alerting for CodeFlare Operator
-  
+
   - interval: 1m
     input_series:
     alert_rule_test:

diff --git a/...pipelines_operator_alerts_unit_tests.yaml → ...pelines-operator-alerting.unit-tests.yaml b/...pipelines_operator_alerts_unit_tests.yaml → ...pelines-operator-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - data_science_pipelines_operator_alerts.yaml
+  - data-science-pipelines-operator-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -43,7 +43,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate30m{component="dsp"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"}
@@ -62,7 +62,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate2h{component="dsp"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate1d{component="dsp"}
@@ -81,7 +81,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"}
         values: "1+1x200"
       - series: haproxy_backend_http_responses_total:burnrate3d{component="dsp"}
@@ -141,7 +141,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate30m{instance="data-science-pipelines-operator"}
         values: "1+1x60"
       - series: probe_success:burnrate6h{instance="data-science-pipelines-operator"}
@@ -161,7 +161,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate2h{instance="data-science-pipelines-operator"}
         values: "1+1x60"
       - series: probe_success:burnrate1d{instance="data-science-pipelines-operator"}
@@ -182,7 +182,7 @@ tests:
 
   # application unavailable
   - interval: 1m
-    input_series: 
+    input_series:
       - series: data_science_pipelines_application_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"}
         values: "0x200"
       - series: data_science_pipelines_application_apiserver_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"}

diff --git a/..._unit_tests/kserve_alerts_unit_tests.yaml → ...nit_tests/kserve-alerting.unit-tests.yaml b/..._unit_tests/kserve_alerts_unit_tests.yaml → ...nit_tests/kserve-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "kserve_alerts.yaml"
+  - kserve-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -80,4 +80,3 @@ tests:
               message: "High error budget burn for kserve-controller-manager (current value: 61)."
               summary: Kserve Controller Probe Success Burn Rate
               triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md"
-
diff --git a/...s_unit_tests/kueue_alerts_unit_tests.yaml → ...unit_tests/kueue-alerting.unit-tests.yaml b/...s_unit_tests/kueue_alerts_unit_tests.yaml → ...unit_tests/kueue-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "kueue_alerts.yaml"
+  - kueue-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -27,7 +27,7 @@ tests:
               description: This alert fires when the Kueue Operator is not running.
               summary: Alerting for Kueue Operator
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kueue-operator-availability.md'
-  
+
   - interval: 1m
     input_series:
       - series: up{job="Kueue Operator"}

diff --git a/...t_tests/model_mesh_alerts_unit_tests.yaml → ...tests/model-mesh-alerting.unit-tests.yaml b/...t_tests/model_mesh_alerts_unit_tests.yaml → ...tests/model-mesh-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - model_mesh_alerts.yaml
+  - model-mesh-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -62,7 +62,7 @@ tests:
               summary: "Modelmesh Controller Probe Success Burn Rate"
               message: "High error budget burn for modelmesh-controller (current value: 16)."
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md'
- 
+
   - interval: 1m
     input_series:
       - series: probe_success:burnrate2h{instance="modelmesh-controller"}

diff --git a/...s/model_controller_alerts_unit_tests.yaml → ...model-controller-alerting.unit-tests.yaml b/...s/model_controller_alerts_unit_tests.yaml → ...model-controller-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - model_controller_alerts.yaml
+  - odh-model-controller-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -62,7 +62,7 @@ tests:
               summary: "ODH Model Controller Probe Success Burn Rate"
               message: "High error budget burn for odh-model-controller (current value: 16)."
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md'
- 
+
   - interval: 1m
     input_series:
       - series: probe_success:burnrate2h{instance="odh-model-controller"}

diff --git a/...unit_tests/kuberay_alerts_unit_tests.yaml → ...s_unit_tests/ray-alerting.unit-tests.yaml b/...unit_tests/kuberay_alerts_unit_tests.yaml → ...s_unit_tests/ray-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "kuberay_alerts.yaml"
+  - "ray-alerting.rules.yaml"
 
 evaluation_interval: 1m
 
@@ -27,7 +27,7 @@ tests:
               description: This alert fires when the KubeRay Operator is not running.
               summary: Alerting for KubeRay Operator
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md'
-  
+
   - interval: 1m
     input_series:
       - series: up{job="KubeRay Operator"}
@@ -44,4 +44,3 @@ tests:
               description: This alert fires when the KubeRay Operator is not running.
               summary: Alerting for KubeRay Operator
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md'
-
diff --git a/...it_tests/dashboard_alerts_unit_tests.yaml → .../rhods-dashboard-alerting.unit-tests.yaml b/...it_tests/dashboard_alerts_unit_tests.yaml → .../rhods-dashboard-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - dashboard_alerts.yaml
+  - rhods-dashboard-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -44,7 +44,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate30m{route="rhods-dashboard"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate6h{route="rhods-dashboard"}
@@ -64,7 +64,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate2h{route="rhods-dashboard"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate1d{route="rhods-dashboard"}
@@ -124,7 +124,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate30m{name="rhods-dashboard"}
         values: "1+1x60"
       - series: probe_success:burnrate6h{name="rhods-dashboard"}
@@ -144,7 +144,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate2h{name="rhods-dashboard"}
         values: "1+1x60"
       - series: probe_success:burnrate1d{name="rhods-dashboard"}
@@ -164,7 +164,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate6h{name="rhods-dashboard"}
         values: "1+1x200"
       - series: probe_success:burnrate3d{name="rhods-dashboard"}

diff --git a/tests/prometheus_unit_tests/scripts/extract_alerts.sh b/tests/prometheus_unit_tests/scripts/extract_alerts.sh
diff --git a/...t_tests/training_operator_unit_tests.yaml → ...trainingoperator-alerting.unit-tests.yaml b/...t_tests/training_operator_unit_tests.yaml → ...trainingoperator-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "training_operator_alerts.yaml"
+  - trainingoperator-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -27,11 +27,11 @@ tests:
               description: "This alert fires when the KubeFlow Training Operator is not running."
               summary: Alerting for KubeFlow Training Operator
               triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/training-operator-availability.md"
-  
+
   - interval: 1m
     input_series:
       - series: up{job="KubeFlow Training Operator"}
-        values: "0" 
+        values: "0"
     alert_rule_test:
       - eval_time: 2m
         alertname: KubeFlow Training Operator is not running
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Ignore temporary alert yaml files created by the Makefile
		*.rules.yaml