opendatahub-io · zdtsw · Nov 22, 2024 · Nov 7, 2024 · Nov 8, 2024 · Nov 8, 2024
diff --git a/.github/workflows/prometheus-unit-tests.yaml b/.github/workflows/prometheus-unit-tests.yaml
@@ -0,0 +1,21 @@
+name: Run prometheus unit tests
+on:
+  pull_request:
+    paths:
+      - 'config/monitoring/prometheus/**'
+      - 'tests/prometheus_unit_tests/**'
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Setup Go
+      uses: actions/setup-go@v4
+      with:
+        go-version-file: go.mod
+    - name: Install Promtool
+      run: |
+        sudo apt-get update && sudo apt-get install -y prometheus
+    - name: Run prometheus-unit-tests
+      run : make test-alerts
diff --git a/.gitignore b/.gitignore
@@ -62,4 +62,3 @@ local.mk
 
 # Ignore temporary files created by the Makefile
 *.mktmp.*
-
diff --git a/Makefile b/Makefile
@@ -95,6 +95,14 @@ E2E_TEST_FLAGS = "--skip-deletion=false" -timeout 25m # See README.md, default g
 # see target "image-build"
 IMAGE_BUILD_FLAGS ?= --build-arg USE_LOCAL=false
 
+# Prometheus-Unit Tests Parameters
+PROMETHEUS_CONFIG_YAML = ./config/monitoring/prometheus/apps/prometheus-configs.yaml
+PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps
+PROMETHEUS_TEST_DIR = ./tests/prometheus_unit_tests
+PROMETHEUS_ALERT_TESTS = $(wildcard $(PROMETHEUS_TEST_DIR)/*.unit-tests.yaml)
+
+ALERT_SEVERITY = critical
+
 # Read any custom variables overrides from a local.mk file.  This will only be read if it exists in the
 # same directory as this Makefile.  Variables can be specified in the standard format supported by
 # GNU Make since `include` processes any valid Makefile
@@ -374,6 +382,22 @@ unit-test: envtest
 	OPERATOR_NAMESPACE=$(OPERATOR_NAMESPACE) KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(TEST_SRC) -v  -coverprofile cover.out
 CLEANFILES += cover.out
 
+$(PROMETHEUS_TEST_DIR)/%.rules.yaml: $(PROMETHEUS_TEST_DIR)/%.unit-tests.yaml $(PROMETHEUS_CONFIG_YAML) $(YQ)
+	$(YQ) eval ".data.\"$(@F:.rules.yaml=.rules)\"" $(PROMETHEUS_CONFIG_YAML) > $@
+
+PROMETHEUS_ALERT_RULES := $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml)
+
+# Run prometheus-alert-unit-tests
+.PHONY: test-alerts
+test-alerts: $(PROMETHEUS_ALERT_RULES)
+	promtool test rules $(PROMETHEUS_ALERT_TESTS)
+
+#Check for alerts without unit-tests
+.PHONY: check-prometheus-alert-unit-tests
+check-prometheus-alert-unit-tests: $(PROMETHEUS_ALERT_RULES)
+	./tests/prometheus_unit_tests/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(PROMETHEUS_TEST_DIR) $(ALERT_SEVERITY)
+CLEANFILES += $(PROMETHEUS_ALERT_RULES)
+
 .PHONY: e2e-test
 e2e-test: ## Run e2e tests for the controller
 	go test ./tests/e2e/ -run ^TestOdhOperator -v --operator-namespace=${OPERATOR_NAMESPACE} ${E2E_TEST_FLAGS}

diff --git a/README.md b/README.md
@@ -1,4 +1,3 @@
-
 This operator is the primary operator for Open Data Hub. It is responsible for enabling Data science applications like
 Jupyter Notebooks, Modelmesh serving, Datascience pipelines etc. The operator makes use of `DataScienceCluster` CRD to deploy
 and configure these applications.
@@ -185,7 +184,7 @@ e.g `make image-build -e IMAGE_BUILD_FLAGS="--build-arg USE_LOCAL=true"`
 **Deploying operator using OLM**
 
 - To create a new bundle in defined operator namespace, run following command:
-  
+
   ```commandline
   export OPERATOR_NAMESPACE=<namespace-to-install-operator>
   make bundle
@@ -216,7 +215,7 @@ There are 2 ways to test your changes with modification:
 
 Whenever a new api is added or a new field is added to the CRD, please make sure to run the command:
   ```commandline
-  make api-docs 
+  make api-docs
   ```
 This will ensure that the doc for the apis are updated accordingly.
 
@@ -403,6 +402,23 @@ for DataScienceCluster deletion.
 ```shell
 make e2e-test -e OPERATOR_NAMESPACE=<namespace> -e E2E_TEST_FLAGS="--skip-deletion=true"
 ```
+
+## Run Prometheus Unit Tests for Alerts
+
+Unit tests for Prometheus alerts are included in the repository. You can run them using the following command:
+
+```shell
+make test-alerts
+```
+
+To check for alerts that don't have unit tests, run the below command:
+
+```shell
+make check-prometheus-alert-unit-tests
+```
+
+To add a new unit test file, name it the same as the rules file in the [prometheus ConfigMap](./config/monitoring/prometheus/apps/prometheus-configs.yaml), just with the `.rules` suffix replaced with `.unit-tests.yaml`
+
 ### API Overview
 
 Please refer to [api documentation](docs/api-overview.md)

diff --git a/tests/prometheus_unit_tests/.gitignore b/tests/prometheus_unit_tests/.gitignore
@@ -0,0 +1,2 @@
+# Ignore temporary alert yaml files created by the Makefile
+*.rules.yaml
diff --git a/...it_tests/codeflare_alerts_unit_tests.yaml → ..._tests/codeflare-alerting.unit-tests.yaml b/...it_tests/codeflare_alerts_unit_tests.yaml → ..._tests/codeflare-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "codeflare_alerts.yaml"
+  - codeflare-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -93,7 +93,7 @@ tests:
       - eval_time: 1m
         alertname: CodeFlare Operator is not running
         exp_alerts: []
-  
+
   - interval: 1m
     input_series:
       - series: up{job="CodeFlare Operator"}
@@ -111,7 +111,7 @@ tests:
               description: This alert fires when the CodeFlare Operator is not running.
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md'
               summary: Alerting for CodeFlare Operator
-  
+
   - interval: 1m
     input_series:
     alert_rule_test:

diff --git a/...pipelines_operator_alerts_unit_tests.yaml → ...pelines-operator-alerting.unit-tests.yaml b/...pipelines_operator_alerts_unit_tests.yaml → ...pelines-operator-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - data_science_pipelines_operator_alerts.yaml
+  - data-science-pipelines-operator-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -43,7 +43,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate30m{component="dsp"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"}
@@ -62,7 +62,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate2h{component="dsp"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate1d{component="dsp"}
@@ -81,7 +81,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"}
         values: "1+1x200"
       - series: haproxy_backend_http_responses_total:burnrate3d{component="dsp"}
@@ -141,7 +141,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate30m{instance="data-science-pipelines-operator"}
         values: "1+1x60"
       - series: probe_success:burnrate6h{instance="data-science-pipelines-operator"}
@@ -161,7 +161,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate2h{instance="data-science-pipelines-operator"}
         values: "1+1x60"
       - series: probe_success:burnrate1d{instance="data-science-pipelines-operator"}
@@ -182,7 +182,7 @@ tests:
 
   # application unavailable
   - interval: 1m
-    input_series: 
+    input_series:
       - series: data_science_pipelines_application_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"}
         values: "0x200"
       - series: data_science_pipelines_application_apiserver_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"}

diff --git a/..._unit_tests/kserve_alerts_unit_tests.yaml → ...nit_tests/kserve-alerting.unit-tests.yaml b/..._unit_tests/kserve_alerts_unit_tests.yaml → ...nit_tests/kserve-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "kserve_alerts.yaml"
+  - kserve-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -80,4 +80,3 @@ tests:
               message: "High error budget burn for kserve-controller-manager (current value: 61)."
               summary: Kserve Controller Probe Success Burn Rate
               triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md"
-
diff --git a/...s_unit_tests/kueue_alerts_unit_tests.yaml → ...unit_tests/kueue-alerting.unit-tests.yaml b/...s_unit_tests/kueue_alerts_unit_tests.yaml → ...unit_tests/kueue-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "kueue_alerts.yaml"
+  - kueue-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -27,7 +27,7 @@ tests:
               description: This alert fires when the Kueue Operator is not running.
               summary: Alerting for Kueue Operator
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kueue-operator-availability.md'
-  
+
   - interval: 1m
     input_series:
       - series: up{job="Kueue Operator"}

diff --git a/...t_tests/model_mesh_alerts_unit_tests.yaml → ...tests/model-mesh-alerting.unit-tests.yaml b/...t_tests/model_mesh_alerts_unit_tests.yaml → ...tests/model-mesh-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - model_mesh_alerts.yaml
+  - model-mesh-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -62,7 +62,7 @@ tests:
               summary: "Modelmesh Controller Probe Success Burn Rate"
               message: "High error budget burn for modelmesh-controller (current value: 16)."
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md'
- 
+
   - interval: 1m
     input_series:
       - series: probe_success:burnrate2h{instance="modelmesh-controller"}

diff --git a/...s/model_controller_alerts_unit_tests.yaml → ...model-controller-alerting.unit-tests.yaml b/...s/model_controller_alerts_unit_tests.yaml → ...model-controller-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - model_controller_alerts.yaml
+  - odh-model-controller-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -62,7 +62,7 @@ tests:
               summary: "ODH Model Controller Probe Success Burn Rate"
               message: "High error budget burn for odh-model-controller (current value: 16)."
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md'
- 
+
   - interval: 1m
     input_series:
       - series: probe_success:burnrate2h{instance="odh-model-controller"}

diff --git a/...unit_tests/kuberay_alerts_unit_tests.yaml → ...s_unit_tests/ray-alerting.unit-tests.yaml b/...unit_tests/kuberay_alerts_unit_tests.yaml → ...s_unit_tests/ray-alerting.unit-tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - "kuberay_alerts.yaml"
+  - "ray-alerting.rules.yaml"
 
 evaluation_interval: 1m
 
@@ -27,7 +27,7 @@ tests:
               description: This alert fires when the KubeRay Operator is not running.
               summary: Alerting for KubeRay Operator
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md'
-  
+
   - interval: 1m
     input_series:
       - series: up{job="KubeRay Operator"}
@@ -44,4 +44,3 @@ tests:
               description: This alert fires when the KubeRay Operator is not running.
               summary: Alerting for KubeRay Operator
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md'
-
diff --git a/...it_tests/dashboard_alerts_unit_tests.yaml → .../rhods-dashboard-alerting.unit_tests.yaml b/...it_tests/dashboard_alerts_unit_tests.yaml → .../rhods-dashboard-alerting.unit_tests.yaml
@@ -1,5 +1,5 @@
 rule_files:
-  - dashboard_alerts.yaml
+  - rhods-dashboard-alerting.rules.yaml
 
 evaluation_interval: 1m
 
@@ -44,7 +44,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate30m{route="rhods-dashboard"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate6h{route="rhods-dashboard"}
@@ -64,7 +64,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: haproxy_backend_http_responses_total:burnrate2h{route="rhods-dashboard"}
         values: "1+1x60"
       - series: haproxy_backend_http_responses_total:burnrate1d{route="rhods-dashboard"}
@@ -124,7 +124,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate30m{name="rhods-dashboard"}
         values: "1+1x60"
       - series: probe_success:burnrate6h{name="rhods-dashboard"}
@@ -144,7 +144,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate2h{name="rhods-dashboard"}
         values: "1+1x60"
       - series: probe_success:burnrate1d{name="rhods-dashboard"}
@@ -164,7 +164,7 @@ tests:
               triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'
 
   - interval: 1m
-    input_series: 
+    input_series:
       - series: probe_success:burnrate6h{name="rhods-dashboard"}
         values: "1+1x200"
       - series: probe_success:burnrate3d{name="rhods-dashboard"}

diff --git a/tests/prometheus_unit_tests/scripts/check_alert_tests.sh b/tests/prometheus_unit_tests/scripts/check_alert_tests.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+PROMETHEUS_CONFIG_YAML=$1
+UNIT_TEST_DIR=$2
+ALERT_SEVERITY=$3
+
+# Collect all alerts from the configuration file
+while IFS= read -r ALERT; do
+  ALL_ALERTS+=("$ALERT")
+done < <(yq -N e '.data[]
+  | from_yaml
+  | .groups[].rules[]
+  | select(.alert != "DeadManSnitch" and .labels.severity == "'${ALERT_SEVERITY}'")
+  | .alert' "${PROMETHEUS_CONFIG_YAML}")
+
+# Collect all alerts from the unit test files
+while IFS= read -r ALERT; do
+  PROMETHEUS_UNIT_TEST_CHECK+=("$ALERT")
+done < <(
+  for alert in "$UNIT_TEST_DIR"/*.yaml; do
+    yq -N eval-all '.tests[]
+    | .alert_rule_test[]
+    | .exp_alerts[]
+    | .exp_labels
+    | select(.severity == "'${ALERT_SEVERITY}'")
+    | .alertname' "$alert"
+  done
+)
+
+# Sorting the PROMETHEUS_UNIT_TEST_CHECK array for comparison
+PROMETHEUS_UNIT_TEST_CHECK_SORTED=($(echo "${PROMETHEUS_UNIT_TEST_CHECK[@]}" | sort | uniq))
+
+# Finding items in ALL_ALERTS not in PROMETHEUS_UNIT_TEST_CHECK_SORTED
+ALERTS_WITHOUT_UNIT_TESTS=()
+for ALERT in "${ALL_ALERTS[@]}"; do
+  if [[ ! " ${PROMETHEUS_UNIT_TEST_CHECK_SORTED[@]} " =~ " ${ALERT} " ]]; then
+    ALERTS_WITHOUT_UNIT_TESTS+=("$ALERT")
+  fi
+done
+
+# Printing the alerts without unit tests
+echo "Alerts without unit tests:"
+for ALERT in "${ALERTS_WITHOUT_UNIT_TESTS[@]}"; do
+  echo "$ALERT"
+done
Original file line number	Diff line number	Diff line change
Expand Up		@@ -62,4 +62,3 @@ local.mk

		# Ignore temporary files created by the Makefile
		.mktmp.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# Ignore temporary alert yaml files created by the Makefile
		*.rules.yaml