diff --git a/.github/workflows/prometheus-unit-tests.yaml b/.github/workflows/prometheus-unit-tests.yaml index b95eaaf83c7..9ffbc5bc360 100644 --- a/.github/workflows/prometheus-unit-tests.yaml +++ b/.github/workflows/prometheus-unit-tests.yaml @@ -10,6 +10,10 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 + - name: Setup Go + uses: actions/setup-go@v4 + with: + go-version-file: go.mod - name: Install Promtool run: | sudo apt-get update && sudo apt-get install -y prometheus diff --git a/.gitignore b/.gitignore index 4350bd1711e..40afbde7fa3 100644 --- a/.gitignore +++ b/.gitignore @@ -62,6 +62,3 @@ local.mk # Ignore temporary files created by the Makefile *.mktmp.* - -#Ignore temporary alert yaml files created by the Makefile -*_alerts.yaml diff --git a/Makefile b/Makefile index f69ee467aee..c11380c509d 100644 --- a/Makefile +++ b/Makefile @@ -97,8 +97,10 @@ IMAGE_BUILD_FLAGS ?= --build-arg USE_LOCAL=false # Prometheus-Unit Tests Parameters PROMETHEUS_CONFIG_YAML = ./config/monitoring/prometheus/apps/prometheus-configs.yaml -PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps/ -GENERATED_ALERT_DIR = ./tests/prometheus_unit_tests/ +PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps +PROMETHEUS_TEST_DIR = ./tests/prometheus_unit_tests +PROMETHEUS_ALERT_TESTS = $(shell find $(PROMETHEUS_TEST_DIR) -name "*.unit-tests.yaml") + ALERT_SEVERITY = critical # Read any custom variables overrides from a local.mk file. This will only be read if it exists in the @@ -256,11 +258,13 @@ controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessar $(CONTROLLER_GEN): $(LOCALBIN) test -s $(CONTROLLER_GEN) || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION) -.PHONY: yq -yq: $(YQ) ## Download yq locally if necessary. +## Download yq locally if necessary. 
$(YQ): $(LOCALBIN) test -s $(YQ) || GOBIN=$(LOCALBIN) go install github.com/mikefarah/yq/v4@$(YQ_VERSION) +.PHONY: yq +yq: $(YQ) + OPERATOR_SDK_DL_URL ?= https://github.com/operator-framework/operator-sdk/releases/download/$(OPERATOR_SDK_VERSION) .PHONY: operator-sdk operator-sdk: $(OPERATOR_SDK) ## Download and install operator-sdk @@ -380,19 +384,18 @@ unit-test: envtest OPERATOR_NAMESPACE=$(OPERATOR_NAMESPACE) KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(TEST_SRC) -v -coverprofile cover.out CLEANFILES += cover.out -.PHONY: extract-alert-rules -extract-alert-rules: $(PROMETHEUS_CONFIG_YAML) - ./tests/prometheus_unit_tests/scripts/extract_alerts.sh $(PROMETHEUS_CONFIG_YAML) $(GENERATED_ALERT_DIR) +$(PROMETHEUS_TEST_DIR)/%.rules.yaml: $(PROMETHEUS_TEST_DIR)/%.unit-tests.yaml $(PROMETHEUS_CONFIG_YAML) $(YQ) + $(YQ) eval ".data.\"$(@F:.rules.yaml=.rules)\"" $(PROMETHEUS_CONFIG_YAML) > $@ # Run prometheus-alert-unit-tests .PHONY: test-alerts -test-alerts: extract-alert-rules - promtool test rules $(GENERATED_ALERT_DIR)/*_unit_tests.yaml +test-alerts: $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml) + promtool test rules $(PROMETHEUS_ALERT_TESTS) #Check for alerts without unit-tests .PHONY: check-prometheus-alert-unit-tests -check-prometheus-alert-unit-tests: extract-alert-rules - ./tests/prometheus_unit_tests/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(GENERATED_ALERT_DIR) $(ALERT_SEVERITY) +check-prometheus-alert-unit-tests: $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml) + ./tests/prometheus_unit_tests/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(PROMETHEUS_TEST_DIR) $(ALERT_SEVERITY) .PHONY: e2e-test e2e-test: ## Run e2e tests for the controller @@ -403,3 +406,4 @@ clean: $(GOLANGCI_LINT) $(GOLANGCI_LINT) cache clean chmod u+w -R $(LOCALBIN) # envtest makes its dir RO rm -rf $(CLEANFILES) + rm -f $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml) diff --git 
a/README.md b/README.md index 8743cd80781..1de5dba1e90 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ - This operator is the primary operator for Open Data Hub. It is responsible for enabling Data science applications like Jupyter Notebooks, Modelmesh serving, Datascience pipelines etc. The operator makes use of `DataScienceCluster` CRD to deploy and configure these applications. @@ -185,7 +184,7 @@ e.g `make image-build -e IMAGE_BUILD_FLAGS="--build-arg USE_LOCAL=true"` **Deploying operator using OLM** - To create a new bundle in defined operator namespace, run following command: - + ```commandline export OPERATOR_NAMESPACE= make bundle @@ -216,7 +215,7 @@ There are 2 ways to test your changes with modification: Whenever a new api is added or a new field is added to the CRD, please make sure to run the command: ```commandline - make api-docs + make api-docs ``` This will ensure that the doc for the apis are updated accordingly. @@ -404,7 +403,7 @@ for DataScienceCluster deletion. make e2e-test -e OPERATOR_NAMESPACE= -e E2E_TEST_FLAGS="--skip-deletion=true" ``` -## Run Prometheus Unit Tests for Alerts +## Run Prometheus Unit Tests for Alerts Unit tests for Prometheus alerts are included in the repository. You can run them using the following command: @@ -418,7 +417,7 @@ To check for alerts that don't have unit tests, run the below command: make check-prometheus-alert-unit-tests ``` -To add a new unit test, add the alert YAML file and the corresponding rule files to the [extract_alerts](tests/prometheus_unit_tests/scripts/extract_alerts.sh) script. 
+To add a new unit test file, give it the same name as the corresponding rules key in the [Prometheus ConfigMap](./config/monitoring/prometheus/apps/prometheus-configs.yaml), with the `.rules` suffix replaced by `.unit-tests.yaml` (for example, `kserve-alerting.rules` is tested by `tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml`). ### API Overview diff --git a/tests/prometheus_unit_tests/.gitignore b/tests/prometheus_unit_tests/.gitignore new file mode 100644 index 00000000000..e54df85e5cd --- /dev/null +++ b/tests/prometheus_unit_tests/.gitignore @@ -0,0 +1,2 @@ +# Ignore temporary alert yaml files created by the Makefile +*.rules.yaml diff --git a/tests/prometheus_unit_tests/codeflare_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml similarity index 99% rename from tests/prometheus_unit_tests/codeflare_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml index a4bd43c7fa5..5b4571c8da6 100644 --- a/tests/prometheus_unit_tests/codeflare_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - "codeflare_alerts.yaml" + - codeflare-alerting.rules.yaml evaluation_interval: 1m @@ -93,7 +93,7 @@ tests: - eval_time: 1m alertname: CodeFlare Operator is not running exp_alerts: [] - + - interval: 1m input_series: - series: up{job="CodeFlare Operator"} @@ -111,7 +111,7 @@ tests: description: This alert fires when the CodeFlare Operator is not running. 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md' summary: Alerting for CodeFlare Operator - + - interval: 1m input_series: alert_rule_test: diff --git a/tests/prometheus_unit_tests/data_science_pipelines_operator_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.unit-tests.yaml similarity index 98% rename from tests/prometheus_unit_tests/data_science_pipelines_operator_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.unit-tests.yaml index ea59788529f..41218063851 100644 --- a/tests/prometheus_unit_tests/data_science_pipelines_operator_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/data-science-pipelines-operator-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - data_science_pipelines_operator_alerts.yaml + - data-science-pipelines-operator-alerting.rules.yaml evaluation_interval: 1m @@ -43,7 +43,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' - interval: 1m - input_series: + input_series: - series: haproxy_backend_http_responses_total:burnrate30m{component="dsp"} values: "1+1x60" - series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"} @@ -62,7 +62,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' - interval: 1m - input_series: + input_series: - series: haproxy_backend_http_responses_total:burnrate2h{component="dsp"} values: "1+1x60" - series: haproxy_backend_http_responses_total:burnrate1d{component="dsp"} @@ -81,7 +81,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md' 
- interval: 1m - input_series: + input_series: - series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"} values: "1+1x200" - series: haproxy_backend_http_responses_total:burnrate3d{component="dsp"} @@ -141,7 +141,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md' - interval: 1m - input_series: + input_series: - series: probe_success:burnrate30m{instance="data-science-pipelines-operator"} values: "1+1x60" - series: probe_success:burnrate6h{instance="data-science-pipelines-operator"} @@ -161,7 +161,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md' - interval: 1m - input_series: + input_series: - series: probe_success:burnrate2h{instance="data-science-pipelines-operator"} values: "1+1x60" - series: probe_success:burnrate1d{instance="data-science-pipelines-operator"} @@ -182,7 +182,7 @@ tests: # application unavailable - interval: 1m - input_series: + input_series: - series: data_science_pipelines_application_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"} values: "0x200" - series: data_science_pipelines_application_apiserver_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"} diff --git a/tests/prometheus_unit_tests/kserve_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml similarity index 99% rename from tests/prometheus_unit_tests/kserve_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml index 8284b88de3d..1d409c82831 100644 --- a/tests/prometheus_unit_tests/kserve_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - "kserve_alerts.yaml" + - kserve-alerting.rules.yaml evaluation_interval: 1m @@ -80,4 +80,3 @@ 
tests: message: "High error budget burn for kserve-controller-manager (current value: 61)." summary: Kserve Controller Probe Success Burn Rate triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md" - diff --git a/tests/prometheus_unit_tests/kueue_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/kueue-alerting.unit-tests.yaml similarity index 98% rename from tests/prometheus_unit_tests/kueue_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/kueue-alerting.unit-tests.yaml index 86e6d21b00f..a1bf7445247 100644 --- a/tests/prometheus_unit_tests/kueue_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/kueue-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - "kueue_alerts.yaml" + - kueue-alerting.rules.yaml evaluation_interval: 1m @@ -27,7 +27,7 @@ tests: description: This alert fires when the Kueue Operator is not running. summary: Alerting for Kueue Operator triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kueue-operator-availability.md' - + - interval: 1m input_series: - series: up{job="Kueue Operator"} diff --git a/tests/prometheus_unit_tests/model_mesh_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml similarity index 99% rename from tests/prometheus_unit_tests/model_mesh_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml index 8defbbadaf2..681a102afec 100644 --- a/tests/prometheus_unit_tests/model_mesh_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - model_mesh_alerts.yaml + - model-mesh-alerting.rules.yaml evaluation_interval: 1m @@ -62,7 +62,7 @@ tests: summary: "Modelmesh Controller Probe Success Burn Rate" message: "High error budget burn for modelmesh-controller (current value: 16)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md' - + - interval: 1m input_series: - series: probe_success:burnrate2h{instance="modelmesh-controller"} diff --git a/tests/prometheus_unit_tests/model_controller_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml similarity index 98% rename from tests/prometheus_unit_tests/model_controller_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml index ba150c645cb..0b37ebac9f8 100644 --- a/tests/prometheus_unit_tests/model_controller_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/odh-model-controller-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - model_controller_alerts.yaml + - odh-model-controller-alerting.rules.yaml evaluation_interval: 1m @@ -62,7 +62,7 @@ tests: summary: "ODH Model Controller Probe Success Burn Rate" message: "High error budget burn for odh-model-controller (current value: 16)." triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md' - + - interval: 1m input_series: - series: probe_success:burnrate2h{instance="odh-model-controller"} diff --git a/tests/prometheus_unit_tests/kuberay_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/ray-alerting.unit-tests.yaml similarity index 98% rename from tests/prometheus_unit_tests/kuberay_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/ray-alerting.unit-tests.yaml index ded41b3ac1a..9f2f96e593b 100644 --- a/tests/prometheus_unit_tests/kuberay_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/ray-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - "kuberay_alerts.yaml" + - "ray-alerting.rules.yaml" evaluation_interval: 1m @@ -27,7 +27,7 @@ tests: description: This alert fires when the KubeRay Operator is not running. 
summary: Alerting for KubeRay Operator triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md' - + - interval: 1m input_series: - series: up{job="KubeRay Operator"} @@ -44,4 +44,3 @@ tests: description: This alert fires when the KubeRay Operator is not running. summary: Alerting for KubeRay Operator triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md' - diff --git a/tests/prometheus_unit_tests/dashboard_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/rhods-dashboard-alerting.unit-tests.yaml similarity index 98% rename from tests/prometheus_unit_tests/dashboard_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/rhods-dashboard-alerting.unit-tests.yaml index 2efb6cf7a6e..18f4500d4a3 100644 --- a/tests/prometheus_unit_tests/dashboard_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/rhods-dashboard-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - dashboard_alerts.yaml + - rhods-dashboard-alerting.rules.yaml evaluation_interval: 1m @@ -44,7 +44,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' - interval: 1m - input_series: + input_series: - series: haproxy_backend_http_responses_total:burnrate30m{route="rhods-dashboard"} values: "1+1x60" - series: haproxy_backend_http_responses_total:burnrate6h{route="rhods-dashboard"} @@ -64,7 +64,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md' - interval: 1m - input_series: + input_series: - series: haproxy_backend_http_responses_total:burnrate2h{route="rhods-dashboard"} values: "1+1x60" - series: haproxy_backend_http_responses_total:burnrate1d{route="rhods-dashboard"} @@ -124,7 +124,7 @@ tests: triage: 
'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md' - interval: 1m - input_series: + input_series: - series: probe_success:burnrate30m{name="rhods-dashboard"} values: "1+1x60" - series: probe_success:burnrate6h{name="rhods-dashboard"} @@ -144,7 +144,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md' - interval: 1m - input_series: + input_series: - series: probe_success:burnrate2h{name="rhods-dashboard"} values: "1+1x60" - series: probe_success:burnrate1d{name="rhods-dashboard"} @@ -164,7 +164,7 @@ tests: triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md' - interval: 1m - input_series: + input_series: - series: probe_success:burnrate6h{name="rhods-dashboard"} values: "1+1x200" - series: probe_success:burnrate3d{name="rhods-dashboard"} diff --git a/tests/prometheus_unit_tests/scripts/extract_alerts.sh b/tests/prometheus_unit_tests/scripts/extract_alerts.sh deleted file mode 100755 index 5d85f64cf4c..00000000000 --- a/tests/prometheus_unit_tests/scripts/extract_alerts.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash -set -e - -INPUT_YAML="$1" -OUTPUT_DIR="$2" - -# Define an associative array mapping rule files to alert files -declare -A RULES_TO_ALERTS=( - [rhods-dashboard-alerting.rules]="dashboard_alerts.yaml" - [model-mesh-alerting.rules]="model_mesh_alerts.yaml" - [trustyai-alerting.rules]="trustyai_alerts.yaml" - [odh-model-controller-alerting.rules]="model_controller_alerts.yaml" - [workbenches-alerting.rules]="workbenches_alerts.yaml" - [data-science-pipelines-operator-alerting.rules]="data_science_pipelines_operator_alerts.yaml" - [kserve-alerting.rules]="kserve_alerts.yaml" - [kueue-alerting.rules]="kueue_alerts.yaml" - [ray-alerting.rules]="kuberay_alerts.yaml" - 
[codeflare-alerting.rules]="codeflare_alerts.yaml" - [trainingoperator-alerting.rules]="training_operator_alerts.yaml" -) - -for RULE_FILE in "${!RULES_TO_ALERTS[@]}"; do - ALERT_FILE="${RULES_TO_ALERTS[$RULE_FILE]}" - - echo "Extracting $RULE_FILE to $OUTPUT_DIR/$ALERT_FILE" - yq ".data.\"$RULE_FILE\"" "$INPUT_YAML" > "$OUTPUT_DIR/$ALERT_FILE" -done diff --git a/tests/prometheus_unit_tests/training_operator_unit_tests.yaml b/tests/prometheus_unit_tests/trainingoperator-alerting.unit-tests.yaml similarity index 96% rename from tests/prometheus_unit_tests/training_operator_unit_tests.yaml rename to tests/prometheus_unit_tests/trainingoperator-alerting.unit-tests.yaml index d07cfc94c89..893e8335792 100644 --- a/tests/prometheus_unit_tests/training_operator_unit_tests.yaml +++ b/tests/prometheus_unit_tests/trainingoperator-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - "training_operator_alerts.yaml" + - trainingoperator-alerting.rules.yaml evaluation_interval: 1m @@ -27,11 +27,11 @@ tests: description: "This alert fires when the KubeFlow Training Operator is not running." 
summary: Alerting for KubeFlow Training Operator triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/training-operator-availability.md" - + - interval: 1m input_series: - series: up{job="KubeFlow Training Operator"} - values: "0" + values: "0" alert_rule_test: - eval_time: 2m alertname: KubeFlow Training Operator is not running diff --git a/tests/prometheus_unit_tests/trustyai_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml similarity index 99% rename from tests/prometheus_unit_tests/trustyai_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml index f556eb751d9..2fd1594cbe7 100644 --- a/tests/prometheus_unit_tests/trustyai_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/trustyai-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - trustyai_alerts.yaml + - trustyai-alerting.rules.yaml evaluation_interval: 1m @@ -60,7 +60,7 @@ tests: summary: "TrustyAI Controller Probe Success Burn Rate" message: "High error budget burn for trustyai-service-operator-controller-manager (current value: 16)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhoai-trustyai-controller-probe-success-burn-rate.md' - + - interval: 1m input_series: - series: probe_success:burnrate2h{instance="trustyai-service-operator-controller-manager"} diff --git a/tests/prometheus_unit_tests/workbenches_alerts_unit_tests.yaml b/tests/prometheus_unit_tests/workbenches-alerting.unit-tests.yaml similarity index 98% rename from tests/prometheus_unit_tests/workbenches_alerts_unit_tests.yaml rename to tests/prometheus_unit_tests/workbenches-alerting.unit-tests.yaml index 76adc223c5e..a8b8e3e2369 100644 --- a/tests/prometheus_unit_tests/workbenches_alerts_unit_tests.yaml +++ b/tests/prometheus_unit_tests/workbenches-alerting.unit-tests.yaml @@ -1,5 +1,5 @@ rule_files: - - workbenches_alerts.yaml + - workbenches-alerting.rules.yaml evaluation_interval: 1m @@ -54,8 +54,8 @@ tests: exp_annotations: summary: "User notebook pvc usage at 100%" message: "The user notebook jupyterhub-nb-1a is using 100% of its Volume. You might want to decrease the amount of data stored on the server or you can reach out to your cluster admin to increase the storage capacity to prevent disruptions and loss of data. Please back up your data before increasing the storage limit." - triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/tree/main/RHODS' - + triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/tree/main/RHODS' + # Probe success - interval: 1m input_series: @@ -113,7 +113,7 @@ tests: summary: "RHODS Jupyter Probe Success Burn Rate" message: "High error budget burn for notebook-spawner (current value: 16)." 
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-jupyter-probe-success-burn-rate.md' - + - interval: 1m input_series: - series: probe_success:burnrate2h{instance="notebook-spawner"} @@ -161,7 +161,7 @@ tests: - eval_time: 5m alertname: ODH notebook controller pod is not running exp_alerts: [] - + - interval: 1m input_series: alert_rule_test: @@ -176,7 +176,7 @@ tests: summary: ODH notebook controller pod is not running message: 'ODH notebook controller is down!' triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Jupyter/rhods-odh-notebook-controller-alert.md" - + # Kubeflow notebook controllers running - interval: 1m input_series: @@ -186,7 +186,7 @@ tests: - eval_time: 5m alertname: Kubeflow notebook controller pod is not running exp_alerts: [] - + - interval: 1m input_series: alert_rule_test: