Skip to content

Commit

Permalink
Merge pull request #1 from grdryn/prom-test-things
Browse files Browse the repository at this point in the history
chore: remove need for extract_alerts.sh
  • Loading branch information
biswassri authored Nov 15, 2024
2 parents 291f0a4 + e8509c9 commit 5a453e9
Show file tree
Hide file tree
Showing 17 changed files with 62 additions and 85 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/prometheus-unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ jobs:
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Go
uses: actions/setup-go@v4
with:
go-version-file: go.mod
- name: Install Promtool
run: |
sudo apt-get update && sudo apt-get install -y prometheus
Expand Down
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,3 @@ local.mk

# Ignore temporary files created by the Makefile
*.mktmp.*

#Ignore temporary alert yaml files created by the Makefile
*_alerts.yaml
26 changes: 15 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,10 @@ IMAGE_BUILD_FLAGS ?= --build-arg USE_LOCAL=false

# Prometheus-Unit Tests Parameters
PROMETHEUS_CONFIG_YAML = ./config/monitoring/prometheus/apps/prometheus-configs.yaml
PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps/
GENERATED_ALERT_DIR = ./tests/prometheus_unit_tests/
PROMETHEUS_CONFIG_DIR = ./config/monitoring/prometheus/apps
PROMETHEUS_TEST_DIR = ./tests/prometheus_unit_tests
PROMETHEUS_ALERT_TESTS = $(shell find $(PROMETHEUS_TEST_DIR) -name "*.unit-tests.yaml")

ALERT_SEVERITY = critical

# Read any custom variables overrides from a local.mk file. This will only be read if it exists in the
Expand Down Expand Up @@ -256,11 +258,13 @@ controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessar
$(CONTROLLER_GEN): $(LOCALBIN)
test -s $(CONTROLLER_GEN) || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-tools/cmd/controller-gen@$(CONTROLLER_GEN_VERSION)

.PHONY: yq
yq: $(YQ) ## Download yq locally if necessary.
## Download yq locally if necessary.
$(YQ): $(LOCALBIN)
test -s $(YQ) || GOBIN=$(LOCALBIN) go install github.com/mikefarah/yq/v4@$(YQ_VERSION)

.PHONY: yq
yq: $(YQ)

OPERATOR_SDK_DL_URL ?= https://github.com/operator-framework/operator-sdk/releases/download/$(OPERATOR_SDK_VERSION)
.PHONY: operator-sdk
operator-sdk: $(OPERATOR_SDK) ## Download and install operator-sdk
Expand Down Expand Up @@ -380,19 +384,18 @@ unit-test: envtest
OPERATOR_NAMESPACE=$(OPERATOR_NAMESPACE) KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $(TEST_SRC) -v -coverprofile cover.out
CLEANFILES += cover.out

.PHONY: extract-alert-rules
extract-alert-rules: $(PROMETHEUS_CONFIG_YAML)
./tests/prometheus_unit_tests/scripts/extract_alerts.sh $(PROMETHEUS_CONFIG_YAML) $(GENERATED_ALERT_DIR)
# Extract one rules entry from the Prometheus ConfigMap into a standalone file
# next to its unit-test file. The ConfigMap key is derived from the target's
# file name: foo.rules.yaml is read from .data."foo.rules".
# `yq -e` turns a missing key into a failure instead of silently writing "null",
# and the output goes through a temp file (ignored via *.mktmp.*) so a failed
# extraction never leaves a truncated target that looks up to date.
$(PROMETHEUS_TEST_DIR)/%.rules.yaml: $(PROMETHEUS_TEST_DIR)/%.unit-tests.yaml $(PROMETHEUS_CONFIG_YAML) $(YQ)
	$(YQ) eval -e ".data.\"$(@F:.rules.yaml=.rules)\"" $(PROMETHEUS_CONFIG_YAML) > $@.mktmp.yaml || { rm -f $@.mktmp.yaml; exit 1; }
	mv -f $@.mktmp.yaml $@

# Run prometheus-alert-unit-tests
.PHONY: test-alerts
test-alerts: extract-alert-rules
promtool test rules $(GENERATED_ALERT_DIR)/*_unit_tests.yaml
test-alerts: $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml)
promtool test rules $(PROMETHEUS_ALERT_TESTS)

#Check for alerts without unit-tests
.PHONY: check-prometheus-alert-unit-tests
check-prometheus-alert-unit-tests: extract-alert-rules
./tests/prometheus_unit_tests/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(GENERATED_ALERT_DIR) $(ALERT_SEVERITY)
check-prometheus-alert-unit-tests: $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml)
./tests/prometheus_unit_tests/scripts/check_alert_tests.sh $(PROMETHEUS_CONFIG_YAML) $(PROMETHEUS_TEST_DIR) $(ALERT_SEVERITY)

.PHONY: e2e-test
e2e-test: ## Run e2e tests for the controller
Expand All @@ -403,3 +406,4 @@ clean: $(GOLANGCI_LINT)
$(GOLANGCI_LINT) cache clean
chmod u+w -R $(LOCALBIN) # envtest makes its dir RO
rm -rf $(CLEANFILES)
rm -f $(PROMETHEUS_ALERT_TESTS:.unit-tests.yaml=.rules.yaml) # generated rules may not exist yet
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

This operator is the primary operator for Open Data Hub. It is responsible for enabling Data science applications like
Jupyter Notebooks, Modelmesh serving, Datascience pipelines etc. The operator makes use of `DataScienceCluster` CRD to deploy
and configure these applications.
Expand Down Expand Up @@ -185,7 +184,7 @@ e.g `make image-build -e IMAGE_BUILD_FLAGS="--build-arg USE_LOCAL=true"`
**Deploying operator using OLM**

- To create a new bundle in defined operator namespace, run following command:

```commandline
export OPERATOR_NAMESPACE=<namespace-to-install-operator>
make bundle
Expand Down Expand Up @@ -216,7 +215,7 @@ There are 2 ways to test your changes with modification:

Whenever a new api is added or a new field is added to the CRD, please make sure to run the command:
```commandline
make api-docs
make api-docs
```
This will ensure that the docs for the APIs are updated accordingly.

Expand Down Expand Up @@ -404,7 +403,7 @@ for DataScienceCluster deletion.
make e2e-test -e OPERATOR_NAMESPACE=<namespace> -e E2E_TEST_FLAGS="--skip-deletion=true"
```

## Run Prometheus Unit Tests for Alerts
## Run Prometheus Unit Tests for Alerts

Unit tests for Prometheus alerts are included in the repository. You can run them using the following command:

Expand All @@ -418,7 +417,7 @@ To check for alerts that don't have unit tests, run the below command:
make check-prometheus-alert-unit-tests
```

To add a new unit test, add the alert YAML file and the corresponding rule files to the [extract_alerts](tests/prometheus_unit_tests/scripts/extract_alerts.sh) script.
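To add a new unit test file, give it the same name as the rules entry in the [prometheus ConfigMap](./config/monitoring/prometheus/apps/prometheus-configs.yaml), with the `.rules` suffix replaced by `.unit-tests.yaml`. For example, the tests for `kserve-alerting.rules` go in `kserve-alerting.unit-tests.yaml`, and that file's `rule_files` entry refers to `kserve-alerting.rules.yaml`, which the Makefile generates from the ConfigMap.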

### API Overview

Expand Down
2 changes: 2 additions & 0 deletions tests/prometheus_unit_tests/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore temporary alert yaml files created by the Makefile
*.rules.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- "codeflare_alerts.yaml"
- codeflare-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -93,7 +93,7 @@ tests:
- eval_time: 1m
alertname: CodeFlare Operator is not running
exp_alerts: []

- interval: 1m
input_series:
- series: up{job="CodeFlare Operator"}
Expand All @@ -111,7 +111,7 @@ tests:
description: This alert fires when the CodeFlare Operator is not running.
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md'
summary: Alerting for CodeFlare Operator

- interval: 1m
input_series:
alert_rule_test:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- data_science_pipelines_operator_alerts.yaml
- data-science-pipelines-operator-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -43,7 +43,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: haproxy_backend_http_responses_total:burnrate30m{component="dsp"}
values: "1+1x60"
- series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"}
Expand All @@ -62,7 +62,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: haproxy_backend_http_responses_total:burnrate2h{component="dsp"}
values: "1+1x60"
- series: haproxy_backend_http_responses_total:burnrate1d{component="dsp"}
Expand All @@ -81,7 +81,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: haproxy_backend_http_responses_total:burnrate6h{component="dsp"}
values: "1+1x200"
- series: haproxy_backend_http_responses_total:burnrate3d{component="dsp"}
Expand Down Expand Up @@ -141,7 +141,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: probe_success:burnrate30m{instance="data-science-pipelines-operator"}
values: "1+1x60"
- series: probe_success:burnrate6h{instance="data-science-pipelines-operator"}
Expand All @@ -161,7 +161,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: probe_success:burnrate2h{instance="data-science-pipelines-operator"}
values: "1+1x60"
- series: probe_success:burnrate1d{instance="data-science-pipelines-operator"}
Expand All @@ -182,7 +182,7 @@ tests:

# application unavailable
- interval: 1m
input_series:
input_series:
- series: data_science_pipelines_application_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"}
values: "0x200"
- series: data_science_pipelines_application_apiserver_ready{dspa_name="dspa_instance_1", dspa_namespace="dspa_namespace_a"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- "kserve_alerts.yaml"
- kserve-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -80,4 +80,3 @@ tests:
message: "High error budget burn for kserve-controller-manager (current value: 61)."
summary: Kserve Controller Probe Success Burn Rate
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md"

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- "kueue_alerts.yaml"
- kueue-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -27,7 +27,7 @@ tests:
description: This alert fires when the Kueue Operator is not running.
summary: Alerting for Kueue Operator
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kueue-operator-availability.md'

- interval: 1m
input_series:
- series: up{job="Kueue Operator"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- model_mesh_alerts.yaml
- model-mesh-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -62,7 +62,7 @@ tests:
summary: "Modelmesh Controller Probe Success Burn Rate"
message: "High error budget burn for modelmesh-controller (current value: 16)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md'

- interval: 1m
input_series:
- series: probe_success:burnrate2h{instance="modelmesh-controller"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- model_controller_alerts.yaml
- odh-model-controller-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -62,7 +62,7 @@ tests:
summary: "ODH Model Controller Probe Success Burn Rate"
message: "High error budget burn for odh-model-controller (current value: 16)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-odh-controller-probe-success-burn-rate.md'

- interval: 1m
input_series:
- series: probe_success:burnrate2h{instance="odh-model-controller"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- "kuberay_alerts.yaml"
- "ray-alerting.rules.yaml"

evaluation_interval: 1m

Expand Down Expand Up @@ -27,7 +27,7 @@ tests:
description: This alert fires when the KubeRay Operator is not running.
summary: Alerting for KubeRay Operator
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md'

- interval: 1m
input_series:
- series: up{job="KubeRay Operator"}
Expand All @@ -44,4 +44,3 @@ tests:
description: This alert fires when the KubeRay Operator is not running.
summary: Alerting for KubeRay Operator
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/kuberay-operator-availability.md'

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- dashboard_alerts.yaml
- rhods-dashboard-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -44,7 +44,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: haproxy_backend_http_responses_total:burnrate30m{route="rhods-dashboard"}
values: "1+1x60"
- series: haproxy_backend_http_responses_total:burnrate6h{route="rhods-dashboard"}
Expand All @@ -64,7 +64,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-error-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: haproxy_backend_http_responses_total:burnrate2h{route="rhods-dashboard"}
values: "1+1x60"
- series: haproxy_backend_http_responses_total:burnrate1d{route="rhods-dashboard"}
Expand Down Expand Up @@ -124,7 +124,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: probe_success:burnrate30m{name="rhods-dashboard"}
values: "1+1x60"
- series: probe_success:burnrate6h{name="rhods-dashboard"}
Expand All @@ -144,7 +144,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: probe_success:burnrate2h{name="rhods-dashboard"}
values: "1+1x60"
- series: probe_success:burnrate1d{name="rhods-dashboard"}
Expand All @@ -164,7 +164,7 @@ tests:
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/RHODS-Dashboard/rhods-dashboard-probe-success-burn-rate.md'

- interval: 1m
input_series:
input_series:
- series: probe_success:burnrate6h{name="rhods-dashboard"}
values: "1+1x200"
- series: probe_success:burnrate3d{name="rhods-dashboard"}
Expand Down
27 changes: 0 additions & 27 deletions tests/prometheus_unit_tests/scripts/extract_alerts.sh

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
rule_files:
- "training_operator_alerts.yaml"
- trainingoperator-alerting.rules.yaml

evaluation_interval: 1m

Expand Down Expand Up @@ -27,11 +27,11 @@ tests:
description: "This alert fires when the KubeFlow Training Operator is not running."
summary: Alerting for KubeFlow Training Operator
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/training-operator-availability.md"

- interval: 1m
input_series:
- series: up{job="KubeFlow Training Operator"}
values: "0"
values: "0"
alert_rule_test:
- eval_time: 2m
alertname: KubeFlow Training Operator is not running
Expand Down
Loading

0 comments on commit 5a453e9

Please sign in to comment.