diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index fb3a02c..7d8b460 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -12,7 +12,8 @@ jobs:
       LOGZIO_API_URL: https://api.logz.io
       LOGZIO_API_TOKEN: ${{ secrets.LOGZIO_API_TOKEN }}
       RULES_DS: ${{ secrets.RULES_DS }}
-      CONFIGMAP_ANNOTATION: prometheus.io/kube-rules
+      RULES_CONFIGMAP_ANNOTATION: prometheus.io/kube-rules
+      ALERTMANAGER_CONFIGMAP_ANNOTATION: prometheus.io/kube-alertmanager

     steps:
       - name: Set up Go
@@ -42,5 +43,9 @@ jobs:
         run: go mod download

       - name: Run tests
-        run: go test ./... -cover
+        run: go test -v ./... -coverprofile=coverage.out
+
+      - name: Extract coverage percentage
+        run: go tool cover -func=coverage.out | grep total | awk '{print $3}'
+
diff --git a/README.md b/README.md
index c66ade9..f5bdea3 100644
--- a/README.md
+++ b/README.md
@@ -8,18 +8,28 @@ Before running this software, ensure you have:
 - Access to a Kubernetes cluster
 - Logz.io account with API access

+## Supported contact point types
+- `Email`
+- `Slack`
+- `Pagerduty`
+
+More types will be supported in the future. If you need a specific type, please open an issue with your request.
+
 ## Configuration
 Configure the application using the following environment variables:

-| Environment Variable   | Description                                                                        | Default Value              |
-|------------------------|------------------------------------------------------------------------------------|----------------------------|
-| `LOGZIO_API_TOKEN`     | The API token for your Logz.io account.                                            | `None`                     |
-| `LOGZIO_API_URL`       | The URL endpoint for the Logz.io API.                                              | `https://api.logz.io`      |
-| `CONFIGMAP_ANNOTATION` | The specific annotation the controller should look for in Prometheus alert rules.  | `prometheus.io/kube-rules` |
-| `RULES_DS`             | The metrics data source name in logz.io for the Prometheus rules.                  | `None`                     |
-| `ENV_ID`               | Environment identifier, usually cluster name.                                      | `my-env`                   |
-| `WORKER_COUNT`         | The number of workers to process the alerts.                                       | `2`                        |
+| Environment Variable                | Description                                                                                        | Default Value                     |
+|-------------------------------------|----------------------------------------------------------------------------------------------------|-----------------------------------|
+| `LOGZIO_API_TOKEN`                  | The API token for your Logz.io account.                                                            | `None`                            |
+| `LOGZIO_API_URL`                    | The URL endpoint for the Logz.io API.                                                              | `https://api.logz.io`             |
+| `RULES_CONFIGMAP_ANNOTATION`        | The specific annotation the controller should look for in Prometheus alert rules.                  | `prometheus.io/kube-rules`        |
+| `ALERTMANAGER_CONFIGMAP_ANNOTATION` | The specific annotation the controller should look for in Prometheus alert manager configuration.  | `prometheus.io/kube-alertmanager` |
+| `RULES_DS`                          | The metrics data source name in Logz.io for the Prometheus rules.                                  | `None`                            |
+| `ENV_ID`                            | Environment identifier, usually the cluster name.                                                  | `my-env`                          |
+| `WORKER_COUNT`                      | The number of workers to process the alerts.                                                       | `2`                               |
+| `IGNORE_SLACK_TEXT`                 | Ignore the `text` field of Slack contact points.                                                   | `false`                           |
+| `IGNORE_SLACK_TITLE`                | Ignore the `title` field of Slack contact points.                                                  | `false`                           |

 Please ensure to set all necessary environment variables before running the application.

@@ -30,12 +40,12 @@ To start using the controller:
 2. Navigate to the project directory.
 3. Run the controller `make run-local`.
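For reference, a minimal local run might look like the sketch below. It assumes the `run-local` Make target picks up exported environment variables; the token, data source name, and environment ID values are placeholders, so substitute your own.

```bash
# Placeholder values - replace with your own Logz.io API token, data source name, and cluster name
export LOGZIO_API_URL="https://api.logz.io"
export LOGZIO_API_TOKEN="<your-logzio-api-token>"
export RULES_DS="<your-metrics-datasource-name>"
export RULES_CONFIGMAP_ANNOTATION="prometheus.io/kube-rules"
export ALERTMANAGER_CONFIGMAP_ANNOTATION="prometheus.io/kube-alertmanager"
export ENV_ID="my-cluster"
export WORKER_COUNT="2"

make run-local
```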

-### ConfigMap Format
-The controller is designed to process ConfigMaps containing Prometheus alert rules. These ConfigMaps must be annotated with a specific key that matches the value of the `ANNOTATION` environment variable for the controller to process them.
+### ConfigMap format
+The controller is designed to process ConfigMaps containing Prometheus alert rules and Prometheus alert manager configuration. These ConfigMaps must be annotated with a specific key that matches the value of the `RULES_CONFIGMAP_ANNOTATION` or `ALERTMANAGER_CONFIGMAP_ANNOTATION` environment variables for the controller to process them.

-### Example ConfigMap
+### Example rules ConfigMap

-Below is an example of how a ConfigMap should be structured:
+Below is an example of how a rules ConfigMap should be structured:

 ```yaml
 apiVersion: v1
@@ -60,7 +70,68 @@ data:
 - Replace `prometheus.io/kube-rules` with the actual annotation you use to identify relevant ConfigMaps. The data section should contain your Prometheus alert rules in YAML format.
 - Deploy the configmap to your cluster `kubectl apply -f <configmap-file>.yml`
+### Example alert manager ConfigMap
+
+Below is an example of how an alert manager ConfigMap should be structured:
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: logzio-rules
+  namespace: monitoring
+  annotations:
+    prometheus.io/kube-alertmanager: "true"
+data:
+  all_instances_down_otel_collector: |
+    global:
+      # Global configurations, adjust these to your SMTP server details
+      smtp_smarthost: 'smtp.example.com:587'
+      smtp_from: 'alertmanager@example.com'
+      smtp_auth_username: 'alertmanager'
+      smtp_auth_password: 'password'
+    # The root route on which each incoming alert enters.
+    route:
+      receiver: 'default-receiver'
+      group_by: ['alertname', 'env']
+      group_wait: 30s
+      group_interval: 5m
+      repeat_interval: 1h
+      # Child routes
+      routes:
+        - match:
+            env: production
+          receiver: 'slack-production'
+          continue: true
+        - match:
+            env: staging
+          receiver: 'slack-staging'
+          continue: true
+
+    # Receivers defines ways to send notifications about alerts.
+    receivers:
+      - name: 'default-receiver'
+        email_configs:
+          - to: 'alerts@example.com'
+      - name: 'slack-production'
+        slack_configs:
+          - api_url: 'https://hooks.slack.com/services/T00000000/B00000000/'
+            channel: '#prod-alerts'
+      - name: 'slack-staging'
+        slack_configs:
+          - api_url: 'https://hooks.slack.com/services/T00000000/B11111111/'
+            channel: '#staging-alerts'
+
+```
+- Replace `prometheus.io/kube-alertmanager` with the actual annotation you use to identify relevant ConfigMaps. The data section should contain your Prometheus alert manager configuration in YAML format.
+- Deploy the configmap to your cluster `kubectl apply -f <configmap-file>.yml`
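As a quick sanity check after deploying, you can confirm the ConfigMap carries the annotation the controller watches for. The file name below is illustrative; use whatever file you saved the example to, and adjust the ConfigMap name and namespace to match yours.

```bash
# Apply the alert manager ConfigMap (file name is illustrative)
kubectl apply -f alertmanager-configmap.yml

# Verify the ConfigMap exists and carries the expected annotation
kubectl get configmap logzio-rules -n monitoring -o yaml | grep kube-alertmanager
```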
+
+
 ## Changelog
+- v1.0.3
+  - Handle Prometheus alert manager configuration file
+  - Add CRUD operations for contact points and notification policies
+  - Add `IGNORE_SLACK_TEXT` and `IGNORE_SLACK_TITLE` flags
 - v1.0.2
   - Add `reduce` query to alerts (grafana alerts can evaluate alerts only from reduced data)
 - v1.0.1
diff --git a/common/common.go b/common/common.go
new file mode 100644
index 0000000..b7eaaff
--- /dev/null
+++ b/common/common.go
@@ -0,0 +1,246 @@
+package common
+
+import (
+	"flag"
+	"fmt"
+	"github.com/logzio/logzio_terraform_client/grafana_alerts"
+	"github.com/prometheus/prometheus/model/rulefmt"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/clientcmd"
+	"k8s.io/client-go/util/homedir"
+	"k8s.io/klog/v2"
+	"math/rand"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strconv"
+	"time"
+)
+
+const (
+	LetterBytes   = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+	letterIdxBits = 6                    // 6 bits to represent a letter index
+	letterIdxMask = 1<<letterIdxBits - 1 // all 1-bits, as many as letterIdxBits
+	letterIdxMax  = 63 / letterIdxBits   // number of letter indices fitting in 63 bits
+)
+
+// GenerateRandomString returns a random string of length n
+// borrowed from here https://stackoverflow.com/questions/22892120/how-to-generate-a-random-string-of-a-fixed-length-in-go
+func GenerateRandomString(n int) string {
+	if n <= 0 {
+		return "" // Return an empty string for non-positive lengths
+	}
+	b := make([]byte, n)
+	src := rand.NewSource(time.Now().UnixNano())
+	for i, cache, remain := n-1, src.Int63(), letterIdxMax; i >= 0; {
+		if remain == 0 {
+			cache, remain = src.Int63(), letterIdxMax
+		}
+		if idx := int(cache & letterIdxMask); idx < len(LetterBytes) {
+			b[i] = LetterBytes[idx]
+			i--
+		}
+		cache >>= letterIdxBits
+		remain--
+	}
+
+	return string(b)
+}
+
+// ParseDuration turns a duration string (example: 5m, 1h) into an int64 value
+func ParseDuration(durationStr string) (int64, error) {
+	// Check if the string is empty
+	if durationStr == "" {
+		return 0, fmt.Errorf("duration string is empty")
+	}
+
+	// Handle the special case where the duration string is just a number (assumed to be seconds)
+	if _, err := strconv.Atoi(durationStr); err == nil {
+		seconds, _ := strconv.ParseInt(durationStr, 10, 64)
+		return seconds * int64(time.Second), nil
+	}
+
+	// Parse the duration string
+	duration, err := time.ParseDuration(durationStr)
+	if err != nil {
+		return 0, err
+	}
+
+	// Convert the time.Duration value to an int64
+	return int64(duration), nil
+}
+
+func CreateNameStub(cm *corev1.ConfigMap) string {
+	name := cm.GetObjectMeta().GetName()
+	namespace := cm.GetObjectMeta().GetNamespace()
+
+	return fmt.Sprintf("%s-%s", namespace, name)
+}
+
+// IsAlertEqual compares two AlertRule objects for equality.
+func IsAlertEqual(rule rulefmt.RuleNode, grafanaRule grafana_alerts.GrafanaAlertRule) bool {
+	// Start with name comparison; if these don't match, they're definitely not equal.
+	if rule.Alert.Value != grafanaRule.Title {
+		return false
+	}
+	if !reflect.DeepEqual(rule.Labels, grafanaRule.Labels) {
+		return false
+	}
+	if !reflect.DeepEqual(rule.Annotations, grafanaRule.Annotations) {
+		return false
+	}
+	forAtt, _ := ParseDuration(rule.For.String())
+	if forAtt != grafanaRule.For {
+		return false
+	}
+	if rule.Expr.Value != grafanaRule.Data[0].Model.(map[string]interface{})["expr"] {
+		return false
+	}
+	return true
+}
+
+// GetConfig returns a Kubernetes config
+func GetConfig() (*rest.Config, error) {
+	var config *rest.Config
+
+	kubeconfig := filepath.Join(homedir.HomeDir(), ".kube", "config")
+	if _, err := os.Stat(kubeconfig); err == nil {
+		config, err = clientcmd.BuildConfigFromFlags("", kubeconfig)
+		if err != nil {
+			return nil, err
+		}
+	} else {
+		config, err = rest.InClusterConfig()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return config, nil
+}
+
+// Config holds all the configuration needed for the application to run.
+type Config struct { + RulesAnnotation string + AlertManagerAnnotation string + LogzioAPIToken string + LogzioAPIURL string + RulesDS string + EnvID string + WorkerCount int + IgnoreSlackText bool + IgnoreSlackTitle bool +} diff --git a/common/common_test.go b/common/common_test.go new file mode 100644 index 0000000..e5fc18f --- /dev/null +++ b/common/common_test.go @@ -0,0 +1,199 @@ +package common + +import ( + "github.com/logzio/logzio_terraform_client/grafana_alerts" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/rulefmt" + "gopkg.in/yaml.v3" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "strings" + "testing" + "time" +) + +func TestGenerateRandomString(t *testing.T) { + testCases := []struct { + name string + length int + }{ + {"length 10", 10}, + {"length 0", 0}, + {"negative length", -1}, + {"large length", 1000}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + result := GenerateRandomString(tc.length) + + if len(result) != tc.length && tc.length >= 0 { + t.Errorf("Expected string of length %d, got string of length %d", tc.length, len(result)) + } + + for _, char := range result { + if !strings.Contains(LetterBytes, string(char)) { + t.Errorf("generateRandomString() produced a string with invalid character: %v", char) + } + } + + if tc.length > 0 { + otherResult := GenerateRandomString(tc.length) + if result == otherResult { + t.Errorf("generateRandomString() does not seem to produce random strings") + } + } + }) + } +} + +func TestParseDuration(t *testing.T) { + tests := []struct { + input string + expected int64 + err bool + }{ + {"", 0, true}, + {"123", 123 * int64(time.Second), false}, + {"1h", int64(time.Hour), false}, + {"invalid", 0, true}, + } + + for _, test := range tests { + duration, err := ParseDuration(test.input) + if test.err && err == nil { + t.Errorf("Expected error for input %s", test.input) + } + if !test.err && duration != test.expected { + t.Errorf("Expected %d, got %d for input %s", test.expected, duration, test.input) + } + } +} + +func TestCreateNameStub(t *testing.T) { + cm := &v1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-name", + Namespace: "test-namespace", + }, + } + expected := "test-namespace-test-name" + stub := CreateNameStub(cm) + if stub != expected { + t.Errorf("Expected %s, got %s", expected, stub) + } +} + +func TestIsAlertEqual(t *testing.T) { + // dummy time duration + tenMinutes, _ := model.ParseDuration("10m") + tenMinutesNs := int64(10 * time.Minute) + fiveMinutes, _ := model.ParseDuration("5m") + + // dummy expression nodes + exprNode := yaml.Node{Value: "metric > 0.5"} + exprQuery := []*grafana_alerts.GrafanaAlertQuery{{Model: map[string]interface{}{"expr": "metric > 0.5"}}} + differentExprQuery := []*grafana_alerts.GrafanaAlertQuery{{Model: map[string]interface{}{"expr": "metric > 0.7"}}} + + testCases := []struct { + name string + rule rulefmt.RuleNode + grafanaRule grafana_alerts.GrafanaAlertRule + expected bool + }{ + { + name: "same rules", + rule: rulefmt.RuleNode{ + Alert: yaml.Node{Value: "SameName"}, + Expr: exprNode, + For: tenMinutes, + Labels: map[string]string{"severity": "critical"}, + Annotations: map[string]string{"summary": "High CPU usage"}, + }, + grafanaRule: grafana_alerts.GrafanaAlertRule{ + Title: "SameName", + Data: exprQuery, + For: tenMinutesNs, + Labels: map[string]string{"severity": "critical"}, + Annotations: map[string]string{"summary": "High CPU usage"}, + }, + expected: true, + }, + { + name: 
"different titles", + rule: rulefmt.RuleNode{ + Alert: yaml.Node{Value: "AlertName1"}, + Expr: exprNode, + For: tenMinutes, + }, + grafanaRule: grafana_alerts.GrafanaAlertRule{ + Title: "AlertName2", + Data: exprQuery, + For: tenMinutesNs, + }, + expected: false, + }, + { + name: "different labels", + rule: rulefmt.RuleNode{ + Alert: yaml.Node{Value: "SameName"}, + Expr: exprNode, + Labels: map[string]string{"severity": "warning"}, + }, + grafanaRule: grafana_alerts.GrafanaAlertRule{ + Title: "SameName", + Labels: map[string]string{"severity": "critical"}, + Data: exprQuery, + }, + expected: false, + }, + { + name: "different annotations", + rule: rulefmt.RuleNode{ + Alert: yaml.Node{Value: "SameName"}, + Expr: exprNode, + Annotations: map[string]string{"description": "CPU usage is high"}, + }, + grafanaRule: grafana_alerts.GrafanaAlertRule{ + Title: "SameName", + Annotations: map[string]string{"description": "Disk usage is high"}, + Data: exprQuery, + }, + expected: false, + }, + { + name: "different expressions", + rule: rulefmt.RuleNode{ + Alert: yaml.Node{Value: "SameName"}, + Expr: exprNode, + }, + grafanaRule: grafana_alerts.GrafanaAlertRule{ + Title: "SameName", + Data: differentExprQuery, + }, + expected: false, + }, + { + name: "different durations", + rule: rulefmt.RuleNode{ + Alert: yaml.Node{Value: "SameName"}, + Expr: exprNode, + For: fiveMinutes, + }, + grafanaRule: grafana_alerts.GrafanaAlertRule{ + Title: "SameName", + Data: exprQuery, + For: tenMinutesNs, + }, + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + if got := IsAlertEqual(tc.rule, tc.grafanaRule); got != tc.expected { + t.Errorf("isAlertEqual() for test case %q = %v, want %v", tc.name, got, tc.expected) + } + }) + } +} diff --git a/controller/controller.go b/controller/controller.go index dad7991..a7d77e6 100644 --- a/controller/controller.go +++ b/controller/controller.go @@ -2,13 +2,12 @@ package controller import ( "context" - "encoding/json" "fmt" - "time" - "github.com/logzio/logzio_terraform_client/grafana_alerts" - "github.com/logzio/logzio_terraform_client/grafana_datasources" - "github.com/logzio/logzio_terraform_client/grafana_folders" + "github.com/logzio/logzio_terraform_client/grafana_contact_points" + "github.com/logzio/prometheus-alerts-migrator/common" + "github.com/logzio/prometheus-alerts-migrator/logzio_alerts_client" + alert_manager_config "github.com/prometheus/alertmanager/config" "github.com/prometheus/prometheus/model/rulefmt" _ "github.com/prometheus/prometheus/promql/parser" "gopkg.in/yaml.v3" @@ -19,7 +18,8 @@ import ( "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/workqueue" - "k8s.io/klog" + "k8s.io/klog/v2" + "time" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -30,55 +30,10 @@ import ( ) const ( - alertFolder = "prometheus-alerts" controllerAgentName = "logzio-prometheus-alerts-migrator-controller" ErrInvalidKey = "InvalidKey" - letterBytes = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" - letterIdxBits = 6 // 6 bits to represent a letter index - letterIdxMask = 1< 0 { - c.writeRules(toAdd, folderUid) + /* + Processing logic: + 1. compare contact points with logz.io managed contact points + 2. if contact point is not found at logz.io, add it + 3. if contact point is found at logz.io, update it + 4. handle setting new notification policy tree after contact points are processed, to prevent missing contact points at logzio + 5. 
delete contact points from logz.io that are not found in the alert manager configmap + Note: `name` field is the identifier for contact points, when a user changes the name of a contact point, it will delete the old one and create a new one, so we handle deletion after setting the new notification policy tree to avoid deleting contact points that are in use + */ + + contactPointsToAdd, contactPointsToUpdate, contactPointsToDelete := c.compareContactPoints(receiversMap, logzioContactPoints) + if len(contactPointsToUpdate) > 0 { + c.logzioGrafanaAlertsClient.UpdateContactPoints(contactPointsToUpdate, logzioContactPoints) } - if len(toUpdate) > 0 { - c.updateRules(toUpdate, logzioRulesMap, folderUid) + if len(contactPointsToAdd) > 0 { + c.logzioGrafanaAlertsClient.WriteContactPoints(contactPointsToAdd) } - if len(toDelete) > 0 { - c.deleteRules(toDelete, folderUid) + // Handle the notification policies after contact points are processed, to prevent missing contact points at logzio + c.logzioGrafanaAlertsClient.SetNotificationPolicyTreeFromRouteTree(routeTree) + + if len(contactPointsToDelete) > 0 { + c.logzioGrafanaAlertsClient.DeleteContactPoints(contactPointsToDelete) } return nil } -// deleteRules deletes the rules from logz.io -func (c *Controller) deleteRules(rulesToDelete []grafana_alerts.GrafanaAlertRule, folderUid string) { - for _, rule := range rulesToDelete { - err := c.logzioAlertClient.DeleteGrafanaAlertRule(rule.Uid) - if err != nil { - klog.Warningf("Error deleting rule: %s - %s", rule.Title, err.Error()) - } - } -} +func (c *Controller) getClusterReceiversAndRoutes(configmap *corev1.ConfigMap) ([]alert_manager_config.Receiver, *alert_manager_config.Route, error) { + var receivers []alert_manager_config.Receiver + var routeTree alert_manager_config.Route + if c.isAlertManagerConfigMap(configmap) { + for _, value := range configmap.Data { + alertManagerConfig, err := alert_manager_config.Load(value) + if err != nil { + utilruntime.HandleError(fmt.Errorf("unable to load alert manager config; %s", err)) + return nil, &alert_manager_config.Route{}, err + } + // Add prefix to distinguish between alert manager imported from alert manager and logz.io custom contact points + stub := common.CreateNameStub(configmap) + for _, receiver := range alertManagerConfig.Receivers { + receiver.Name = fmt.Sprintf("%s-%s-%s", c.envId, stub, receiver.Name) + receivers = append(receivers, receiver) -// updateRules updates the rules in logz.io -func (c *Controller) updateRules(rulesToUpdate []rulefmt.RuleNode, logzioRulesMap map[string]grafana_alerts.GrafanaAlertRule, folderUid string) { - for _, rule := range rulesToUpdate { - // Retrieve the existing GrafanaAlertRule to get the Uid. - existingRule := logzioRulesMap[rule.Alert.Value] - alert, err := c.generateGrafanaAlert(rule, folderUid) - if err != nil { - klog.Warning(err) - continue // Skip this rule and continue with the next - } - // Set the Uid from the existing rule. 
- alert.Uid = existingRule.Uid - err = c.logzioAlertClient.UpdateGrafanaAlertRule(alert) - if err != nil { - klog.Warningf("Error updating rule: %s - %s", alert.Title, err.Error()) + } + // Add prefix to routes to match with contact points + routeTree = *alertManagerConfig.Route + routeTree.Receiver = fmt.Sprintf("%s-%s-%s", c.envId, stub, routeTree.Receiver) + for _, route := range routeTree.Routes { + route.Receiver = fmt.Sprintf("%s-%s-%s", c.envId, stub, route.Receiver) + } + // setting the `AlertManagerGlobalConfig` context for logzio grafana alerts client + c.logzioGrafanaAlertsClient.AlertManagerGlobalConfig = alertManagerConfig.Global } } + klog.Infof("Found %d receivers and %d routes, in %s", len(receivers), len(routeTree.Routes), configmap.Name) + return receivers, &routeTree, nil } -// writeRules writes the rules to logz.io -func (c *Controller) writeRules(rulesToWrite []rulefmt.RuleNode, folderUid string) { - for _, rule := range rulesToWrite { - alert, err := c.generateGrafanaAlert(rule, folderUid) - if err != nil { - klog.Warning(err) +// compareContactPoints +func (c *Controller) compareContactPoints(receiversMap map[string]alert_manager_config.Receiver, logzioContactPoints []grafana_contact_points.GrafanaContactPoint) (contactPointsToAdd, contactPointsToUpdate []alert_manager_config.Receiver, contactPointsToDelete []grafana_contact_points.GrafanaContactPoint) { + // Initialize a map with slices as values for Logz.io contact points + existingContactPoints := make(map[string][]grafana_contact_points.GrafanaContactPoint) + for _, contactPoint := range logzioContactPoints { + existingContactPoints[contactPoint.Name] = append(existingContactPoints[contactPoint.Name], contactPoint) + } + // Iterate over receivers to find which ones to add or update + for receiverName, receiver := range receiversMap { + _, exists := existingContactPoints[receiverName] + if !exists { + // If the receiver does not exist in Logz.io contact points, add it + contactPointsToAdd = append(contactPointsToAdd, receiver) + } else { + // If the receiver exists in Logz.io contact points, override with the alert manager receiver state + contactPointsToUpdate = append(contactPointsToUpdate, receiver) } - _, err = c.logzioAlertClient.CreateGrafanaAlertRule(alert) - if err != nil { - klog.Warning("Error writing rule:", alert.Title, err.Error()) + } + // Iterate over Logz.io contact points to find which ones to delete + for _, contactPoints := range existingContactPoints { + for _, contactPoint := range contactPoints { + if _, exists := receiversMap[contactPoint.Name]; !exists { + // If the Logz.io contact point does not exist among the receivers, delete it + contactPointsToDelete = append(contactPointsToDelete, contactPoint) + } } } + + return contactPointsToAdd, contactPointsToUpdate, contactPointsToDelete } -// generateGrafanaAlert generates a GrafanaAlertRule from a Prometheus rule -func (c *Controller) generateGrafanaAlert(rule rulefmt.RuleNode, folderUid string) (grafana_alerts.GrafanaAlertRule, error) { - // Create promql query to return time series data for the expression. 
- promqlQuery := PrometheusQueryModel{ - Expr: rule.Expr.Value, - Hide: false, - RefId: refIdA, +// processRulesConfigMaps gets the state of alert rules from both cluster configmaps and logz.io, compares the rules and decide what crud operations to perform +func (c *Controller) processRulesConfigMaps(mapList *corev1.ConfigMapList) error { + alertRules := c.getClusterAlertRules(mapList) + folderUid, err := c.logzioGrafanaAlertsClient.FindOrCreatePrometheusAlertsFolder() + if err != nil { + utilruntime.HandleError(err) + return err } - // Use the ToJSON method to marshal the Query struct. - promqlModel, err := promqlQuery.ToJSON() + logzioAlertRules, err := c.logzioGrafanaAlertsClient.GetLogzioGrafanaAlerts(folderUid) if err != nil { - return grafana_alerts.GrafanaAlertRule{}, err - } - queryA := grafana_alerts.GrafanaAlertQuery{ - DatasourceUid: c.rulesDataSource, - Model: promqlModel, - RefId: refIdA, - QueryType: queryType, - RelativeTimeRange: grafana_alerts.RelativeTimeRangeObj{ - From: 300, - To: 0, - }, + utilruntime.HandleError(err) + return err } - // Create reduce query to return the reduced last value of the time series data. - reduceQuery := ReduceQueryModel{ - DataSource: map[string]string{ - "type": expressionString, - "uid": expressionString, - }, - Expression: refIdA, - Hide: false, - RefId: refIdB, - Reducer: "last", - Type: "reduce", + // Maps for efficient lookups by alert name. + rulesMap := make(map[string]rulefmt.RuleNode, len(*alertRules)) + logzioRulesMap := make(map[string]grafana_alerts.GrafanaAlertRule, len(logzioAlertRules)) + // Process Kubernetes alerts into a map. + for _, alert := range *alertRules { + rulesMap[alert.Alert.Value] = alert } - reduceModel, err := reduceQuery.ToJSON() - if err != nil { - return grafana_alerts.GrafanaAlertRule{}, err - } - queryB := grafana_alerts.GrafanaAlertQuery{ - DatasourceUid: expressionString, - Model: reduceModel, - RefId: refIdB, - QueryType: "", - RelativeTimeRange: grafana_alerts.RelativeTimeRangeObj{ - From: 300, - To: 0, - }, + // Process Logz.io alerts into a map. + for _, alert := range logzioAlertRules { + logzioRulesMap[alert.Title] = alert } - duration, err := parseDuration(rule.For.String()) - if err != nil { - return grafana_alerts.GrafanaAlertRule{}, err - } - - // Create the GrafanaAlertRule, we are alerting on the reduced last value of the time series data (query b). 
- grafanaAlert := grafana_alerts.GrafanaAlertRule{ - Annotations: rule.Annotations, - Condition: refIdB, - Data: []*grafana_alerts.GrafanaAlertQuery{&queryA, &queryB}, - FolderUID: folderUid, - NoDataState: grafana_alerts.NoDataOk, - ExecErrState: grafana_alerts.ErrOK, - Labels: rule.Labels, - OrgID: 1, - RuleGroup: rule.Alert.Value, - Title: rule.Alert.Value, - For: duration, - } - return grafanaAlert, nil + c.processAlertRules(rulesMap, logzioRulesMap, folderUid) + return nil } -// enqueueConfigMap get the cm on the workqueue -func (c *Controller) enqueueConfigMap(obj interface{}) { - var key string - var err error - if key, err = cache.MetaNamespaceKeyFunc(obj); err != nil { - utilruntime.HandleError(err) - return +func (c *Controller) processAlertRules(rulesMap map[string]rulefmt.RuleNode, logzioRulesMap map[string]grafana_alerts.GrafanaAlertRule, folderUid string) { + rulesToAdd, rulesToUpdate, rulesToDelete := c.compareAlertRules(rulesMap, logzioRulesMap) + klog.Infof("Alert rules summary: to add: %d, to update: %d, to delete: %d", len(rulesToAdd), len(rulesToUpdate), len(rulesToDelete)) + + if len(rulesToAdd) > 0 { + c.logzioGrafanaAlertsClient.WriteRules(rulesToAdd, folderUid) + } + if len(rulesToUpdate) > 0 { + c.logzioGrafanaAlertsClient.UpdateRules(rulesToUpdate, logzioRulesMap, folderUid) + } + if len(rulesToDelete) > 0 { + c.logzioGrafanaAlertsClient.DeleteRules(rulesToDelete, folderUid) } - c.workqueue.Add(key) } // getClusterAlertRules builds a list of rules from all the configmaps in the cluster @@ -497,50 +430,10 @@ func (c *Controller) getClusterAlertRules(mapList *corev1.ConfigMapList) *[]rule return &finalRules } -// getLogzioGrafanaAlerts builds a list of rules from all logz.io -func (c *Controller) getLogzioGrafanaAlerts(folderUid string) ([]grafana_alerts.GrafanaAlertRule, error) { - alertRules, ListLogzioRulesErr := c.logzioAlertClient.ListGrafanaAlertRules() - if ListLogzioRulesErr != nil { - return nil, ListLogzioRulesErr - } - // find all alerts inside prometheus alerts folder - var alertsInFolder []grafana_alerts.GrafanaAlertRule - for _, rule := range alertRules { - if rule.FolderUID == folderUid { - alertsInFolder = append(alertsInFolder, rule) - } - } - return alertsInFolder, nil -} - -// findOrCreatePrometheusAlertsFolder tries to find the prometheus alerts folder in logz.io, if it does not exist it creates it. 
-func (c *Controller) findOrCreatePrometheusAlertsFolder() (string, error) { - folders, err := c.logzioFolderClient.ListGrafanaFolders() - if err != nil { - return "", err - } - envFolderTitle := fmt.Sprintf("%s-%s", c.envId, alertFolder) - // try to find the prometheus alerts folder - for _, folder := range folders { - if folder.Title == envFolderTitle { - return folder.Uid, nil - } - } - // if not found, create the prometheus alerts folder - grafanaFolder, err := c.logzioFolderClient.CreateGrafanaFolder(grafana_folders.CreateUpdateFolder{ - Uid: fmt.Sprintf("%s-%s", envFolderTitle, generateRandomString(randomStringLength)), - Title: envFolderTitle, - }) - if err != nil { - return "", err - } - return grafanaFolder.Uid, nil -} - // extractValues extracts the rules from the configmap, and validates them func (c *Controller) extractValues(cm *corev1.ConfigMap) []rulefmt.RuleNode { - fallbackNameStub := createNameStub(cm) + fallbackNameStub := common.CreateNameStub(cm) var toalRules []rulefmt.RuleNode @@ -597,16 +490,16 @@ func (c *Controller) extractRules(value string) (error, rulefmt.RuleNode) { // compareAlertRules compares the rules from Kubernetes with those in Logz.io. // It returns three slices of rulefmt.RuleNode and grafana_alerts.GrafanaAlertRule indicating which rules to add, update, or delete. -func (c *Controller) compareAlertRules(k8sRulesMap map[string]rulefmt.RuleNode, logzioRulesMap map[string]grafana_alerts.GrafanaAlertRule) (toAdd, toUpdate []rulefmt.RuleNode, toDelete []grafana_alerts.GrafanaAlertRule) { +func (c *Controller) compareAlertRules(k8sRulesMap map[string]rulefmt.RuleNode, logzioRulesMap map[string]grafana_alerts.GrafanaAlertRule) (rulesToAdd, rulesToUpdate []rulefmt.RuleNode, rulesToDelete []grafana_alerts.GrafanaAlertRule) { // Determine rules to add or update. for alertName, k8sRule := range k8sRulesMap { logzioRule, exists := logzioRulesMap[alertName] if !exists { // Alert doesn't exist in Logz.io, needs to be added. - toAdd = append(toAdd, k8sRule) - } else if !isAlertEqual(k8sRule, logzioRule) { + rulesToAdd = append(rulesToAdd, k8sRule) + } else if !common.IsAlertEqual(k8sRule, logzioRule) { // Alert exists but differs, needs to be updated. - toUpdate = append(toUpdate, k8sRule) + rulesToUpdate = append(rulesToUpdate, k8sRule) } } @@ -614,11 +507,11 @@ func (c *Controller) compareAlertRules(k8sRulesMap map[string]rulefmt.RuleNode, for alertName := range logzioRulesMap { if _, exists := k8sRulesMap[alertName]; !exists { // Alert is in Logz.io but not in Kubernetes, needs to be deleted. 
- toDelete = append(toDelete, logzioRulesMap[alertName]) + rulesToDelete = append(rulesToDelete, logzioRulesMap[alertName]) } } - return toAdd, toUpdate, toDelete + return rulesToAdd, rulesToUpdate, rulesToDelete } // isRuleConfigMap checks if the configmap is a rule configmap @@ -629,11 +522,24 @@ func (c *Controller) isRuleConfigMap(cm *corev1.ConfigMap) bool { annotations := cm.GetObjectMeta().GetAnnotations() for key := range annotations { - if key == *c.interestingAnnotation { + if key == *c.rulesAnnotation { return true } } + return false +} +// isAlertManagerConfigMap checks if the configmap is a rule configmap +func (c *Controller) isAlertManagerConfigMap(cm *corev1.ConfigMap) bool { + if cm == nil { + return false + } + annotations := cm.GetObjectMeta().GetAnnotations() + for key := range annotations { + if key == *c.alertManagerAnnotation { + return true + } + } return false } @@ -645,8 +551,8 @@ func (c *Controller) haveConfigMapsChanged(mapList *corev1.ConfigMapList) bool { return false } for _, cm := range mapList.Items { - if c.isRuleConfigMap(&cm) { - stub := createNameStub(&cm) + if c.isRuleConfigMap(&cm) || c.isAlertManagerConfigMap(&cm) { + stub := common.CreateNameStub(&cm) val, ok := c.resourceVersionMap[stub] if !ok { // new configmap diff --git a/controller/controller_e2e_cp_test.go b/controller/controller_e2e_cp_test.go new file mode 100644 index 0000000..29025d0 --- /dev/null +++ b/controller/controller_e2e_cp_test.go @@ -0,0 +1,70 @@ +package controller + +import ( + "github.com/logzio/prometheus-alerts-migrator/common" + "github.com/logzio/prometheus-alerts-migrator/pkg/signals" + "github.com/stretchr/testify/assert" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + "log" + "testing" + "time" +) + +var stopCh = signals.SetupSignalHandler() + +func cleanupLogzioContactPoints(ctl Controller) { + contactPoints, err := ctl.logzioGrafanaAlertsClient.GetLogzioManagedGrafanaContactPoints() + if err != nil { + log.Fatalf("Failed to get logzio contact points: %v", err) + } + ctl.logzioGrafanaAlertsClient.DeleteContactPoints(contactPoints) +} + +// TestControllerE2E is the main function that runs the end-to-end test +func TestControllerContactPointsE2E(t *testing.T) { + // Setup the test environment + config, err := common.GetConfig() + if err != nil { + t.Fatalf("Failed to get Kubernetes config: %v", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + t.Fatalf("Failed to create Kubernetes clientset: %v", err) + } + ctlConfig := common.NewConfig() + kubeInformerFactory := informers.NewSharedInformerFactory(clientset, time.Second*30) + // set up signals so we handle the first shutdown signal gracefully + // Instantiate the controller + ctrl := NewController(clientset, kubeInformerFactory.Core().V1().ConfigMaps(), *ctlConfig) + + kubeInformerFactory.Start(stopCh) + + // test contact points + defer cleanupLogzioContactPoints(*ctrl) + defer cleanupTestCluster(clientset, testNamespace, "alert-manager") + err = deployConfigMaps(clientset, "../testdata/alert_manager_contact_points.yaml") + if err != nil { + t.Fatalf("Failed to deploy ConfigMaps: %v", err) + } + go func() { + runErr := ctrl.Run(1, stopCh) + if runErr != nil { + t.Errorf("Failed to run controller: %v", runErr) + return + } + }() + t.Log("going to sleep") + time.Sleep(time.Second * 10) + logzioContactPoints, err := ctrl.logzioGrafanaAlertsClient.GetLogzioManagedGrafanaContactPoints() + t.Log("logzio contact points:") + for i, contactPoint := range logzioContactPoints { + 
t.Logf("%d: %v", i, contactPoint.Name) + } + if err != nil { + t.Fatalf("Failed to get logzio contact points: %v", err) + } + assert.Equal(t, 13, len(logzioContactPoints)) + +} diff --git a/controller/controller_e2e_np_test.go b/controller/controller_e2e_np_test.go new file mode 100644 index 0000000..78fb51c --- /dev/null +++ b/controller/controller_e2e_np_test.go @@ -0,0 +1,72 @@ +package controller + +import ( + "github.com/logzio/prometheus-alerts-migrator/common" + "github.com/stretchr/testify/assert" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + "testing" + "time" +) + +func cleanupLogzioNotificationPolicies(ctl Controller) { + err := ctl.logzioGrafanaAlertsClient.ResetNotificationPolicyTree() + if err != nil { + klog.Error(err) + } + cleanupLogzioContactPoints(ctl) +} + +// TestControllerE2E is the main function that runs the end-to-end test +func TestControllerNotificationPoliciesE2E(t *testing.T) { + // Setup the test environment + config, err := common.GetConfig() + if err != nil { + t.Fatalf("Failed to get Kubernetes config: %v", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + t.Fatalf("Failed to create Kubernetes clientset: %v", err) + } + ctlConfig := common.NewConfig() + kubeInformerFactory := informers.NewSharedInformerFactory(clientset, time.Second*30) + // Instantiate the controller + ctrl := NewController(clientset, kubeInformerFactory.Core().V1().ConfigMaps(), *ctlConfig) + // cleanup before starting the test to start in a clean env + cleanupLogzioNotificationPolicies(*ctrl) + cleanupLogzioContactPoints(*ctrl) + // test contact points + defer cleanupLogzioNotificationPolicies(*ctrl) + defer cleanupTestCluster(clientset, testNamespace, "alert-manager-np") + err = deployConfigMaps(clientset, "../testdata/alert_manager_notification_policies.yaml") + if err != nil { + t.Fatalf("Failed to deploy ConfigMaps: %v", err) + } + go func() { + runErr := ctrl.Run(1, stopCh) + if runErr != nil { + t.Errorf("Failed to run controller: %v", runErr) + return + } + }() + t.Log("going to sleep") + time.Sleep(time.Second * 10) + logzioContactPoints, err := ctrl.logzioGrafanaAlertsClient.GetLogzioManagedGrafanaContactPoints() + t.Log("logzio contact points:") + for i, contactPoint := range logzioContactPoints { + t.Logf("%d: %v", i, contactPoint.Name) + } + if err != nil { + t.Fatalf("Failed to get logzio contact points: %v", err) + } + assert.Equal(t, 8, len(logzioContactPoints)) + logzioNotificationPolicyTree, err := ctrl.logzioGrafanaAlertsClient.GetLogzioGrafanaNotificationPolicies() + assert.Equal(t, "my-env-alert-migrator-test-alert-manager-np-default-email", logzioNotificationPolicyTree.Receiver) + t.Log("logzio routes:") + for i, route := range logzioNotificationPolicyTree.Routes { + t.Logf("route %d: %v", i, route.Receiver) + } + assert.Equal(t, 7, len(logzioNotificationPolicyTree.Routes)) +} diff --git a/controller/controller_e2e_test.go b/controller/controller_e2e_rules_test.go similarity index 78% rename from controller/controller_e2e_test.go rename to controller/controller_e2e_rules_test.go index ede69d5..6638bae 100644 --- a/controller/controller_e2e_test.go +++ b/controller/controller_e2e_rules_test.go @@ -3,9 +3,8 @@ package controller import ( "context" "fmt" - "github.com/logzio/prometheus-alerts-migrator/pkg/signals" + "github.com/logzio/prometheus-alerts-migrator/common" "github.com/stretchr/testify/assert" - "io/ioutil" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"k8s.io/client-go/informers" @@ -23,7 +22,7 @@ const testNamespace = "alert-migrator-test" func deployConfigMaps(clientset *kubernetes.Clientset, configs ...string) error { for _, config := range configs { // Read the YAML file content - yamlContent, err := ioutil.ReadFile(config) + yamlContent, err := os.ReadFile(config) if err != nil { return fmt.Errorf("failed to read YAML file %s: %v", config, err) } @@ -63,22 +62,21 @@ func cleanupTestCluster(clientset *kubernetes.Clientset, namespace string, confi } func cleanupLogzioAlerts(ctl Controller) { - folderUid, err := ctl.findOrCreatePrometheusAlertsFolder() + folderUid, err := ctl.logzioGrafanaAlertsClient.FindOrCreatePrometheusAlertsFolder() if err != nil { log.Fatalf("Failed to get logzio alerts folder uid: %v", err) } - logzioAlerts, err := ctl.getLogzioGrafanaAlerts(folderUid) + logzioAlerts, err := ctl.logzioGrafanaAlertsClient.GetLogzioGrafanaAlerts(folderUid) if err != nil { log.Fatalf("Failed to get logzio alerts: %v", err) } - // defer cleanup - ctl.deleteRules(logzioAlerts, folderUid) + ctl.logzioGrafanaAlertsClient.DeleteRules(logzioAlerts, folderUid) } // TestControllerE2E is the main function that runs the end-to-end test -func TestControllerE2E(t *testing.T) { +func TestControllerRulesE2E(t *testing.T) { // Setup the test environment - config, err := GetConfig() + config, err := common.GetConfig() if err != nil { t.Fatalf("Failed to get Kubernetes config: %v", err) } @@ -87,21 +85,15 @@ func TestControllerE2E(t *testing.T) { if err != nil { t.Fatalf("Failed to create Kubernetes clientset: %v", err) } - logzioUrl := os.Getenv("LOGZIO_API_URL") - logzioAPIToken := os.Getenv("LOGZIO_API_TOKEN") - rulesDS := os.Getenv("RULES_DS") - anno := os.Getenv("CONFIGMAP_ANNOTATION") + ctlConfig := common.NewConfig() kubeInformerFactory := informers.NewSharedInformerFactory(clientset, time.Second*30) - // set up signals so we handle the first shutdown signal gracefully - stopCh := signals.SetupSignalHandler() // Instantiate the controller - ctrl := NewController(clientset, kubeInformerFactory.Core().V1().ConfigMaps(), &anno, logzioAPIToken, logzioUrl, rulesDS, "integration-test") + ctrl := NewController(clientset, kubeInformerFactory.Core().V1().ConfigMaps(), *ctlConfig) // defer cleanup defer cleanupLogzioAlerts(*ctrl) defer cleanupTestCluster(clientset, testNamespace, "opentelemetry-rules", "infrastructure-rules") - kubeInformerFactory.Start(stopCh) err = deployConfigMaps(clientset, "../testdata/cm.yml", "../testdata/cm2.yml") if err != nil { t.Fatalf("Failed to deploy ConfigMaps: %v", err) @@ -115,14 +107,18 @@ func TestControllerE2E(t *testing.T) { }() t.Log("going to sleep") time.Sleep(time.Second * 10) - folderUid, err := ctrl.findOrCreatePrometheusAlertsFolder() + folderUid, err := ctrl.logzioGrafanaAlertsClient.FindOrCreatePrometheusAlertsFolder() if err != nil { t.Fatalf("Failed to get logzio alerts folder uid: %v", err) } - logzioAlerts, err := ctrl.getLogzioGrafanaAlerts(folderUid) + logzioAlerts, err := ctrl.logzioGrafanaAlertsClient.GetLogzioGrafanaAlerts(folderUid) if err != nil { t.Fatalf("Failed to get logzio alerts: %v", err) } + t.Log("logzio alert rules:") + for i, alert := range logzioAlerts { + t.Logf("%d: %v", i, alert.Title) + } assert.Equal(t, 14, len(logzioAlerts)) } diff --git a/controller/controller_test.go b/controller/controller_test.go index d1c9b0d..1c17ca3 100644 --- a/controller/controller_test.go +++ b/controller/controller_test.go @@ -1,27 +1,23 @@ package controller import ( - "os" - "reflect" - "strings" 
- "testing" - "time" - "github.com/logzio/logzio_terraform_client/grafana_alerts" - "github.com/prometheus/common/model" + "github.com/logzio/prometheus-alerts-migrator/common" "github.com/prometheus/prometheus/model/rulefmt" "gopkg.in/yaml.v3" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" - "k8s.io/klog" + "k8s.io/klog/v2" + "reflect" + "testing" ) -const annotation = "test-annotation" +const annotation = "prometheus.io/kube-rules" func generateTestController() *Controller { - cfg, err := GetConfig() + cfg, err := common.GetConfig() if err != nil { klog.Fatalf("Error getting Kubernetes config: %s", err) } @@ -30,262 +26,12 @@ func generateTestController() *Controller { if err != nil { klog.Fatalf("Error building kubernetes clientset: %s", err) } - logzioUrl := os.Getenv("LOGZIO_API_URL") - logzioAPIToken := os.Getenv("LOGZIO_API_TOKEN") - rulesDS := os.Getenv("RULES_DS") + ctlConfig := common.NewConfig() kubeInformerFactory := informers.NewSharedInformerFactory(kubeClient, 0) - annotation := "test-annotation" - c := NewController(kubeClient, kubeInformerFactory.Core().V1().ConfigMaps(), &annotation, logzioAPIToken, logzioUrl, rulesDS, "integration-test") + c := NewController(kubeClient, kubeInformerFactory.Core().V1().ConfigMaps(), *ctlConfig) return c } -func TestGenerateRandomString(t *testing.T) { - testCases := []struct { - name string - length int - }{ - {"length 10", 10}, - {"length 0", 0}, - {"negative length", -1}, - {"large length", 1000}, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - result := generateRandomString(tc.length) - - if len(result) != tc.length && tc.length >= 0 { - t.Errorf("Expected string of length %d, got string of length %d", tc.length, len(result)) - } - - for _, char := range result { - if !strings.Contains(letterBytes, string(char)) { - t.Errorf("generateRandomString() produced a string with invalid character: %v", char) - } - } - - if tc.length > 0 { - otherResult := generateRandomString(tc.length) - if result == otherResult { - t.Errorf("generateRandomString() does not seem to produce random strings") - } - } - }) - } -} - -func TestParseDuration(t *testing.T) { - tests := []struct { - input string - expected int64 - err bool - }{ - {"", 0, true}, - {"123", 123 * int64(time.Second), false}, - {"1h", int64(time.Hour), false}, - {"invalid", 0, true}, - } - - for _, test := range tests { - duration, err := parseDuration(test.input) - if test.err && err == nil { - t.Errorf("Expected error for input %s", test.input) - } - if !test.err && duration != test.expected { - t.Errorf("Expected %d, got %d for input %s", test.expected, duration, test.input) - } - } -} - -func TestCreateNameStub(t *testing.T) { - cm := &v1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test-name", - Namespace: "test-namespace", - }, - } - expected := "test-namespace-test-name" - stub := createNameStub(cm) - if stub != expected { - t.Errorf("Expected %s, got %s", expected, stub) - } -} - -func TestIsAlertEqual(t *testing.T) { - // dummy time duration - tenMinutes, _ := model.ParseDuration("10m") - tenMinutesNs := int64(10 * time.Minute) - fiveMinutes, _ := model.ParseDuration("5m") - - // dummy expression nodes - exprNode := yaml.Node{Value: "metric > 0.5"} - exprQuery := []*grafana_alerts.GrafanaAlertQuery{{Model: map[string]interface{}{"expr": "metric > 0.5"}}} - differentExprQuery := []*grafana_alerts.GrafanaAlertQuery{{Model: map[string]interface{}{"expr": "metric > 
0.7"}}} - - testCases := []struct { - name string - rule rulefmt.RuleNode - grafanaRule grafana_alerts.GrafanaAlertRule - expected bool - }{ - { - name: "same rules", - rule: rulefmt.RuleNode{ - Alert: yaml.Node{Value: "SameName"}, - Expr: exprNode, - For: tenMinutes, - Labels: map[string]string{"severity": "critical"}, - Annotations: map[string]string{"summary": "High CPU usage"}, - }, - grafanaRule: grafana_alerts.GrafanaAlertRule{ - Title: "SameName", - Data: exprQuery, - For: tenMinutesNs, - Labels: map[string]string{"severity": "critical"}, - Annotations: map[string]string{"summary": "High CPU usage"}, - }, - expected: true, - }, - { - name: "different titles", - rule: rulefmt.RuleNode{ - Alert: yaml.Node{Value: "AlertName1"}, - Expr: exprNode, - For: tenMinutes, - }, - grafanaRule: grafana_alerts.GrafanaAlertRule{ - Title: "AlertName2", - Data: exprQuery, - For: tenMinutesNs, - }, - expected: false, - }, - { - name: "different labels", - rule: rulefmt.RuleNode{ - Alert: yaml.Node{Value: "SameName"}, - Expr: exprNode, - Labels: map[string]string{"severity": "warning"}, - }, - grafanaRule: grafana_alerts.GrafanaAlertRule{ - Title: "SameName", - Labels: map[string]string{"severity": "critical"}, - Data: exprQuery, - }, - expected: false, - }, - { - name: "different annotations", - rule: rulefmt.RuleNode{ - Alert: yaml.Node{Value: "SameName"}, - Expr: exprNode, - Annotations: map[string]string{"description": "CPU usage is high"}, - }, - grafanaRule: grafana_alerts.GrafanaAlertRule{ - Title: "SameName", - Annotations: map[string]string{"description": "Disk usage is high"}, - Data: exprQuery, - }, - expected: false, - }, - { - name: "different expressions", - rule: rulefmt.RuleNode{ - Alert: yaml.Node{Value: "SameName"}, - Expr: exprNode, - }, - grafanaRule: grafana_alerts.GrafanaAlertRule{ - Title: "SameName", - Data: differentExprQuery, - }, - expected: false, - }, - { - name: "different durations", - rule: rulefmt.RuleNode{ - Alert: yaml.Node{Value: "SameName"}, - Expr: exprNode, - For: fiveMinutes, - }, - grafanaRule: grafana_alerts.GrafanaAlertRule{ - Title: "SameName", - Data: exprQuery, - For: tenMinutesNs, - }, - expected: false, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - if got := isAlertEqual(tc.rule, tc.grafanaRule); got != tc.expected { - t.Errorf("isAlertEqual() for test case %q = %v, want %v", tc.name, got, tc.expected) - } - }) - } -} - -func TestGenerateGrafanaAlert(t *testing.T) { - ctrl := generateTestController() - // Define common rule parts for reuse in test cases - baseRule := rulefmt.RuleNode{ - Alert: yaml.Node{Value: "TestAlert"}, - Expr: yaml.Node{Value: "up == 1"}, - For: model.Duration(5 * time.Minute), - Labels: map[string]string{"severity": "critical"}, - Annotations: map[string]string{"description": "Instance is down"}, - } - baseFolderUid := "folder123" - - // Test cases - testCases := []struct { - name string - rule rulefmt.RuleNode - folderUid string - wantErr bool - }{ - { - name: "valid conversion with annotations and labels", - rule: baseRule, // Already has annotations and labels - folderUid: baseFolderUid, - wantErr: false, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - alertRule, err := ctrl.generateGrafanaAlert(tc.rule, tc.folderUid) - - // Check for unexpected errors or lack thereof - if (err != nil) != tc.wantErr { - t.Errorf("generateGrafanaAlert() error = %v, wantErr %v", err, tc.wantErr) - return // Skip further checks if there's an unexpected error - } - if 
!tc.wantErr { - // Validate Title - if alertRule.Title != tc.rule.Alert.Value { - t.Errorf("generateGrafanaAlert() Title = %v, want %v", alertRule.Title, tc.rule.Alert.Value) - } - - // Validate FolderUID - if alertRule.FolderUID != tc.folderUid { - t.Errorf("generateGrafanaAlert() FolderUID = %v, want %v", alertRule.FolderUID, tc.folderUid) - } - - // Validate Labels - if !reflect.DeepEqual(alertRule.Labels, tc.rule.Labels) { - t.Errorf("generateGrafanaAlert() Labels = %v, want %v", alertRule.Labels, tc.rule.Labels) - } - - // Validate Annotations - if !reflect.DeepEqual(alertRule.Annotations, tc.rule.Annotations) { - t.Errorf("generateGrafanaAlert() Annotations = %v, want %v", alertRule.Annotations, tc.rule.Annotations) - } - } - }) - } -} - func TestExtractValues(t *testing.T) { c := generateTestController() // Define test cases @@ -381,7 +127,7 @@ func TestIsRuleConfigMap(t *testing.T) { configMap: &v1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Annotations: map[string]string{ - "test-annotation": "true", + annotation: "true", }, }, }, @@ -411,7 +157,7 @@ func TestHaveConfigMapsChanged(t *testing.T) { }, }, } - c.resourceVersionMap[createNameStub(&knownConfigMap)] = "12345" + c.resourceVersionMap[common.CreateNameStub(&knownConfigMap)] = "12345" testCases := []struct { name string diff --git a/controller/utils.go b/controller/utils.go deleted file mode 100644 index 681c98d..0000000 --- a/controller/utils.go +++ /dev/null @@ -1,113 +0,0 @@ -package controller - -import ( - "fmt" - "math/rand" - "os" - "path/filepath" - "reflect" - "strconv" - "time" - - "github.com/logzio/logzio_terraform_client/grafana_alerts" - "github.com/prometheus/prometheus/model/rulefmt" - corev1 "k8s.io/api/core/v1" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/clientcmd" - "k8s.io/client-go/util/homedir" -) - -// borrowed from here https://stackoverflow.com/questions/22892120/how-to-generate-a-random-string-of-a-fixed-length-in-go -func generateRandomString(n int) string { - if n <= 0 { - return "" // Return an empty string for non-positive lengths - } - b := make([]byte, n) - src := rand.NewSource(time.Now().UnixNano()) - for i, cache, remain := n-1, src.Int63(), letterIdxMax; i >= 0; { - if remain == 0 { - cache, remain = src.Int63(), letterIdxMax - } - if idx := int(cache & letterIdxMask); idx < len(letterBytes) { - b[i] = letterBytes[idx] - i-- - } - cache >>= letterIdxBits - remain-- - } - - return string(b) -} - -// parseDuration turns a duration string (example: 5m, 1h) into an int64 value -func parseDuration(durationStr string) (int64, error) { - // Check if the string is empty - if durationStr == "" { - return 0, fmt.Errorf("duration string is empty") - } - - // Handle the special case where the duration string is just a number (assumed to be seconds) - if _, err := strconv.Atoi(durationStr); err == nil { - seconds, _ := strconv.ParseInt(durationStr, 10, 64) - return seconds * int64(time.Second), nil - } - - // Parse the duration string - duration, err := time.ParseDuration(durationStr) - if err != nil { - return 0, err - } - - // Convert the time.Duration value to an int64 - return int64(duration), nil -} - -func createNameStub(cm *corev1.ConfigMap) string { - name := cm.GetObjectMeta().GetName() - namespace := cm.GetObjectMeta().GetNamespace() - - return fmt.Sprintf("%s-%s", namespace, name) -} - -// isAlertEqual compares two AlertRule objects for equality. -// You should expand this function to compare all relevant fields of AlertRule. 
-func isAlertEqual(rule rulefmt.RuleNode, grafanaRule grafana_alerts.GrafanaAlertRule) bool { - // Start with name comparison; if these don't match, they're definitely not equal. - if rule.Alert.Value != grafanaRule.Title { - return false - } - if !reflect.DeepEqual(rule.Labels, grafanaRule.Labels) { - return false - } - if !reflect.DeepEqual(rule.Annotations, grafanaRule.Annotations) { - return false - } - forAtt, _ := parseDuration(rule.For.String()) - if forAtt != grafanaRule.For { - return false - } - if rule.Expr.Value != grafanaRule.Data[0].Model.(map[string]interface{})["expr"] { - return false - } - return true -} - -// GetConfig returns a Kubernetes config -func GetConfig() (*rest.Config, error) { - var config *rest.Config - - kubeconfig := filepath.Join(homedir.HomeDir(), ".kube", "config") - if _, err := os.Stat(kubeconfig); err == nil { - config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) - if err != nil { - return nil, err - } - } else { - config, err = rest.InClusterConfig() - if err != nil { - return nil, err - } - } - - return config, nil -} diff --git a/go.mod b/go.mod index dfc7dfb..dc1a17a 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,8 @@ module github.com/logzio/prometheus-alerts-migrator go 1.19 require ( - github.com/logzio/logzio_terraform_client v1.19.0 + github.com/logzio/logzio_terraform_client v1.20.0 + github.com/prometheus/alertmanager v0.26.0 github.com/prometheus/common v0.44.0 github.com/prometheus/prometheus v0.47.2 github.com/stretchr/testify v1.8.4 @@ -11,7 +12,7 @@ require ( k8s.io/api v0.28.3 k8s.io/apimachinery v0.28.3 k8s.io/client-go v0.28.3 - k8s.io/klog v1.0.0 + k8s.io/klog/v2 v2.110.1 ) require ( @@ -21,7 +22,7 @@ require ( github.com/AzureAD/microsoft-authentication-library-for-go v1.0.0 // indirect github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect github.com/avast/retry-go v3.0.0+incompatible // indirect - github.com/aws/aws-sdk-go v1.44.302 // indirect + github.com/aws/aws-sdk-go v1.44.317 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect @@ -30,7 +31,7 @@ require ( github.com/emicklei/go-restful/v3 v3.10.2 // indirect github.com/go-kit/log v0.2.1 // indirect github.com/go-logfmt/logfmt v0.6.0 // indirect - github.com/go-logr/logr v1.2.4 // indirect + github.com/go-logr/logr v1.3.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.20.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect @@ -85,7 +86,6 @@ require ( google.golang.org/protobuf v1.31.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - k8s.io/klog/v2 v2.100.1 // indirect k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 // indirect k8s.io/utils v0.0.0-20230711102312-30195339c3c7 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect diff --git a/go.sum b/go.sum index 50fac99..404fc03 100644 --- a/go.sum +++ b/go.sum @@ -62,8 +62,8 @@ github.com/armon/go-metrics v0.4.1 h1:hR91U9KYmb6bLBYLQjyM+3j+rcd/UhE+G78SFnF8gJ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.38.35/go.mod h1:hcU610XS61/+aQV88ixoOzUoG7v3b31pl2zKMmprdro= -github.com/aws/aws-sdk-go v1.44.302 h1:ST3ko6GrJKn3Xi+nAvxjG3uk/V1pW8KC52WLeIxqqNk= -github.com/aws/aws-sdk-go 
v1.44.302/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.44.317 h1:+8XWrLmGMwPPXSRSLPzhgcGnzJ2mYkgkrcB9C/GnSOU= +github.com/aws/aws-sdk-go v1.44.317/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -116,11 +116,9 @@ github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= -github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= -github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= -github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= +github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= @@ -264,8 +262,8 @@ github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/linode/linodego v1.19.0 h1:n4WJrcr9+30e9JGZ6DI0nZbm5SdAj1kSwvvt/998YUw= -github.com/logzio/logzio_terraform_client v1.19.0 h1:PV6q/ezMtzljjVDq0rEWiG7M5CxCdu7csh6ndOztZSI= -github.com/logzio/logzio_terraform_client v1.19.0/go.mod h1:hEQixCq9RPpvyzWerxIWKf0SYgangyWpPeogN7nytC0= +github.com/logzio/logzio_terraform_client v1.20.0 h1:0eynfD4nDB5H7pNwsodWeff6fh4Ccd7Cj8DGaWwRnyU= +github.com/logzio/logzio_terraform_client v1.20.0/go.mod h1:hEQixCq9RPpvyzWerxIWKf0SYgangyWpPeogN7nytC0= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= @@ -304,6 +302,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/alertmanager v0.26.0 h1:uOMJWfIwJguc3NaM3appWNbbrh6G/OjvaHMk22aBBYc= +github.com/prometheus/alertmanager v0.26.0/go.mod h1:rVcnARltVjavgVaNnmevxK7kOn7IZavyf0KNgHkbEpU= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod 
h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= @@ -703,10 +703,8 @@ k8s.io/apimachinery v0.28.3 h1:B1wYx8txOaCQG0HmYF6nbpU8dg6HvA06x5tEffvOe7A= k8s.io/apimachinery v0.28.3/go.mod h1:uQTKmIqs+rAYaq+DFaoD2X7pcjLOqbQX2AOiO0nIpb8= k8s.io/client-go v0.28.3 h1:2OqNb72ZuTZPKCl+4gTKvqao0AMOl9f3o2ijbAj3LI4= k8s.io/client-go v0.28.3/go.mod h1:LTykbBp9gsA7SwqirlCXBWtK0guzfhpoW4qSm7i9dxo= -k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= -k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= -k8s.io/klog/v2 v2.100.1 h1:7WCHKK6K8fNhTqfBhISHQ97KrnJNFZMcQvKp7gP/tmg= -k8s.io/klog/v2 v2.100.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= +k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= +k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 h1:LyMgNKD2P8Wn1iAwQU5OhxCKlKJy0sHc+PcDwFB24dQ= k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9/go.mod h1:wZK2AVp1uHCp4VamDVgBP2COHZjqD1T68Rf0CM3YjSM= k8s.io/utils v0.0.0-20230711102312-30195339c3c7 h1:ZgnF1KZsYxWIifwSNZFZgNtWE89WI5yiP5WwlfDoIyc= diff --git a/logzio_alerts_client/logzio_alerts_client.go b/logzio_alerts_client/logzio_alerts_client.go new file mode 100644 index 0000000..ba2059c --- /dev/null +++ b/logzio_alerts_client/logzio_alerts_client.go @@ -0,0 +1,519 @@ +package logzio_alerts_client + +import ( + "encoding/json" + "fmt" + "github.com/logzio/logzio_terraform_client/grafana_alerts" + "github.com/logzio/logzio_terraform_client/grafana_contact_points" + "github.com/logzio/logzio_terraform_client/grafana_datasources" + "github.com/logzio/logzio_terraform_client/grafana_folders" + "github.com/logzio/logzio_terraform_client/grafana_notification_policies" + "github.com/logzio/prometheus-alerts-migrator/common" + alert_manager_config "github.com/prometheus/alertmanager/config" + "github.com/prometheus/prometheus/model/rulefmt" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/klog/v2" + "regexp" + "strings" +) + +const ( + refIdA = "A" + refIdB = "B" + expressionString = "__expr__" + queryType = "query" + alertFolder = "prometheus-alerts" + randomStringLength = 5 + grafanaDefaultReceiver = "default-email" +) + +// ReduceQueryModel represents a reduce query for time series data +type ReduceQueryModel struct { + DataSource map[string]string `json:"datasource"` + Expression string `json:"expression"` + Hide bool `json:"hide"` + RefId string `json:"refId"` + Reducer string `json:"reducer"` + Type string `json:"type"` +} + +// ToJSON marshals the Query model into a JSON byte slice +func (r ReduceQueryModel) ToJSON() (json.RawMessage, error) { + marshaled, err := json.Marshal(r) + if err != nil { + return nil, err + } + return marshaled, nil +} + +// PrometheusQueryModel represents a Prometheus query. 
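+// The model is marshaled to JSON and used as the data-source query (refId A) of the generated Grafana alert rule.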
+type PrometheusQueryModel struct { + Expr string `json:"expr"` + Hide bool `json:"hide"` + RefId string `json:"refId"` +} + +// ToJSON marshals the Query into a JSON byte slice +func (p PrometheusQueryModel) ToJSON() (json.RawMessage, error) { + marshaled, err := json.Marshal(p) + if err != nil { + return nil, err + } + return marshaled, nil +} + +type LogzioGrafanaAlertsClient struct { + AlertManagerGlobalConfig *alert_manager_config.GlobalConfig + logzioAlertClient *grafana_alerts.GrafanaAlertClient + logzioFolderClient *grafana_folders.GrafanaFolderClient + logzioDataSourceClient *grafana_datasources.GrafanaDatasourceClient + logzioContactPointClient *grafana_contact_points.GrafanaContactPointClient + logzioNotificationPolicyClient *grafana_notification_policies.GrafanaNotificationPolicyClient + rulesDataSource string + envId string + ignoreSlackText bool + ignoreSlackTitle bool +} + +func NewLogzioGrafanaAlertsClient(logzioApiToken string, logzioApiUrl string, rulesDs string, envId string, ignoreSlackText bool, ignoreSlackTitle bool) *LogzioGrafanaAlertsClient { + logzioAlertClient, err := grafana_alerts.New(logzioApiToken, logzioApiUrl) + if err != nil { + klog.Errorf("Failed to create logzio alert client: %v", err) + return nil + } + logzioFolderClient, err := grafana_folders.New(logzioApiToken, logzioApiUrl) + if err != nil { + klog.Errorf("Failed to create logzio folder client: %v", err) + return nil + } + logzioDataSourceClient, err := grafana_datasources.New(logzioApiToken, logzioApiUrl) + if err != nil { + klog.Errorf("Failed to create logzio datasource client: %v", err) + return nil + } + logzioContactPointClient, err := grafana_contact_points.New(logzioApiToken, logzioApiUrl) + if err != nil { + klog.Errorf("Failed to create logzio contact point client: %v", err) + return nil + } + logzioNotificationPolicyClient, err := grafana_notification_policies.New(logzioApiToken, logzioApiUrl) + if err != nil { + klog.Errorf("Failed to create logzio notification policy client: %v", err) + return nil + } + // get datasource uid and validate value and type + rulesDsData, err := logzioDataSourceClient.GetForAccount(rulesDs) + if err != nil || rulesDsData.Uid == "" { + klog.Errorf("Failed to get datasource uid: %v", err) + return nil + } + if rulesDsData.Type != "prometheus" { + klog.Errorf("Datasource type is not prometheus: %v", err) + return nil + } + return &LogzioGrafanaAlertsClient{ + logzioAlertClient: logzioAlertClient, + logzioFolderClient: logzioFolderClient, + logzioDataSourceClient: logzioDataSourceClient, + logzioContactPointClient: logzioContactPointClient, + logzioNotificationPolicyClient: logzioNotificationPolicyClient, + rulesDataSource: rulesDsData.Uid, + envId: envId, + ignoreSlackText: ignoreSlackText, + ignoreSlackTitle: ignoreSlackTitle, + } +} + +func (l *LogzioGrafanaAlertsClient) ResetNotificationPolicyTree() error { + defaultGrafanaNotificationPolicy := grafana_notification_policies.GrafanaNotificationPolicyTree{ + Receiver: grafanaDefaultReceiver, + Routes: []grafana_notification_policies.GrafanaNotificationPolicy{}, + } + err := l.logzioNotificationPolicyClient.SetupGrafanaNotificationPolicyTree(defaultGrafanaNotificationPolicy) + if err != nil { + return err + } + return nil +} + +// SetNotificationPolicyTreeFromRouteTree converts route tree to grafana notification policy tree and writes it to logz.io +func (l *LogzioGrafanaAlertsClient) SetNotificationPolicyTreeFromRouteTree(routeTree *alert_manager_config.Route) { + // getting logzio contact points to ensure 
it exists for the notification policy tree + logzioContactPoints, err := l.GetLogzioManagedGrafanaContactPoints() + if err != nil { + klog.Errorf("Failed to get logz.io managed contact points: %v", err) + return + } + // create contact points map for efficient lookup + existingContactPoints := make(map[string]bool) + for _, contactPoint := range logzioContactPoints { + existingContactPoints[contactPoint.Name] = true + } + notificationPolicyTree := l.convertRouteTreeToNotificationPolicyTree(routeTree, existingContactPoints) + err = l.logzioNotificationPolicyClient.SetupGrafanaNotificationPolicyTree(notificationPolicyTree) + if err != nil { + klog.Errorf("Failed to create notification policy tree: %v", err) + } +} + +func (l *LogzioGrafanaAlertsClient) convertRouteTreeToNotificationPolicyTree(routeTree *alert_manager_config.Route, existingContactPoints map[string]bool) (notificationPolicyTree grafana_notification_policies.GrafanaNotificationPolicyTree) { + // checking for empty values to avoid nil pointer errors + if routeTree.GroupByStr != nil { + notificationPolicyTree.GroupBy = routeTree.GroupByStr + } + if routeTree.GroupInterval != nil { + notificationPolicyTree.GroupInterval = routeTree.GroupInterval.String() + } + if routeTree.GroupWait != nil { + notificationPolicyTree.GroupWait = routeTree.GroupWait.String() + } + if routeTree.RepeatInterval != nil { + notificationPolicyTree.RepeatInterval = routeTree.RepeatInterval.String() + } + notificationPolicyTree.Receiver = routeTree.Receiver + for _, childRoute := range routeTree.Routes { + // check if the receiver of the child route exists in `existingContactPoints` + if _, ok := existingContactPoints[childRoute.Receiver]; ok { + notificationPolicy := l.generateGrafanaNotificationPolicy(childRoute) + notificationPolicyTree.Routes = append(notificationPolicyTree.Routes, notificationPolicy) + } + } + return notificationPolicyTree +} + +// generateGrafanaNotificationPolicy generates a GrafanaNotificationPolicy from a alert_manager_config.Route +func (l *LogzioGrafanaAlertsClient) generateGrafanaNotificationPolicy(route *alert_manager_config.Route) (notificationPolicy grafana_notification_policies.GrafanaNotificationPolicy) { + // checking for empty values to avoid nil pointer errors + if route.GroupInterval != nil { + notificationPolicy.GroupInterval = route.GroupInterval.String() + } + if route.GroupWait != nil { + notificationPolicy.GroupWait = route.GroupWait.String() + } + if route.RepeatInterval != nil { + notificationPolicy.RepeatInterval = route.RepeatInterval.String() + } + if route.GroupByStr != nil { + notificationPolicy.GroupBy = route.GroupByStr + } + notificationPolicy.Receiver = route.Receiver + routeMatchersYaml, err := route.Matchers.MarshalYAML() + if err != nil { + utilruntime.HandleError(err) + return grafana_notification_policies.GrafanaNotificationPolicy{} + } + // converting the route matchers to the Grafana format + routeMatchersList := routeMatchersYaml.([]string) + grafanaObjMatchers := grafana_notification_policies.MatchersObj{} + for _, routeMatcher := range routeMatchersList { + // we split the route matcher by the regex (=|~|=|!=) to convert it to the Grafana format + regex := regexp.MustCompile(`(=|~=?|!=)`) + parts := regex.FindStringSubmatchIndex(routeMatcher) + if len(parts) > 0 { + // Extracting the key, operator, and value + key := routeMatcher[:parts[0]] + operator := routeMatcher[parts[0]:parts[1]] + value := routeMatcher[parts[1]:] + grafanaObjMatchers = append(grafanaObjMatchers, 
grafana_notification_policies.MatcherObj{key, operator, value}) + } + } + // handling `match` operators although it's deprecated to support users with old prometheus versions + for key, value := range route.Match { + grafanaObjMatchers = append(grafanaObjMatchers, grafana_notification_policies.MatcherObj{key, "=", value}) + } + notificationPolicy.ObjectMatchers = grafanaObjMatchers + // repeat the process for nested policies + for _, childRoute := range route.Routes { + childNotificationPolicy := l.generateGrafanaNotificationPolicy(childRoute) + notificationPolicy.Routes = append(notificationPolicy.Routes, childNotificationPolicy) + } + return notificationPolicy +} + +// WriteContactPoints writes the contact points to logz.io +func (l *LogzioGrafanaAlertsClient) WriteContactPoints(contactPointsToWrite []alert_manager_config.Receiver) { + for _, contactPoint := range contactPointsToWrite { + contactPointsList := l.generateGrafanaContactPoint(contactPoint) + for _, cp := range contactPointsList { + _, err := l.logzioContactPointClient.CreateGrafanaContactPoint(cp) + if err != nil { + klog.Warningf("Failed to create contact point: %v", err) + } + } + } +} + +// DeleteContactPoints deletes the contact points from logz.io +func (l *LogzioGrafanaAlertsClient) DeleteContactPoints(contactPointsToDelete []grafana_contact_points.GrafanaContactPoint) { + for _, contactPoint := range contactPointsToDelete { + err := l.logzioContactPointClient.DeleteGrafanaContactPoint(contactPoint.Uid) + if err != nil { + klog.Warningf("Failed to delete contact point: %v", err) + } + } +} + +// UpdateContactPoints updates the contact points in logz.io +func (l *LogzioGrafanaAlertsClient) UpdateContactPoints(contactPointsToUpdate []alert_manager_config.Receiver, contactPointsMap []grafana_contact_points.GrafanaContactPoint) { + for _, contactPoint := range contactPointsToUpdate { + contactPointsList := l.generateGrafanaContactPoint(contactPoint) + for _, cp := range contactPointsList { + for _, logzioContactPoint := range contactPointsMap { + if logzioContactPoint.Name == cp.Name { + cp.Uid = logzioContactPoint.Uid + err := l.logzioContactPointClient.UpdateContactPoint(cp) + if err != nil { + klog.Warningf("Failed to update contact point: %v", err) + } + } + } + } + } +} + +// generateGrafanaContactPoint generates a GrafanaContactPoint from a alert_manager_config.Receiver +func (l *LogzioGrafanaAlertsClient) generateGrafanaContactPoint(receiver alert_manager_config.Receiver) (contactPointsList []grafana_contact_points.GrafanaContactPoint) { + // check for email type configs + for _, emailConfig := range receiver.EmailConfigs { + contactPoint := grafana_contact_points.GrafanaContactPoint{ + Name: receiver.Name, + Type: common.TypeEmail, + Uid: common.GenerateRandomString(9), + DisableResolveMessage: false, + Settings: map[string]interface{}{ + "addresses": emailConfig.To, + "message": emailConfig.HTML, + "singleEmail": true, + }, + } + contactPointsList = append(contactPointsList, contactPoint) + } + // check for slack type configs + for _, slackConfig := range receiver.SlackConfigs { + var url string + if slackConfig.APIURL.String() != "" { + url = slackConfig.APIURL.String() + } else { + url = l.AlertManagerGlobalConfig.SlackAPIURL.String() + } + contactPoint := grafana_contact_points.GrafanaContactPoint{ + Name: receiver.Name, + Type: common.TypeSlack, + Uid: common.GenerateRandomString(9), + DisableResolveMessage: false, + Settings: map[string]interface{}{ + "url": url, + "recipient": slackConfig.Channel, + 
"username": slackConfig.Username, + }, + } + // Adding title and text fields based on program flags + if !l.ignoreSlackTitle { + contactPoint.Settings["title"] = slackConfig.Title + } + if !l.ignoreSlackText { + contactPoint.Settings["text"] = slackConfig.Text + } + contactPointsList = append(contactPointsList, contactPoint) + } + // check for pagerduty type configs + for _, pagerdutyConfig := range receiver.PagerdutyConfigs { + contactPoint := grafana_contact_points.GrafanaContactPoint{ + Name: receiver.Name, + Type: common.TypePagerDuty, + Uid: common.GenerateRandomString(9), + DisableResolveMessage: false, + Settings: map[string]interface{}{ + "integrationKey": pagerdutyConfig.ServiceKey, + "description": pagerdutyConfig.Description, + "client": pagerdutyConfig.Client, + "clientUrl": pagerdutyConfig.ClientURL, + }, + } + contactPointsList = append(contactPointsList, contactPoint) + } + return contactPointsList +} + +// DeleteRules deletes the rules from logz.io +func (l *LogzioGrafanaAlertsClient) DeleteRules(rulesToDelete []grafana_alerts.GrafanaAlertRule, folderUid string) { + for _, rule := range rulesToDelete { + err := l.logzioAlertClient.DeleteGrafanaAlertRule(rule.Uid) + if err != nil { + klog.Warningf("Error deleting rule: %s - %s", rule.Title, err.Error()) + } + } +} + +// UpdateRules updates the rules in logz.io +func (l *LogzioGrafanaAlertsClient) UpdateRules(rulesToUpdate []rulefmt.RuleNode, logzioRulesMap map[string]grafana_alerts.GrafanaAlertRule, folderUid string) { + for _, rule := range rulesToUpdate { + // Retrieve the existing GrafanaAlertRule to get the Uid. + existingRule := logzioRulesMap[rule.Alert.Value] + alert, err := l.generateGrafanaAlert(rule, folderUid) + if err != nil { + klog.Warning(err) + continue // Skip this rule and continue with the next + } + // Set the Uid from the existing rule. + alert.Uid = existingRule.Uid + err = l.logzioAlertClient.UpdateGrafanaAlertRule(alert) + if err != nil { + klog.Warningf("Error updating rule: %s - %s", alert.Title, err.Error()) + } + } +} + +// WriteRules writes the rules to logz.io +func (l *LogzioGrafanaAlertsClient) WriteRules(rulesToWrite []rulefmt.RuleNode, folderUid string) { + for _, rule := range rulesToWrite { + alert, err := l.generateGrafanaAlert(rule, folderUid) + if err != nil { + klog.Warning(err) + } + _, err = l.logzioAlertClient.CreateGrafanaAlertRule(alert) + if err != nil { + klog.Warning("Error writing rule:", alert.Title, err.Error()) + } + } +} + +// generateGrafanaAlert generates a GrafanaAlertRule from a Prometheus rule +func (l *LogzioGrafanaAlertsClient) generateGrafanaAlert(rule rulefmt.RuleNode, folderUid string) (grafana_alerts.GrafanaAlertRule, error) { + // validate the rule + validationErrs := rule.Validate() + if len(validationErrs) > 0 { + return grafana_alerts.GrafanaAlertRule{}, fmt.Errorf("invalid rule: %v", validationErrs) + } + // Create promql query to return time series data for the expression. + promqlQuery := PrometheusQueryModel{ + Expr: rule.Expr.Value, + Hide: false, + RefId: refIdA, + } + // Use the ToJSON method to marshal the Query struct. + promqlModel, err := promqlQuery.ToJSON() + if err != nil { + return grafana_alerts.GrafanaAlertRule{}, err + } + queryA := grafana_alerts.GrafanaAlertQuery{ + DatasourceUid: l.rulesDataSource, + Model: promqlModel, + RefId: refIdA, + QueryType: queryType, + RelativeTimeRange: grafana_alerts.RelativeTimeRangeObj{ + From: 300, + To: 0, + }, + } + // Create reduce query to return the reduced last value of the time series data. 
+ reduceQuery := ReduceQueryModel{ + DataSource: map[string]string{ + "type": expressionString, + "uid": expressionString, + }, + Expression: refIdA, + Hide: false, + RefId: refIdB, + Reducer: "last", + Type: "reduce", + } + reduceModel, err := reduceQuery.ToJSON() + if err != nil { + return grafana_alerts.GrafanaAlertRule{}, err + } + queryB := grafana_alerts.GrafanaAlertQuery{ + DatasourceUid: expressionString, + Model: reduceModel, + RefId: refIdB, + QueryType: "", + RelativeTimeRange: grafana_alerts.RelativeTimeRangeObj{ + From: 300, + To: 0, + }, + } + duration, err := common.ParseDuration(rule.For.String()) + if err != nil { + return grafana_alerts.GrafanaAlertRule{}, err + } + + // Create the GrafanaAlertRule, we are alerting on the reduced last value of the time series data (query B). + grafanaAlert := grafana_alerts.GrafanaAlertRule{ + Annotations: rule.Annotations, + Condition: refIdB, + Data: []*grafana_alerts.GrafanaAlertQuery{&queryA, &queryB}, + FolderUID: folderUid, + NoDataState: grafana_alerts.NoDataOk, + ExecErrState: grafana_alerts.ErrOK, + Labels: rule.Labels, + OrgID: 1, + RuleGroup: rule.Alert.Value, + Title: rule.Alert.Value, + For: duration, + } + return grafanaAlert, nil +} + +func (l *LogzioGrafanaAlertsClient) GetLogzioManagedGrafanaContactPoints() ([]grafana_contact_points.GrafanaContactPoint, error) { + contactPoints, err := l.logzioContactPointClient.GetAllGrafanaContactPoints() + if err != nil { + return nil, err + } + var managedContactPoints []grafana_contact_points.GrafanaContactPoint + for _, contactPoint := range contactPoints { + // check if the contact point name contains the env id to determine if it is a managed contact point + if strings.Contains(contactPoint.Name, l.envId) { + managedContactPoints = append(managedContactPoints, contactPoint) + } + } + return managedContactPoints, nil +} + +func (l *LogzioGrafanaAlertsClient) GetLogzioGrafanaNotificationPolicies() (grafana_notification_policies.GrafanaNotificationPolicyTree, error) { + notificationPolicies, err := l.logzioNotificationPolicyClient.GetGrafanaNotificationPolicyTree() + if err != nil { + return grafana_notification_policies.GrafanaNotificationPolicyTree{}, err + } + return notificationPolicies, nil + +} + +// GetLogzioGrafanaAlerts builds a list of rules from all logz.io +func (l *LogzioGrafanaAlertsClient) GetLogzioGrafanaAlerts(folderUid string) ([]grafana_alerts.GrafanaAlertRule, error) { + alertRules, ListLogzioRulesErr := l.logzioAlertClient.ListGrafanaAlertRules() + if ListLogzioRulesErr != nil { + return nil, ListLogzioRulesErr + } + // find all alerts inside prometheus alerts folder + var alertsInFolder []grafana_alerts.GrafanaAlertRule + for _, rule := range alertRules { + if rule.FolderUID == folderUid { + alertsInFolder = append(alertsInFolder, rule) + } + } + return alertsInFolder, nil +} + +// FindOrCreatePrometheusAlertsFolder tries to find the prometheus alerts folder in logz.io, if it does not exist it creates it. 
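+// The folder title combines the environment id with the alertFolder constant ("<envId>-prometheus-alerts"), so each environment keeps its rules in a dedicated folder.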
+func (l *LogzioGrafanaAlertsClient) FindOrCreatePrometheusAlertsFolder() (string, error) { + folders, err := l.logzioFolderClient.ListGrafanaFolders() + if err != nil { + return "", err + } + envFolderTitle := fmt.Sprintf("%s-%s", l.envId, alertFolder) + // try to find the prometheus alerts folder + for _, folder := range folders { + if folder.Title == envFolderTitle { + return folder.Uid, nil + } + } + // if not found, create the prometheus alerts folder + grafanaFolder, err := l.logzioFolderClient.CreateGrafanaFolder(grafana_folders.CreateUpdateFolder{ + Uid: fmt.Sprintf("%s-%s", envFolderTitle, common.GenerateRandomString(randomStringLength)), + Title: envFolderTitle, + }) + if err != nil { + return "", err + } + return grafanaFolder.Uid, nil +} diff --git a/logzio_alerts_client/logzio_alerts_client_test.go b/logzio_alerts_client/logzio_alerts_client_test.go new file mode 100644 index 0000000..ad011b4 --- /dev/null +++ b/logzio_alerts_client/logzio_alerts_client_test.go @@ -0,0 +1,164 @@ +package logzio_alerts_client + +import ( + "github.com/logzio/prometheus-alerts-migrator/common" + "github.com/prometheus/alertmanager/config" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/rulefmt" + "github.com/stretchr/testify/assert" + "gopkg.in/yaml.v3" + "net/url" + "reflect" + "testing" + "time" +) + +func generateTestLogzioGrafanaAlertsClient() *LogzioGrafanaAlertsClient { + ctlConfig := common.NewConfig() + logzioGrafanaAlertsClient := NewLogzioGrafanaAlertsClient(ctlConfig.LogzioAPIToken, ctlConfig.LogzioAPIURL, ctlConfig.RulesDS, ctlConfig.EnvID, ctlConfig.IgnoreSlackTitle, ctlConfig.IgnoreSlackTitle) + return logzioGrafanaAlertsClient + +} + +func TestGenerateGrafanaAlert(t *testing.T) { + cl := generateTestLogzioGrafanaAlertsClient() + // Define common rule parts for reuse in test cases + baseRule := rulefmt.RuleNode{ + Alert: yaml.Node{Value: "TestAlert"}, + Expr: yaml.Node{Value: "up == 1"}, + For: model.Duration(5 * time.Minute), + Labels: map[string]string{"severity": "critical"}, + Annotations: map[string]string{"description": "Instance is down"}, + } + invalidRule := rulefmt.RuleNode{ + Alert: yaml.Node{Value: "TestAlertInvalid"}, + Expr: yaml.Node{Value: "up as== 1sadsa"}, + For: model.Duration(5 * time.Minute), + Labels: map[string]string{"severity": "critical"}, + Annotations: map[string]string{"description": "Instance is down"}, + } + baseFolderUid := "folder123" + + // Test cases + testCases := []struct { + name string + rule rulefmt.RuleNode + folderUid string + wantErr bool + }{ + { + name: "valid conversion with annotations and labels", + rule: baseRule, // Already has annotations and labels + folderUid: baseFolderUid, + wantErr: false, + }, + { + name: "invalid rule", + rule: invalidRule, + folderUid: baseFolderUid, + wantErr: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + alertRule, err := cl.generateGrafanaAlert(tc.rule, tc.folderUid) + + // Check for unexpected errors or lack thereof + if (err != nil) != tc.wantErr { + t.Errorf("generateGrafanaAlert() error = %v, wantErr %v", err, tc.wantErr) + return // Skip further checks if there's an unexpected error + } + if !tc.wantErr { + // Validate Title + if alertRule.Title != tc.rule.Alert.Value { + t.Errorf("generateGrafanaAlert() Title = %v, want %v", alertRule.Title, tc.rule.Alert.Value) + } + + // Validate FolderUID + if alertRule.FolderUID != tc.folderUid { + t.Errorf("generateGrafanaAlert() FolderUID = %v, want %v", alertRule.FolderUID, 
tc.folderUid) + } + + // Validate Labels + if !reflect.DeepEqual(alertRule.Labels, tc.rule.Labels) { + t.Errorf("generateGrafanaAlert() Labels = %v, want %v", alertRule.Labels, tc.rule.Labels) + } + + // Validate Annotations + if !reflect.DeepEqual(alertRule.Annotations, tc.rule.Annotations) { + t.Errorf("generateGrafanaAlert() Annotations = %v, want %v", alertRule.Annotations, tc.rule.Annotations) + } + } + }) + } +} + +func TestGenerateGrafanaContactPoint(t *testing.T) { + client := generateTestLogzioGrafanaAlertsClient() + testCases := []struct { + name string + receiver config.Receiver + expectedLength int + expectedType string + }{ + { + name: "Email Configuration", + receiver: config.Receiver{ + EmailConfigs: []*config.EmailConfig{ + { + To: "test@example.com", + }, + { + To: "test2@example.com", + }, + }, + }, + expectedLength: 2, + expectedType: common.TypeEmail, + }, + { + name: "Slack Configuration", + receiver: config.Receiver{ + SlackConfigs: []*config.SlackConfig{ + { + Channel: "#test", + APIURL: &config.SecretURL{ + URL: &url.URL{ + Scheme: "https", + Host: "api.slack.com", + Path: "/api/chat.postMessage", + }, + }, + }, + }, + }, + expectedLength: 1, + expectedType: common.TypeSlack, + }, + { + name: "Pagerduty Configuration", + receiver: config.Receiver{ + PagerdutyConfigs: []*config.PagerdutyConfig{ + { + ServiceKey: "test", + }, + }, + }, + expectedLength: 1, + expectedType: common.TypePagerDuty, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + contactPoints := client.generateGrafanaContactPoint(tc.receiver) + assert.Len(t, contactPoints, tc.expectedLength, "Incorrect number of contact points generated") + // Assert the type of contact point + if tc.expectedLength > 0 { + assert.Equal(t, tc.expectedType, contactPoints[0].Type, "Incorrect type of contact point") + // Add more assertions to check other fields like settings, name, etc. + } + }) + } +} diff --git a/main.go b/main.go index bee5ea4..d842464 100644 --- a/main.go +++ b/main.go @@ -1,103 +1,38 @@ package main import ( - "flag" - "os" - "strconv" + "github.com/logzio/prometheus-alerts-migrator/common" "time" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" - "k8s.io/klog" + "k8s.io/klog/v2" "github.com/logzio/prometheus-alerts-migrator/controller" "github.com/logzio/prometheus-alerts-migrator/pkg/signals" ) -// Config holds all the configuration needed for the application to run. -type Config struct { - Annotation string - LogzioAPIToken string - LogzioAPIURL string - RulesDS string - EnvID string - WorkerCount int -} - -// NewConfig creates a Config struct, populating it with values from command-line flags and environment variables. 
-func NewConfig() *Config { - // Define flags - helpFlag := flag.Bool("help", false, "Display help") - configmapAnnotation := flag.String("annotation", "prometheus.io/kube-rules", "Annotation that states that this configmap contains prometheus rules") - logzioAPITokenFlag := flag.String("logzio-api-token", "", "LOGZIO API token") - logzioAPIURLFlag := flag.String("logzio-api-url", "https://api.logz.io", "LOGZIO API URL") - rulesDSFlag := flag.String("rules-ds", "", "name of the data source for the alert rules") - envIDFlag := flag.String("env-id", "my-env", "environment identifier, usually cluster name") - workerCountFlag := flag.Int("workers", 2, "The number of workers to process the alerts") - - // Parse the flags - flag.Parse() - - if *helpFlag { - flag.PrintDefaults() - os.Exit(0) - } - - // Environment variables have lower precedence than flags - logzioAPIURL := getEnvWithFallback("LOGZIO_API_URL", *logzioAPIURLFlag) - envID := getEnvWithFallback("ENV_ID", *envIDFlag) - // api token is mandatory - logzioAPIToken := getEnvWithFallback("LOGZIO_API_TOKEN", *logzioAPITokenFlag) - if logzioAPIToken == "" { - klog.Fatal("No logzio api token provided") - } - rulesDS := getEnvWithFallback("RULES_DS", *rulesDSFlag) - if rulesDS == "" { - klog.Fatal("No rules data source provided") - } - // Annotation must be provided either by flag or environment variable - annotation := getEnvWithFallback("CONFIGMAP_ANNOTATION", *configmapAnnotation) - if annotation == "" { - klog.Fatal("No ConfigMap annotation provided") - } - workerCountStr := getEnvWithFallback("WORKERS_COOUNT", strconv.Itoa(*workerCountFlag)) - workerCount, err := strconv.Atoi(workerCountStr) - if err != nil { - workerCount = 2 // default value - } - - return &Config{ - Annotation: annotation, - LogzioAPIToken: logzioAPIToken, - LogzioAPIURL: logzioAPIURL, - RulesDS: rulesDS, - EnvID: envID, - WorkerCount: workerCount, - } -} - -// getEnvWithFallback tries to get the value from an environment variable and falls back to the given default value if not found. 
-func getEnvWithFallback(envName, defaultValue string) string { - if value, exists := os.LookupEnv(envName); exists { - return value - } - return defaultValue -} - func main() { - config := NewConfig() + config := common.NewConfig() klog.Info("Rule Updater starting.\n") - klog.Infof("ConfigMap annotation: %s\n", config.Annotation) + klog.Infof("Rules configMap annotation: %s\n", config.RulesAnnotation) + klog.Infof("AlertManager configMap annotation: %s\n", config.AlertManagerAnnotation) klog.Infof("Environment ID: %s\n", config.EnvID) klog.Infof("Logzio api url: %s\n", config.LogzioAPIURL) klog.Infof("Logzio rules data source: %s\n", config.RulesDS) klog.Infof("Number of workers: %d\n", config.WorkerCount) + if config.IgnoreSlackText == true { + klog.Info("Slack text field will be ignored") + } + if config.IgnoreSlackTitle == true { + klog.Info("Slack title field will be ignored") + } // set up signals so we handle the first shutdown signal gracefully stopCh := signals.SetupSignalHandler() - cfg, err := controller.GetConfig() + cfg, err := common.GetConfig() if err != nil { klog.Fatalf("Error getting Kubernetes config: %s", err) } @@ -108,14 +43,13 @@ func main() { } kubeInformerFactory := informers.NewSharedInformerFactory(kubeClient, time.Second*30) - - c := controller.NewController(kubeClient, kubeInformerFactory.Core().V1().ConfigMaps(), &config.Annotation, config.LogzioAPIToken, config.LogzioAPIURL, config.RulesDS, config.EnvID) - if c == nil { + ctl := controller.NewController(kubeClient, kubeInformerFactory.Core().V1().ConfigMaps(), *config) + if ctl == nil { klog.Fatal("Error creating controller") } kubeInformerFactory.Start(stopCh) - if err = c.Run(config.WorkerCount, stopCh); err != nil { + if err = ctl.Run(config.WorkerCount, stopCh); err != nil { klog.Fatalf("Error running controller: %s", err) } } diff --git a/testdata/alert_manager_contact_points.yaml b/testdata/alert_manager_contact_points.yaml new file mode 100644 index 0000000..bc46f9f --- /dev/null +++ b/testdata/alert_manager_contact_points.yaml @@ -0,0 +1,305 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: alert-manager + namespace: alert-migrator-test + labels: + app: prometheus + annotations: + prometheus.io/kube-alertmanager: "true" +data: + alert_manager: | + global: + resolve_timeout: 5m + http_config: + follow_redirects: true + enable_http2: true + smtp_from: alertmanager@logzio.com + smtp_hello: localhost + smtp_require_tls: false + slack_api_url: https://api.slack.com/ + pagerduty_url: https://events.pagerduty.com/v2/enqueue + opsgenie_api_url: https://api.opsgenie.com/ + wechat_api_url: https://qyapi.weixin.qq.com/cgi-bin/ + victorops_api_url: https://alert.victorops.com/integrations/generic/20131114/alert/ + telegram_api_url: https://api.telegram.org + webex_api_url: https://webexapis.com/v1/messages + route: + receiver: lost-alerts-slack + group_by: + - alertname + - hostname + - service + continue: false + routes: [] + receivers: + - name: lost-alerts-slack + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-sre-lost-alerts' + username: AlertManagerd (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . 
}}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-sre-lost-alertsto' + username: AlertManagers (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + + - name: ada-disaster-channels + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-ada-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: ada-major-channels + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-ada-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: ada-testlab-channels + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-ada-lab-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . 
}}' + link_names: false + - name: athlone-disaster-channels + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-athlone-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: ops-disaster-channels + pagerduty_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + service_key: + url: https://events.pagerduty.com/v2/enqueue + client: '{{ template "pagerduty.default.client" . }}' + client_url: '{{ template "pagerduty.default.clientURL" . }}' + description: '{{ template "pagerduty.default.description" .}}' + details: + firing: '{{ template "pagerduty.default.instances" .Alerts.Firing }}' + num_firing: '{{ .Alerts.Firing | len }}' + num_resolved: '{{ .Alerts.Resolved | len }}' + resolved: '{{ template "pagerduty.default.instances" .Alerts.Resolved }}' + source: '{{ template "pagerduty.default.client" . }}' + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-sysops-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: ops-major-channels + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-sysops-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: ops-quarantine-channels + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-ops-alerts-spam' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . 
}}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: ops-testlab-channelss + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-sysops-alerts' + username: AlertManager (tooling-test222) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: kube-system-alerts + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-sysops-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . }}' + link_names: false + - name: optimus-disaster-channels + slack_configs: + - send_resolved: true + http_config: + follow_redirects: true + enable_http2: true + api_url: https://api.slack.com/ + channel: '#eng-optimus-alerts' + username: AlertManager (tooling-test) + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + title: '{{ template "default.title" . }}' + title_link: '{{ template "slack.default.titlelink" . }}' + pretext: '{{ template "slack.default.pretext" . }}' + text: '{{ template "default.text" . }}' + short_fields: false + footer: '{{ template "slack.default.footer" . }}' + fallback: '{{ template "slack.default.fallback" . }}' + callback_id: '{{ template "slack.default.callbackid" . }}' + icon_emoji: '{{ template "slack.default.iconemoji" . }}' + icon_url: '{{ template "slack.default.iconurl" . 
}}' + link_names: false + templates: [] + diff --git a/testdata/alert_manager_notification_policies.yaml b/testdata/alert_manager_notification_policies.yaml new file mode 100644 index 0000000..62f433b --- /dev/null +++ b/testdata/alert_manager_notification_policies.yaml @@ -0,0 +1,137 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: alert-manager-np + namespace: alert-migrator-test + annotations: + prometheus.io/kube-alertmanager: "true" +data: + all_instances_down_otel_collector: | + global: + smtp_smarthost: 'smtp.example.com:587' + smtp_from: 'alertmanager@example.com' + smtp_auth_username: 'alertmanager' + smtp_auth_password: 'password' + route: + receiver: 'default-email' + group_by: ['alertname', 'env'] + group_wait: 30s + group_interval: 5m + repeat_interval: 1h + routes: + - matchers: + - quarantine="true" + - team="ops" + group_by: + - alertname + - hostname + - instance + - device + - node + receiver: 'slack-quarantine-ops' + continue: true + + - matchers: + - severity="critical" + - team="dev" + group_by: + - alertname + - hostname + - instance + - device + - node + receiver: 'pagerduty-critical-dev' + continue: true + + - matchers: + - environment="staging" + group_by: + - alertname + - hostname + - instance + - device + - node + receiver: 'email-staging' + continue: true + + - matchers: + - environment="production" + - severity="warning" + group_by: + - alertname + - hostname + - instance + - device + - node + receiver: 'slack-prod-warning' + continue: true + + - matchers: + - team="network" + group_by: + - alertname + - hostname + - instance + - device + - node + receiver: 'slack-network-team' + continue: true + + - matchers: + - environment="qa" + group_by: + - alertname + - hostname + - instance + - device + - node + receiver: 'slack-qa-alerts' + continue: true + + - matchers: + - service="database" + group_by: + - alertname + - hostname + - instance + - device + - node + receiver: 'email-database-service' + continue: true + + receivers: + - name: 'default-email' + email_configs: + - to: 'alerts@example.com' + + - name: 'slack-quarantine-ops' + slack_configs: + - api_url: 'https://hooks.slack.com/services/T00000000/B00000000' + channel: '#quarantine-ops-alerts' + + - name: 'pagerduty-critical-dev' + pagerduty_configs: + - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY' + + - name: 'email-staging' + email_configs: + - to: 'staging-alerts@example.com' + + - name: 'slack-prod-warning' + slack_configs: + - api_url: 'https://hooks.slack.com/services/T00000000/B11111111' + channel: '#prod-warning-alerts' + + - name: 'slack-network-team' + slack_configs: + - api_url: 'https://hooks.slack.com/services/T00000000/B22222222' + channel: '#network-team-alerts' + + - name: 'slack-qa-alerts' + slack_configs: + - api_url: 'https://hooks.slack.com/services/T00000000/B33333333' + channel: '#qa-alerts' + + - name: 'email-database-service' + email_configs: + - to: 'database-service-alerts@example.com' diff --git a/testdata/cm.yml b/testdata/cm.yml index f8011ad..838441c 100644 --- a/testdata/cm.yml +++ b/testdata/cm.yml @@ -59,9 +59,9 @@ data: annotations: description: "The Splunk OpenTelemetry collector is failing to export spans with the following protocol {% raw %}{{ $labels.exporter }}{% endraw %}" causes: "Service is most likely unhealthy" - all_instances_down_otel_collector_yotams: | + all_instances_down_otel_collector: | alert: Opentelemetry_Collector_Downq - expr: sum(up{app="opentelemetry-collectord", job="kubernetes-pods"}) == 0 or absent(up{app="opentelemetry-collector", 
job="kubernetes-pods"}) > 0 + expr: sum(up{app="opentelemetry-collectordsd", job="kubernetes-pods"}) == 1 or absent(up{app="opentelemetry-collector", job="kubernetes-pods"}) > 0 for: 5m labels: team: "sre" @@ -71,7 +71,7 @@ data: causes: "Service is most likely down or fails healthchecks" all_instances_down_splunk_collectors: | alert: Splunk_Collector_Down - expr: sum(up{app="splunk-otel-collector", job="kubernetes-pods"}) == 0 or absent(up{app="splunk-otel-collector", job="kubernetes-pods"}) > 0 + expr: sum(up{app="splunk-otel-collectorsd", job="kubernetes-pods"}) == 1 or absent(up{app="splunk-otel-collector", job="kubernetes-pods"}) > 0 for: 5m labels: team: "sre"