diff --git a/docs/administrator.md b/docs/administrator.md index 3552f958b..86ceca291 100644 --- a/docs/administrator.md +++ b/docs/administrator.md @@ -85,6 +85,12 @@ It is also possible to define `maintenanceWindows` in the Postgres manifest to better control when such automated upgrades should take place after increasing the version. +### Upgrade annotations + +When an upgrade is executed, the operator sets an annotation in the PostgreSQL resource, either `last-major-upgrade-success` if the upgrade succeeds, or `last-major-upgrade-failure` if it fails. The value of the annotation is a timestamp indicating when the upgrade occurred. + +If a PostgreSQL resource contains a failure annotation, the operator will not attempt to retry the upgrade during a sync event. To remove the failure annotation, you can revert the PostgreSQL version back to the current version. This action will trigger the removal of the failure annotation. + ## Non-default cluster domain If your cluster uses a DNS domain other than the default `cluster.local`, this diff --git a/e2e/tests/test_e2e.py b/e2e/tests/test_e2e.py index 06e5c5231..f89e2fb86 100644 --- a/e2e/tests/test_e2e.py +++ b/e2e/tests/test_e2e.py @@ -1185,13 +1185,19 @@ def get_docker_image(): @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_major_version_upgrade(self): """ - Test major version upgrade + Test major version upgrade: with full upgrade, maintenance window, and annotation """ def check_version(): p = k8s.patroni_rest("acid-upgrade-test-0", "") version = p.get("server_version", 0) // 10000 return version + def get_annotations(): + pg_manifest = k8s.api.custom_objects_api.get_namespaced_custom_object( + "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test") + annotations = pg_manifest["metadata"]["annotations"] + return annotations + k8s = self.k8s cluster_label = 'application=spilo,cluster-name=acid-upgrade-test' @@ -1209,30 +1215,33 @@ def check_version(): master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label) # should upgrade immediately - pg_patch_version_14 = { + pg_patch_version_13 = { "spec": { "postgresql": { - "version": "14" + "version": "13" } } } k8s.api.custom_objects_api.patch_namespaced_custom_object( - "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14) + "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_13) self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - # should have finish failover k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - self.eventuallyEqual(check_version, 14, "Version should be upgraded from 12 to 14") + self.eventuallyEqual(check_version, 13, "Version should be upgraded from 12 to 13") + + # check if annotation for last upgrade's success is set + annotations = get_annotations() + self.assertIsNotNone(annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set") # should not upgrade because current time is not in maintenanceWindow current_time = datetime.now() maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}" - pg_patch_version_15 = { + pg_patch_version_14 = { "spec": { "postgresql": { - "version": "15" + "version": "14" }, "maintenanceWindows": [ maintenance_window_future @@ -1240,21 +1249,23 @@ def check_version(): } } k8s.api.custom_objects_api.patch_namespaced_custom_object( - "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15) + "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14) self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - # should have finish failover k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - self.eventuallyEqual(check_version, 14, "Version should not be upgraded") + self.eventuallyEqual(check_version, 13, "Version should not be upgraded") + + second_annotations = get_annotations() + self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set") # change the version again to trigger operator sync maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}" - pg_patch_version_16 = { + pg_patch_version_15 = { "spec": { "postgresql": { - "version": "16" + "version": "15" }, "maintenanceWindows": [ maintenance_window_current @@ -1262,15 +1273,53 @@ def check_version(): } } + k8s.api.custom_objects_api.patch_namespaced_custom_object( + "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15) + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") + + k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + self.eventuallyEqual(check_version, 15, "Version should be upgraded from 13 to 15") + + # check if annotation for last upgrade's success is updated after second upgrade + third_annotations = get_annotations() + self.assertIsNotNone(third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set") + self.assertNotEqual(annotations.get("last-major-upgrade-success"), third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not updated") + + # test upgrade with failed upgrade annotation + pg_patch_version_16 = { + "metadata": { + "annotations": { + "last-major-upgrade-failure": "2024-01-02T15:04:05Z" + }, + }, + "spec": { + "postgresql": { + "version": "16" + }, + }, + } k8s.api.custom_objects_api.patch_namespaced_custom_object( "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16) self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") - # should have finish failover + k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) + k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) + self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set") + + # change the version back to 15 and should remove failure annotation + k8s.api.custom_objects_api.patch_namespaced_custom_object( + "acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15) + self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync") + k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label) k8s.wait_for_pod_start('spilo-role=master,' + cluster_label) k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label) - self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16") + + fourth_annotations = get_annotations() + self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed") @timeout_decorator.timeout(TEST_TIMEOUT_SEC) def test_persistent_volume_claim_retention_policy(self): diff --git a/pkg/cluster/majorversionupgrade.go b/pkg/cluster/majorversionupgrade.go index 3d9482b25..f51e42415 100644 --- a/pkg/cluster/majorversionupgrade.go +++ b/pkg/cluster/majorversionupgrade.go @@ -1,12 +1,16 @@ package cluster import ( + "context" + "encoding/json" "fmt" "strings" "github.com/zalando/postgres-operator/pkg/spec" "github.com/zalando/postgres-operator/pkg/util" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" ) // VersionMap Map of version numbers @@ -18,6 +22,11 @@ var VersionMap = map[string]int{ "16": 160000, } +const ( + majorVersionUpgradeSuccessAnnotation = "last-major-upgrade-success" + majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure" +) + // IsBiggerPostgresVersion Compare two Postgres version numbers func IsBiggerPostgresVersion(old string, new string) bool { oldN := VersionMap[old] @@ -54,6 +63,47 @@ func (c *Cluster) isUpgradeAllowedForTeam(owningTeam string) bool { return util.SliceContains(allowedTeams, owningTeam) } +func (c *Cluster) annotatePostgresResource(isSuccess bool) error { + annotations := make(map[string]string) + currentTime := metav1.Now().Format("2006-01-02T15:04:05Z") + if isSuccess { + annotations[majorVersionUpgradeSuccessAnnotation] = currentTime + } else { + annotations[majorVersionUpgradeFailureAnnotation] = currentTime + } + patchData, err := metaAnnotationsPatch(annotations) + if err != nil { + c.logger.Errorf("could not form patch for %s postgresql resource: %v", c.Name, err) + return err + } + _, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.MergePatchType, patchData, metav1.PatchOptions{}) + if err != nil { + c.logger.Errorf("failed to patch annotations to postgresql resource: %v", err) + return err + } + return nil +} + +func (c *Cluster) removeFailuresAnnotation() error { + annotationToRemove := []map[string]string{ + { + "op": "remove", + "path": fmt.Sprintf("/metadata/annotations/%s", majorVersionUpgradeFailureAnnotation), + }, + } + removePatch, err := json.Marshal(annotationToRemove) + if err != nil { + c.logger.Errorf("could not form removal patch for %s postgresql resource: %v", c.Name, err) + return err + } + _, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.JSONPatchType, removePatch, metav1.PatchOptions{}) + if err != nil { + c.logger.Errorf("failed to remove annotations from postgresql resource: %v", err) + return err + } + return nil +} + /* Execute upgrade when mode is set to manual or full or when the owning team is allowed for upgrade (and mode is "off"). @@ -69,10 +119,19 @@ func (c *Cluster) majorVersionUpgrade() error { desiredVersion := c.GetDesiredMajorVersionAsInt() if c.currentMajorVersion >= desiredVersion { + if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists { // if failure annotation exists, remove it + c.removeFailuresAnnotation() + c.logger.Infof("removing failure annotation as the cluster is already up to date") + } c.logger.Infof("cluster version up to date. current: %d, min desired: %d", c.currentMajorVersion, desiredVersion) return nil } + if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists { + c.logger.Infof("last major upgrade failed, skipping upgrade") + return nil + } + if !isInMainternanceWindow(c.Spec.MaintenanceWindows) { c.logger.Infof("skipping major version upgrade, not in maintenance window") return nil @@ -107,6 +166,7 @@ func (c *Cluster) majorVersionUpgrade() error { return nil } + isUpgradeSuccess := true numberOfPods := len(pods) if allRunning && masterPod != nil { c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion) @@ -132,11 +192,14 @@ func (c *Cluster) majorVersionUpgrade() error { result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand) } if err != nil { + isUpgradeSuccess = false + c.annotatePostgresResource(isUpgradeSuccess) c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, err) return err } - c.logger.Infof("upgrade action triggered and command completed: %s", result[:100]) + c.annotatePostgresResource(isUpgradeSuccess) + c.logger.Infof("upgrade action triggered and command completed: %s", result[:100]) c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion) } }