Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement major upgrade result annotations #2727

Merged
merged 17 commits into from
Aug 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/administrator.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ It is also possible to define `maintenanceWindows` in the Postgres manifest to
better control when such automated upgrades should take place after increasing
the version.

### Upgrade annotations

When an upgrade is executed, the operator sets an annotation in the PostgreSQL resource, either `last-major-upgrade-success` if the upgrade succeeds, or `last-major-upgrade-failure` if it fails. The value of the annotation is a timestamp indicating when the upgrade occurred.

If a PostgreSQL resource contains a failure annotation, the operator will not attempt to retry the upgrade during a sync event. To remove the failure annotation, you can revert the PostgreSQL version back to the current version. This action will trigger the removal of the failure annotation.

## Non-default cluster domain

If your cluster uses a DNS domain other than the default `cluster.local`, this
Expand Down
79 changes: 64 additions & 15 deletions e2e/tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,13 +1185,19 @@ def get_docker_image():
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_major_version_upgrade(self):
"""
Test major version upgrade
Test major version upgrade: with full upgrade, maintenance window, and annotation
"""
def check_version():
p = k8s.patroni_rest("acid-upgrade-test-0", "")
version = p.get("server_version", 0) // 10000
return version

def get_annotations():
pg_manifest = k8s.api.custom_objects_api.get_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test")
annotations = pg_manifest["metadata"]["annotations"]
return annotations

k8s = self.k8s
cluster_label = 'application=spilo,cluster-name=acid-upgrade-test'

Expand All @@ -1209,68 +1215,111 @@ def check_version():

master_nodes, _ = k8s.get_cluster_nodes(cluster_labels=cluster_label)
# should upgrade immediately
pg_patch_version_14 = {
pg_patch_version_13 = {
"spec": {
"postgresql": {
"version": "14"
"version": "13"
}
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_13)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

# should have finish failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 14, "Version should be upgraded from 12 to 14")
self.eventuallyEqual(check_version, 13, "Version should be upgraded from 12 to 13")

# check if annotation for last upgrade's success is set
annotations = get_annotations()
self.assertIsNotNone(annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set")

# should not upgrade because current time is not in maintenanceWindow
current_time = datetime.now()
maintenance_window_future = f"{(current_time+timedelta(minutes=60)).strftime('%H:%M')}-{(current_time+timedelta(minutes=120)).strftime('%H:%M')}"
pg_patch_version_15 = {
pg_patch_version_14 = {
"spec": {
"postgresql": {
"version": "15"
"version": "14"
},
"maintenanceWindows": [
maintenance_window_future
]
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_14)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

# should have finish failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 14, "Version should not be upgraded")
self.eventuallyEqual(check_version, 13, "Version should not be upgraded")
idanovinda marked this conversation as resolved.
Show resolved Hide resolved

second_annotations = get_annotations()
self.assertIsNone(second_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure should not be set")

# change the version again to trigger operator sync
maintenance_window_current = f"{(current_time-timedelta(minutes=30)).strftime('%H:%M')}-{(current_time+timedelta(minutes=30)).strftime('%H:%M')}"
pg_patch_version_16 = {
pg_patch_version_15 = {
"spec": {
"postgresql": {
"version": "16"
"version": "15"
},
"maintenanceWindows": [
maintenance_window_current
]
}
}

k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 15, "Version should be upgraded from 13 to 15")

# check if annotation for last upgrade's success is updated after second upgrade
third_annotations = get_annotations()
self.assertIsNotNone(third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not set")
self.assertNotEqual(annotations.get("last-major-upgrade-success"), third_annotations.get("last-major-upgrade-success"), "Annotation for last upgrade's success is not updated")

# test upgrade with failed upgrade annotation
pg_patch_version_16 = {
"metadata": {
"annotations": {
"last-major-upgrade-failure": "2024-01-02T15:04:05Z"
},
},
"spec": {
"postgresql": {
"version": "16"
},
},
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_16)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

# should have finish failover
k8s.wait_for_pod_failover(master_nodes, 'spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 15, "Version should not be upgraded because annotation for last upgrade's failure is set")

# change the version back to 15 and should remove failure annotation
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-upgrade-test", pg_patch_version_15)
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")

k8s.wait_for_pod_failover(master_nodes, 'spilo-role=replica,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=master,' + cluster_label)
k8s.wait_for_pod_start('spilo-role=replica,' + cluster_label)
self.eventuallyEqual(check_version, 16, "Version should be upgraded from 14 to 16")

fourth_annotations = get_annotations()
self.assertIsNone(fourth_annotations.get("last-major-upgrade-failure"), "Annotation for last upgrade's failure is not removed")

@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_persistent_volume_claim_retention_policy(self):
Expand Down
65 changes: 64 additions & 1 deletion pkg/cluster/majorversionupgrade.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
package cluster

import (
"context"
"encoding/json"
"fmt"
"strings"

"github.com/zalando/postgres-operator/pkg/spec"
"github.com/zalando/postgres-operator/pkg/util"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
)

// VersionMap Map of version numbers
Expand All @@ -18,6 +22,11 @@ var VersionMap = map[string]int{
"16": 160000,
}

const (
majorVersionUpgradeSuccessAnnotation = "last-major-upgrade-success"
majorVersionUpgradeFailureAnnotation = "last-major-upgrade-failure"
)

// IsBiggerPostgresVersion Compare two Postgres version numbers
func IsBiggerPostgresVersion(old string, new string) bool {
oldN := VersionMap[old]
Expand Down Expand Up @@ -54,6 +63,47 @@ func (c *Cluster) isUpgradeAllowedForTeam(owningTeam string) bool {
return util.SliceContains(allowedTeams, owningTeam)
}

func (c *Cluster) annotatePostgresResource(isSuccess bool) error {
annotations := make(map[string]string)
currentTime := metav1.Now().Format("2006-01-02T15:04:05Z")
if isSuccess {
annotations[majorVersionUpgradeSuccessAnnotation] = currentTime
} else {
annotations[majorVersionUpgradeFailureAnnotation] = currentTime
}
patchData, err := metaAnnotationsPatch(annotations)
if err != nil {
c.logger.Errorf("could not form patch for %s postgresql resource: %v", c.Name, err)
return err
}
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.MergePatchType, patchData, metav1.PatchOptions{})
if err != nil {
c.logger.Errorf("failed to patch annotations to postgresql resource: %v", err)
return err
}
return nil
idanovinda marked this conversation as resolved.
Show resolved Hide resolved
}

func (c *Cluster) removeFailuresAnnotation() error {
annotationToRemove := []map[string]string{
{
"op": "remove",
"path": fmt.Sprintf("/metadata/annotations/%s", majorVersionUpgradeFailureAnnotation),
},
}
removePatch, err := json.Marshal(annotationToRemove)
if err != nil {
c.logger.Errorf("could not form removal patch for %s postgresql resource: %v", c.Name, err)
return err
}
_, err = c.KubeClient.Postgresqls(c.Namespace).Patch(context.Background(), c.Name, types.JSONPatchType, removePatch, metav1.PatchOptions{})
if err != nil {
c.logger.Errorf("failed to remove annotations from postgresql resource: %v", err)
return err
}
return nil
}

idanovinda marked this conversation as resolved.
Show resolved Hide resolved
/*
Execute upgrade when mode is set to manual or full or when the owning team is allowed for upgrade (and mode is "off").

Expand All @@ -69,10 +119,19 @@ func (c *Cluster) majorVersionUpgrade() error {
desiredVersion := c.GetDesiredMajorVersionAsInt()

if c.currentMajorVersion >= desiredVersion {
if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists { // if failure annotation exists, remove it
c.removeFailuresAnnotation()
c.logger.Infof("removing failure annotation as the cluster is already up to date")
}
c.logger.Infof("cluster version up to date. current: %d, min desired: %d", c.currentMajorVersion, desiredVersion)
return nil
}

if _, exists := c.ObjectMeta.Annotations[majorVersionUpgradeFailureAnnotation]; exists {
c.logger.Infof("last major upgrade failed, skipping upgrade")
return nil
}

if !isInMainternanceWindow(c.Spec.MaintenanceWindows) {
c.logger.Infof("skipping major version upgrade, not in maintenance window")
return nil
Expand Down Expand Up @@ -107,6 +166,7 @@ func (c *Cluster) majorVersionUpgrade() error {
return nil
}

isUpgradeSuccess := true
numberOfPods := len(pods)
if allRunning && masterPod != nil {
c.logger.Infof("healthy cluster ready to upgrade, current: %d desired: %d", c.currentMajorVersion, desiredVersion)
Expand All @@ -132,11 +192,14 @@ func (c *Cluster) majorVersionUpgrade() error {
result, err = c.ExecCommand(podName, "/bin/su", "postgres", "-c", upgradeCommand)
}
if err != nil {
isUpgradeSuccess = false
c.annotatePostgresResource(isUpgradeSuccess)
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeWarning, "Major Version Upgrade", "upgrade from %d to %d FAILED: %v", c.currentMajorVersion, desiredVersion, err)
return err
}
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])

c.annotatePostgresResource(isUpgradeSuccess)
c.logger.Infof("upgrade action triggered and command completed: %s", result[:100])
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Major Version Upgrade", "upgrade from %d to %d finished", c.currentMajorVersion, desiredVersion)
}
}
Expand Down
Loading