From 0fb943d55efc9bfbbe494130b4f350f0e151aa77 Mon Sep 17 00:00:00 2001 From: Salah Al Saleh Date: Thu, 6 Jun 2024 02:33:05 -0700 Subject: [PATCH 1/5] Clean up existing embedded cluster systemd service before creating symlink (#675) * Clean up existing embedded cluster systemd service before creating symlink * use lstat instead of stat * lstat on reset too * both 'worker' and 'controller' nodes will have 'worker' in the command, use 'enable-worker' instead * add comment --------- Co-authored-by: Andrew Lavery --- cmd/embedded-cluster/install.go | 20 +++----------------- cmd/embedded-cluster/join.go | 14 ++++++++------ cmd/embedded-cluster/restore.go | 6 +++--- cmd/embedded-cluster/uninstall.go | 2 +- 4 files changed, 15 insertions(+), 27 deletions(-) diff --git a/cmd/embedded-cluster/install.go b/cmd/embedded-cluster/install.go index ae0950d1a..c9f4f3665 100644 --- a/cmd/embedded-cluster/install.go +++ b/cmd/embedded-cluster/install.go @@ -79,20 +79,6 @@ func configureNetworkManager(c *cli.Context) error { return nil } -// runPostInstall is a helper function that run things just after the k0s install -// command ran. -func runPostInstall() error { - src := "/etc/systemd/system/k0scontroller.service" - dst := fmt.Sprintf("/etc/systemd/system/%s.service", defaults.BinaryName()) - if err := os.Symlink(src, dst); err != nil { - return fmt.Errorf("failed to create symlink: %w", err) - } - if _, err := helpers.RunCommand("systemctl", "daemon-reload"); err != nil { - return fmt.Errorf("unable to get reload systemctl daemon: %w", err) - } - return installAndEnableLocalArtifactMirror() -} - // RunHostPreflights runs the host preflights we found embedded in the binary // on all configured hosts. We attempt to read HostPreflights from all the // embedded Helm Charts and from the Kots Application Release files. @@ -505,9 +491,9 @@ var installCommand = &cli.Command{ metrics.ReportApplyFinished(c, err) return err } - logrus.Debugf("running post install") - if err := runPostInstall(); err != nil { - err := fmt.Errorf("unable to run post install: %w", err) + logrus.Debugf("creating systemd unit files") + if err := createSystemdUnitFiles(false); err != nil { + err := fmt.Errorf("unable to create systemd unit files: %w", err) metrics.ReportApplyFinished(c, err) return err } diff --git a/cmd/embedded-cluster/join.go b/cmd/embedded-cluster/join.go index 34512f5fa..9236457aa 100644 --- a/cmd/embedded-cluster/join.go +++ b/cmd/embedded-cluster/join.go @@ -212,7 +212,9 @@ var joinCommand = &cli.Command{ } logrus.Debugf("creating systemd unit files") - if err := createSystemdUnitFiles(jcmd.K0sJoinCommand); err != nil { + // both controller and worker nodes will have 'worker' in the join command, but only controllers will have 'enable-worker' + // https://github.com/replicatedhq/kots/blob/6a0602f4054d5d5f2d97e649b3303a059f0064d9/pkg/embeddedcluster/node_join.go#L183 + if err := createSystemdUnitFiles(!strings.Contains(jcmd.K0sJoinCommand, "enable-worker")); err != nil { err := fmt.Errorf("unable to create systemd unit files: %w", err) metrics.ReportJoinFailed(c.Context, jcmd.MetricsBaseURL, jcmd.ClusterID, err) return err @@ -386,22 +388,22 @@ func systemdUnitFileName() string { // createSystemdUnitFiles links the k0s systemd unit file. this also creates a new // systemd unit file for the local artifact mirror service. 
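The hunk below also switches from os.Stat to os.Lstat before removing the old unit file. The distinction matters because os.Stat follows symlinks: on a dangling link (for example, a leftover service symlink whose k0s target was already removed) os.Stat returns an error and the stale link would never be cleaned up, while os.Lstat inspects the link itself. A minimal standalone sketch, separate from the patch, that demonstrates the difference:

package main

import (
	"fmt"
	"os"
)

func main() {
	// Create a symlink whose target does not exist, mimicking a stale
	// service symlink left behind by a previous install.
	link := "/tmp/ec-demo.service"
	_ = os.Symlink("/nonexistent/k0scontroller.service", link)
	defer os.Remove(link)

	if _, err := os.Stat(link); err != nil {
		fmt.Println("Stat follows the link and fails:", err)
	}
	if _, err := os.Lstat(link); err == nil {
		fmt.Println("Lstat sees the link itself, so it can be removed")
	}
}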
-func createSystemdUnitFiles(fullcmd string) error {
+func createSystemdUnitFiles(isWorker bool) error {
 	dst := systemdUnitFileName()
-	if _, err := os.Stat(dst); err == nil {
+	if _, err := os.Lstat(dst); err == nil {
 		if err := os.Remove(dst); err != nil {
 			return err
 		}
 	}
 	src := "/etc/systemd/system/k0scontroller.service"
-	if strings.Contains(fullcmd, "worker") {
+	if isWorker {
 		src = "/etc/systemd/system/k0sworker.service"
 	}
 	if err := os.Symlink(src, dst); err != nil {
-		return err
+		return fmt.Errorf("failed to create symlink: %w", err)
 	}
 	if _, err := helpers.RunCommand("systemctl", "daemon-reload"); err != nil {
-		return err
+		return fmt.Errorf("unable to reload systemctl daemon: %w", err)
 	}
 	return installAndEnableLocalArtifactMirror()
 }
diff --git a/cmd/embedded-cluster/restore.go b/cmd/embedded-cluster/restore.go
index 9b90b4ba4..52cf7cf37 100644
--- a/cmd/embedded-cluster/restore.go
+++ b/cmd/embedded-cluster/restore.go
@@ -762,9 +762,9 @@ var restoreCommand = &cli.Command{
 		if err := installK0s(); err != nil {
 			return fmt.Errorf("unable update cluster: %w", err)
 		}
-		logrus.Debugf("running post install")
-		if err := runPostInstall(); err != nil {
-			return fmt.Errorf("unable to run post install: %w", err)
+		logrus.Debugf("creating systemd unit files")
+		if err := createSystemdUnitFiles(false); err != nil {
+			return fmt.Errorf("unable to create systemd unit files: %w", err)
 		}
 		logrus.Debugf("waiting for k0s to be ready")
 		if err := waitForK0s(); err != nil {
diff --git a/cmd/embedded-cluster/uninstall.go b/cmd/embedded-cluster/uninstall.go
index 40aa04eec..b6afd9d88 100644
--- a/cmd/embedded-cluster/uninstall.go
+++ b/cmd/embedded-cluster/uninstall.go
@@ -412,7 +412,7 @@ var resetCommand = &cli.Command{
 		}
 	}
 
-	if _, err := os.Stat(systemdUnitFileName()); err == nil {
+	if _, err := os.Lstat(systemdUnitFileName()); err == nil {
 		if err := os.Remove(systemdUnitFileName()); err != nil {
 			return fmt.Errorf("failed to remove systemd unit file: %w", err)
 		}

From c5b414808064964c3f70c75d74f018e317e66b9c Mon Sep 17 00:00:00 2001
From: Ricardo Maraschini
Date: Thu, 6 Jun 2024 14:22:33 +0200
Subject: [PATCH 2/5] feat(ha): print warning on control plane node reset
 (#677)

Warn about removing the antepenultimate (third to last) controller node if HA is enabled.
---
 cmd/embedded-cluster/uninstall.go | 45 +++++++++++++++++++++++++++++++
 e2e/install_test.go               | 11 ++++++++
 2 files changed, 56 insertions(+)

diff --git a/cmd/embedded-cluster/uninstall.go b/cmd/embedded-cluster/uninstall.go
index b6afd9d88..bbfa2c1c2 100644
--- a/cmd/embedded-cluster/uninstall.go
+++ b/cmd/embedded-cluster/uninstall.go
@@ -11,9 +11,11 @@ import (
 	autopilot "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2"
 	"github.com/k0sproject/k0s/pkg/apis/k0s/v1beta1"
 	"github.com/k0sproject/k0s/pkg/etcd"
+	embeddedclusterv1beta1 "github.com/replicatedhq/embedded-cluster-kinds/apis/v1beta1"
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/labels"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	"github.com/replicatedhq/embedded-cluster/pkg/defaults"
@@ -55,6 +57,8 @@ var (
 	k0s = "/usr/local/bin/k0s"
 )
 
+var haWarningMessage = "WARNING: High-availability clusters must maintain at least three controller nodes, but resetting this node will leave only two. This can lead to a loss of functionality and non-recoverable failures. You should re-add a third node as soon as possible."
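The warning reflects etcd quorum arithmetic: a cluster of n members needs floor(n/2)+1 of them available, so three controllers tolerate one failure while two tolerate none. A small illustration of that math (not part of the patch):

package main

import "fmt"

func main() {
	for _, n := range []int{3, 2} {
		quorum := n/2 + 1 // floor(n/2)+1 members required for quorum
		fmt.Printf("%d controllers: quorum is %d, tolerates %d failure(s)\n", n, quorum, n-quorum)
	}
}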
+
 // deleteNode removes the node from the cluster
 func (h *hostInfo) deleteNode(ctx context.Context) error {
 	if h.KclientError != nil {
@@ -287,6 +291,43 @@ func checkErrPrompt(c *cli.Context, err error) bool {
 	return prompts.New().Confirm("Do you want to continue anyway?", false)
 }
 
+// maybePrintHAWarning prints a warning message when the user is resetting a node
+// in a high availability cluster and there are only 3 control nodes.
+func maybePrintHAWarning(c *cli.Context) error {
+	kubeconfig := defaults.PathToKubeConfig()
+	if _, err := os.Stat(kubeconfig); err != nil {
+		return nil
+	}
+
+	os.Setenv("KUBECONFIG", kubeconfig)
+	kubecli, err := kubeutils.KubeClient()
+	if err != nil {
+		return fmt.Errorf("unable to create kube client: %w", err)
+	}
+	embeddedclusterv1beta1.AddToScheme(kubecli.Scheme())
+
+	if in, err := kubeutils.GetLatestInstallation(c.Context, kubecli); err != nil {
+		return fmt.Errorf("unable to get installation: %w", err)
+	} else if !in.Spec.HighAvailability {
+		return nil
+	}
+
+	opts := &client.ListOptions{
+		LabelSelector: labels.SelectorFromSet(
+			labels.Set{"node-role.kubernetes.io/control-plane": "true"},
+		),
+	}
+	var nodes corev1.NodeList
+	if err := kubecli.List(c.Context, &nodes, opts); err != nil {
+		return fmt.Errorf("unable to list nodes: %w", err)
+	}
+	if len(nodes.Items) == 3 {
+		logrus.Warn(haWarningMessage)
+		logrus.Info("")
+	}
+	return nil
+}
+
 var resetCommand = &cli.Command{
 	Name: "reset",
 	Before: func(c *cli.Context) error {
@@ -315,6 +356,10 @@ var resetCommand = &cli.Command{
 	},
 	Usage: fmt.Sprintf("Remove %s from the current node", binName),
 	Action: func(c *cli.Context) error {
+		if err := maybePrintHAWarning(c); err != nil && !c.Bool("force") {
+			return err
+		}
+
 		logrus.Info("This will remove this node from the cluster and completely reset it, removing all data stored on the node.")
 		logrus.Info("Do not reset another node until this is complete.")
 		if !c.Bool("force") && !c.Bool("no-prompt") && !prompts.New().Confirm("Do you want to continue?", false) {
diff --git a/e2e/install_test.go b/e2e/install_test.go
index 74594282c..506db5efd 100644
--- a/e2e/install_test.go
+++ b/e2e/install_test.go
@@ -1051,6 +1051,17 @@ func TestMultiNodeHAInstallation(t *testing.T) {
 		t.Fatalf("fail to check post ha state: %v", err)
 	}
 
+	bin := strings.Split(command, " ")[0]
+	t.Logf("%s: resetting controller node", time.Now().Format(time.RFC3339))
+	stdout, stderr, err = RunCommandOnNode(t, tc, 2, []string{bin, "reset", "--no-prompt"})
+	if err != nil {
+		t.Fatalf("fail to remove controller node: %v", err)
+	}
+	if !strings.Contains(stderr, "High-availability clusters must maintain at least three controller nodes") {
+		t.Errorf("reset output does not contain the ha warning")
+		t.Logf("stdout: %s\nstderr: %s", stdout, stderr)
+	}
+
 	t.Logf("%s: test complete", time.Now().Format(time.RFC3339))
 }

From 9b75d0e87de72ba2b5241100024427b81e5b205d Mon Sep 17 00:00:00 2001
From: Andrew Lavery
Date: Tue, 11 Jun 2024 18:00:16 +1000
Subject: [PATCH 3/5] handle github version tags without a 'v' (#682)

ensure that resources are stored in the expected paths regardless of tag format
---
 .github/workflows/release-prod.yaml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release-prod.yaml b/.github/workflows/release-prod.yaml
index 157f9050c..72f094fee 100644
--- a/.github/workflows/release-prod.yaml
+++ b/.github/workflows/release-prod.yaml
@@ -14,7 +14,13 @@ jobs:
         with:
           fetch-depth: 0
       - name: Extract tag name
-        run: echo 
"TAG_NAME=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV + run: | + # remove the "refs/tags/" prefix to get the tag that was pushed + export RAW_TAG=${GITHUB_REF#refs/tags/} + # add a 'v' prefix to the tag if it doesn't already have one + export V_TAG=$(echo "$RAW_TAG" | sed 's/^[^v]/v&/') + # store the tag name in a github environment variable for later steps + echo "TAG_NAME=${V_TAG}" >> $GITHUB_ENV - name: Set up Go uses: actions/setup-go@v5 with: From b6613ec70d68623aa2518e0e02065dc4176a3923 Mon Sep 17 00:00:00 2001 From: Salah Al Saleh Date: Tue, 11 Jun 2024 07:08:58 -0700 Subject: [PATCH 4/5] Add support for HA restores (#672) * Add support for HA restores --- .github/actions/e2e/action.yml | 4 + .github/workflows/pull-request.yaml | 5 + .github/workflows/release-dev.yaml | 4 + Makefile | 6 +- .../assets/resource-modifiers.yaml | 54 +++ cmd/embedded-cluster/join.go | 25 +- cmd/embedded-cluster/join_test.go | 18 + cmd/embedded-cluster/restore.go | 344 ++++++++++---- cmd/embedded-cluster/uninstall.go | 14 +- e2e/cluster/cluster.go | 12 +- e2e/install_test.go | 30 +- .../tests/create-backup/test.spec.ts | 3 +- e2e/restore_test.go | 440 +++++++++++++++++- e2e/scripts/airgap-prepare.sh | 28 +- e2e/scripts/airgap-update.sh | 2 +- e2e/scripts/bypass-kurl-proxy.sh | 1 + e2e/scripts/check-airgap-post-ha-state.sh | 9 + e2e/scripts/check-post-ha-state.sh | 6 + e2e/scripts/default-install.sh | 2 +- e2e/scripts/install-playwright.sh | 2 +- e2e/scripts/playwright.sh | 2 +- e2e/scripts/pre-minio-removal-install.sh | 2 +- e2e/scripts/reset-installation.sh | 6 +- e2e/scripts/restore-installation-airgap.exp | 42 +- e2e/scripts/restore-installation.exp | 24 +- .../restore-multi-node-airgap-phase1.exp | 172 +++++++ .../restore-multi-node-airgap-phase2.exp | 99 ++++ e2e/scripts/restore-multi-node-phase1.exp | 172 +++++++ e2e/scripts/restore-multi-node-phase2.exp | 83 ++++ e2e/scripts/resume-restore.exp | 90 ++-- e2e/scripts/single-node-airgap-install.sh | 2 +- e2e/scripts/single-node-install.sh | 2 +- e2e/scripts/unsupported-overrides.sh | 4 +- e2e/scripts/vandoor-prepare.sh | 5 +- e2e/scripts/wait-for-ready-nodes.sh | 7 + e2e/utils.go | 2 +- go.mod | 2 +- go.sum | 4 +- pkg/addons/adminconsole/adminconsole.go | 32 +- pkg/addons/applier.go | 4 +- pkg/addons/seaweedfs/seaweedfs.go | 47 ++ pkg/addons/seaweedfs/values.yaml | 13 +- ...ions-overrides-override-admin-console.yaml | 4 +- ...ns-overrides-override-multiple-charts.yaml | 4 +- ...extensions-overrides-override-unknown.yaml | 4 +- pkg/kotscli/kotscli.go | 2 +- pkg/kubeutils/kubeutils.go | 14 + 47 files changed, 1581 insertions(+), 272 deletions(-) create mode 100644 cmd/embedded-cluster/assets/resource-modifiers.yaml create mode 100755 e2e/scripts/restore-multi-node-airgap-phase1.exp create mode 100755 e2e/scripts/restore-multi-node-airgap-phase2.exp create mode 100755 e2e/scripts/restore-multi-node-phase1.exp create mode 100755 e2e/scripts/restore-multi-node-phase2.exp diff --git a/.github/actions/e2e/action.yml b/.github/actions/e2e/action.yml index e8139fa79..4bcca503c 100644 --- a/.github/actions/e2e/action.yml +++ b/.github/actions/e2e/action.yml @@ -4,6 +4,9 @@ inputs: test-name: description: 'individual test to run' required: true + is-large-runner: + description: 'Whether the test is running on a large runner' + required: true airgap-license-id: description: 'airgap-enabled license id to use for e2e tests' required: true @@ -61,6 +64,7 @@ runs: external_ids:ovn-encap-type=geneve \ external_ids:ovn-encap-ip=127.0.0.1 - name: Free up runner disk space 
+ if: ${{ inputs.is-large-runner == 'false' }} shell: bash run: | df -h diff --git a/.github/workflows/pull-request.yaml b/.github/workflows/pull-request.yaml index 2e5aa734a..7134cd015 100644 --- a/.github/workflows/pull-request.yaml +++ b/.github/workflows/pull-request.yaml @@ -194,11 +194,15 @@ jobs: - TestMultiNodeHAInstallation - TestMultiNodeAirgapHAInstallation - TestMultiNodeAirgapUpgradeSameK0s + - TestMultiNodeHADisasterRecovery + - TestMultiNodeAirgapHADisasterRecovery include: - test: TestMultiNodeAirgapUpgrade runner: embedded-cluster - test: TestMultiNodeAirgapHAInstallation runner: embedded-cluster + - test: TestMultiNodeAirgapHADisasterRecovery + runner: embedded-cluster steps: - name: Checkout uses: actions/checkout@v4 @@ -211,6 +215,7 @@ jobs: - uses: ./.github/actions/e2e with: test-name: '${{ matrix.test }}' + is-large-runner: ${{ matrix.runner == 'embedded-cluster' }} airgap-license-id: ${{ secrets.STAGING_EMBEDDED_CLUSTER_AIRGAP_LICENSE_ID }} snapshot-license-id: ${{ secrets.STAGING_EMBEDDED_CLUSTER_SNAPSHOT_LICENSE_ID }} snapshot-license: ${{ secrets.STAGING_EMBEDDED_CLUSTER_SNAPSHOT_LICENSE }} diff --git a/.github/workflows/release-dev.yaml b/.github/workflows/release-dev.yaml index 735d68c6f..f2ec529a7 100644 --- a/.github/workflows/release-dev.yaml +++ b/.github/workflows/release-dev.yaml @@ -147,11 +147,15 @@ jobs: - TestMultiNodeHAInstallation - TestMultiNodeAirgapHAInstallation - TestMultiNodeAirgapUpgradeSameK0s + - TestMultiNodeHADisasterRecovery + - TestMultiNodeAirgapHADisasterRecovery include: - test: TestMultiNodeAirgapUpgrade runner: embedded-cluster - test: TestMultiNodeAirgapHAInstallation runner: embedded-cluster + - test: TestMultiNodeAirgapHADisasterRecovery + runner: embedded-cluster steps: - name: Checkout uses: actions/checkout@v4 diff --git a/Makefile b/Makefile index 477054d90..53fb77a8c 100644 --- a/Makefile +++ b/Makefile @@ -4,12 +4,12 @@ ARCH := $(shell uname -m) APP_NAME = embedded-cluster ADMIN_CONSOLE_CHART_URL = oci://registry.replicated.com/library ADMIN_CONSOLE_CHART_NAME = admin-console -ADMIN_CONSOLE_CHART_VERSION = 1.109.9-build.1 +ADMIN_CONSOLE_CHART_VERSION = 1.109.12 ADMIN_CONSOLE_IMAGE_OVERRIDE = ADMIN_CONSOLE_MIGRATIONS_IMAGE_OVERRIDE = EMBEDDED_OPERATOR_CHART_URL = oci://registry.replicated.com/library EMBEDDED_OPERATOR_CHART_NAME = embedded-cluster-operator -EMBEDDED_OPERATOR_CHART_VERSION = 0.34.6 +EMBEDDED_OPERATOR_CHART_VERSION = 0.34.9 EMBEDDED_OPERATOR_UTILS_IMAGE = busybox:1.36.1 EMBEDDED_CLUSTER_OPERATOR_IMAGE_OVERRIDE = OPENEBS_CHART_URL = https://openebs.github.io/openebs @@ -18,7 +18,7 @@ OPENEBS_CHART_VERSION = 4.0.1 OPENEBS_UTILS_VERSION = 4.0.0 SEAWEEDFS_CHART_URL = https://seaweedfs.github.io/seaweedfs/helm SEAWEEDFS_CHART_NAME = seaweedfs/seaweedfs -SEAWEEDFS_CHART_VERSION = 3.67.0 +SEAWEEDFS_CHART_VERSION = 3.68.0 REGISTRY_CHART_URL = https://helm.twun.io REGISTRY_CHART_NAME = twuni/docker-registry REGISTRY_CHART_VERSION = 2.2.3 diff --git a/cmd/embedded-cluster/assets/resource-modifiers.yaml b/cmd/embedded-cluster/assets/resource-modifiers.yaml new file mode 100644 index 000000000..e1485b5f4 --- /dev/null +++ b/cmd/embedded-cluster/assets/resource-modifiers.yaml @@ -0,0 +1,54 @@ +version: v1 +resourceModifierRules: +# convert kotsadm components (rqlite) to non-HA mode +# as kotsadm will always be restored to a single node +# because it is used during the restore process to add nodes +- conditions: + groupResource: statefulsets.apps + resourceNameRegex: "^kotsadm-rqlite$" + namespaces: + - kotsadm + patches: 
+ - operation: replace + path: "/spec/replicas" + value: 1 + - operation: replace + path: "/spec/template/spec/containers/0/args/2" + value: "-bootstrap-expect=1" +# decouple kotsadm components PVCs from nodes +# this allows the PVCs to be created on the correct nodes +# when restoring HA kotsadm to a single node and then converting it to HA again +- conditions: + groupResource: persistentvolumeclaims + resourceNameRegex: "kotsadm-rqlite" + namespaces: + - kotsadm + mergePatches: + - patchData: | + { + "metadata": { + "annotations": { + "volume.kubernetes.io/selected-node": null + } + } + } +# preserve the registry service IP from the original cluster +- conditions: + groupResource: services + resourceNameRegex: "^registry$" + namespaces: + - registry + patches: + - operation: replace + path: "/spec/clusterIP" + value: "__REGISTRY_SERVICE_IP__" +# preserve the seaweedfs s3 service IP from the original cluster +- conditions: + groupResource: services + resourceNameRegex: "^ec-seaweedfs-s3$" + namespaces: + - seaweedfs + patches: + - operation: replace + path: "/spec/clusterIP" + value: "__SEAWEEDFS_S3_SERVICE_IP__" diff --git a/cmd/embedded-cluster/join.go b/cmd/embedded-cluster/join.go index 9236457aa..47e866dcd 100644 --- a/cmd/embedded-cluster/join.go +++ b/cmd/embedded-cluster/join.go @@ -18,8 +18,9 @@ import ( "github.com/urfave/cli/v2" "gopkg.in/yaml.v2" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" k8syaml "sigs.k8s.io/yaml" @@ -212,9 +213,8 @@ var joinCommand = &cli.Command{ } logrus.Debugf("creating systemd unit files") - // both controller and worker nodes will have 'worker' in the join command, but only controllers will have 'enable-worker' - // https://github.com/replicatedhq/kots/blob/6a0602f4054d5d5f2d97e649b3303a059f0064d9/pkg/embeddedcluster/node_join.go#L183 - if err := createSystemdUnitFiles(!strings.Contains(jcmd.K0sJoinCommand, "enable-worker")); err != nil { + // both controller and worker nodes will have 'worker' in the join command + if err := createSystemdUnitFiles(!strings.Contains(jcmd.K0sJoinCommand, "controller")); err != nil { err := fmt.Errorf("unable to create systemd unit files: %w", err) metrics.ReportJoinFailed(c.Context, jcmd.MetricsBaseURL, jcmd.ClusterID, err) return err @@ -461,17 +461,16 @@ func canEnableHA(ctx context.Context, kcli client.Client) (bool, error) { if installation.Spec.HighAvailability { return false, nil } - var nodes corev1.NodeList - labelSelector := labels.Set(map[string]string{ - "node-role.kubernetes.io/control-plane": "true", - }).AsSelector() - if err := kcli.List(ctx, &nodes, &client.ListOptions{LabelSelector: labelSelector}); err != nil { - return false, fmt.Errorf("unable to list nodes: %w", err) + if err := kcli.Get(ctx, types.NamespacedName{Name: ecRestoreStateCMName, Namespace: "embedded-cluster"}, &corev1.ConfigMap{}); err == nil { + return false, nil // cannot enable HA during a restore + } else if !errors.IsNotFound(err) { + return false, fmt.Errorf("unable to get restore state configmap: %w", err) } - if len(nodes.Items) < 3 { - return false, nil + ncps, err := kubeutils.NumOfControlPlaneNodes(ctx, kcli) + if err != nil { + return false, fmt.Errorf("unable to check control plane nodes: %w", err) } - return true, nil + return ncps >= 3, nil } // enableHA enables high availability in the installation object diff --git 
a/cmd/embedded-cluster/join_test.go b/cmd/embedded-cluster/join_test.go
index 16e70192b..232a0af67 100644
--- a/cmd/embedded-cluster/join_test.go
+++ b/cmd/embedded-cluster/join_test.go
@@ -190,6 +190,24 @@ func Test_canEnableHA(t *testing.T) {
 			},
 			want: false,
 		},
+		{
+			name: "high availability is not enabled and there are three or more controller nodes but a restore is in progress",
+			args: args{
+				kcli: fake.NewClientBuilder().WithScheme(scheme).WithObjects(
+					&embeddedclusterv1beta1.Installation{
+						ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
+						Spec:       embeddedclusterv1beta1.InstallationSpec{HighAvailability: false},
+					},
+					&corev1.ConfigMap{
+						ObjectMeta: metav1.ObjectMeta{Name: ecRestoreStateCMName, Namespace: "embedded-cluster"},
+					},
+					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", Labels: controllerLabels}},
+					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2", Labels: controllerLabels}},
+					&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3", Labels: controllerLabels}},
+				).Build(),
+			},
+			want: false,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
diff --git a/cmd/embedded-cluster/restore.go b/cmd/embedded-cluster/restore.go
index 52cf7cf37..2381ba224 100644
--- a/cmd/embedded-cluster/restore.go
+++ b/cmd/embedded-cluster/restore.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	_ "embed"
 	"encoding/json"
 	"fmt"
 	"os"
@@ -15,6 +16,7 @@ import (
 	"github.com/aws/aws-sdk-go/service/s3"
 	"github.com/replicatedhq/embedded-cluster/pkg/addons"
 	"github.com/replicatedhq/embedded-cluster/pkg/addons/adminconsole"
+	"github.com/replicatedhq/embedded-cluster/pkg/addons/seaweedfs"
 	"github.com/replicatedhq/embedded-cluster/pkg/airgap"
 	"github.com/replicatedhq/embedded-cluster/pkg/config"
 	"github.com/replicatedhq/embedded-cluster/pkg/defaults"
@@ -39,28 +41,37 @@ import (
 type ecRestoreState string
 
 const (
-	ecRestoreStateNew              ecRestoreState = "new"
-	ecRestoreStateConfirmBackup    ecRestoreState = "confirm-backup"
-	ecRestoreStateRestoreInfra     ecRestoreState = "restore-infra"
-	ecRestoreStateRestoreRegistry  ecRestoreState = "restore-registry"
-	ecRestoreStateRestoreECInstall ecRestoreState = "restore-ec-install"
-	ecRestoreStateWaitForNodes     ecRestoreState = "wait-for-nodes"
-	ecRestoreStateRestoreApp       ecRestoreState = "restore-app"
+	ecRestoreStateNew                 ecRestoreState = "new"
+	ecRestoreStateConfirmBackup       ecRestoreState = "confirm-backup"
+	ecRestoreStateRestoreECInstall    ecRestoreState = "restore-ec-install"
+	ecRestoreStateRestoreAdminConsole ecRestoreState = "restore-admin-console"
+	ecRestoreStateWaitForNodes        ecRestoreState = "wait-for-nodes"
+	ecRestoreStateRestoreSeaweedFS    ecRestoreState = "restore-seaweedfs"
+	ecRestoreStateRestoreRegistry     ecRestoreState = "restore-registry"
+	ecRestoreStateRestoreECO          ecRestoreState = "restore-embedded-cluster-operator"
+	ecRestoreStateRestoreApp          ecRestoreState = "restore-app"
 )
 
 var ecRestoreStates = []ecRestoreState{
 	ecRestoreStateNew,
 	ecRestoreStateConfirmBackup,
-	ecRestoreStateRestoreInfra,
 	ecRestoreStateRestoreECInstall,
+	ecRestoreStateRestoreAdminConsole,
 	ecRestoreStateWaitForNodes,
+	ecRestoreStateRestoreSeaweedFS,
+	ecRestoreStateRestoreRegistry,
+	ecRestoreStateRestoreECO,
 	ecRestoreStateRestoreApp,
 }
 
 const (
-	ecRestoreStateCMName = "embedded-cluster-restore-state"
+	ecRestoreStateCMName    = "embedded-cluster-restore-state"
+	resourceModifiersCMName = "restore-resource-modifiers"
 )
 
+//go:embed assets/resource-modifiers.yaml
+var resourceModifiersYAML string
+
 type s3BackupStore struct {
 	endpoint string
region string @@ -73,10 +84,12 @@ type s3BackupStore struct { type disasterRecoveryComponent string const ( - disasterRecoveryComponentInfra disasterRecoveryComponent = "infra" - disasterRecoveryComponentECInstall disasterRecoveryComponent = "ec-install" - disasterRecoveryComponentApp disasterRecoveryComponent = "app" - disasterRecoveryComponentRegistry disasterRecoveryComponent = "registry" + disasterRecoveryComponentECInstall disasterRecoveryComponent = "ec-install" + disasterRecoveryComponentAdminConsole disasterRecoveryComponent = "admin-console" + disasterRecoveryComponentSeaweedFS disasterRecoveryComponent = "seaweedfs" + disasterRecoveryComponentRegistry disasterRecoveryComponent = "registry" + disasterRecoveryComponentECO disasterRecoveryComponent = "embedded-cluster-operator" + disasterRecoveryComponentApp disasterRecoveryComponent = "app" ) type invalidBackupsError struct { @@ -360,7 +373,6 @@ func isBackupRestorable(backup *velerov1.Backup, rel *release.ChannelRelease, is if versionLabel := appsVersions[rel.AppSlug]; versionLabel != rel.VersionLabel { return false, fmt.Sprintf("has a different app version (%q) than the current version (%q)", versionLabel, rel.VersionLabel) } - if _, ok := backup.Annotations["kots.io/is-airgap"]; !ok { return false, "is missing the kots.io/is-airgap annotation" } @@ -374,10 +386,17 @@ func isBackupRestorable(backup *velerov1.Backup, rel *release.ChannelRelease, is return false, "is an airgap backup, but the restore is configured to be online" } } - return true, "" } +func isHighAvailabilityBackup(backup *velerov1.Backup) (bool, error) { + ha, ok := backup.Annotations["kots.io/embedded-cluster-is-ha"] + if !ok { + return false, fmt.Errorf("high availability annotation not found in backup") + } + return ha == "true", nil +} + // waitForBackups waits for backups to become available. // It returns a list of restorable backups, or an error if none are found. func waitForBackups(ctx context.Context, isAirgap bool) ([]velerov1.Backup, error) { @@ -496,23 +515,104 @@ func waitForVeleroRestoreCompleted(ctx context.Context, restoreName string) (*ve } } +// getRegistryIPFromBackup gets the registry service IP from a backup. +// It returns an empty string if the backup is not airgapped. +func getRegistryIPFromBackup(backup *velerov1.Backup) (string, error) { + isAirgap, ok := backup.Annotations["kots.io/is-airgap"] + if !ok { + return "", fmt.Errorf("unable to get airgap status from backup") + } + if isAirgap != "true" { + return "", nil + } + registryServiceHost, ok := backup.Annotations["kots.io/embedded-registry"] + if !ok { + return "", fmt.Errorf("embedded registry service IP annotation not found in backup") + } + return strings.Split(registryServiceHost, ":")[0], nil +} + +// getSeaweedFSS3ServiceIPFromBackup gets the seaweedfs s3 service IP from a backup. +// It returns an empty string if the backup is not airgapped or not high availability. 
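These helpers all key off annotations written onto the Velero Backup object at backup time. The values below are illustrative only, but they show the shapes the code expects: booleans are stored as the strings "true"/"false", and kots.io/embedded-registry holds a host:port pair from which only the host is kept:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical annotation values; real ones are set on the Backup object.
	annotations := map[string]string{
		"kots.io/is-airgap":              "true",
		"kots.io/embedded-cluster-is-ha": "true",
		"kots.io/embedded-registry":      "10.96.0.11:5000",
	}
	isHA := annotations["kots.io/embedded-cluster-is-ha"] == "true"
	registryIP := strings.Split(annotations["kots.io/embedded-registry"], ":")[0]
	fmt.Println(isHA, registryIP) // true 10.96.0.11
}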
+func getSeaweedFSS3ServiceIPFromBackup(backup *velerov1.Backup) (string, error) { + isAirgap, ok := backup.Annotations["kots.io/is-airgap"] + if !ok { + return "", fmt.Errorf("unable to get airgap status from backup") + } + if isAirgap != "true" { + return "", nil + } + highAvailability, err := isHighAvailabilityBackup(backup) + if err != nil { + return "", fmt.Errorf("unable to check high availability status: %w", err) + } + if !highAvailability { + return "", nil + } + swIP, ok := backup.Annotations["kots.io/embedded-cluster-seaweedfs-s3-ip"] + if !ok { + return "", fmt.Errorf("unable to get seaweedfs s3 service IP from backup") + } + return swIP, nil +} + +// ensureRestoreResourceModifiers ensures the necessary restore resource modifiers. +// Velero resource modifiers are used to modify the resources during a Velero restore by specifying json patches. +// The json patches are applied to the resources before they are restored. +// The json patches are specified in a configmap and the configmap is referenced in the restore object. +func ensureRestoreResourceModifiers(ctx context.Context, backup *velerov1.Backup) error { + registryServiceIP, err := getRegistryIPFromBackup(backup) + if err != nil { + return fmt.Errorf("unable to get registry service IP from backup: %w", err) + } + seaweedFSS3ServiceIP, err := getSeaweedFSS3ServiceIPFromBackup(backup) + if err != nil { + return fmt.Errorf("unable to get seaweedfs s3 service IP from backup: %w", err) + } + + modifiersYAML := strings.Replace(resourceModifiersYAML, "__REGISTRY_SERVICE_IP__", registryServiceIP, 1) + modifiersYAML = strings.Replace(modifiersYAML, "__SEAWEEDFS_S3_SERVICE_IP__", seaweedFSS3ServiceIP, 1) + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: defaults.VeleroNamespace, + Name: resourceModifiersCMName, + }, + Data: map[string]string{ + "resource-modifiers.yaml": modifiersYAML, + }, + } + kcli, err := kubeutils.KubeClient() + if err != nil { + return fmt.Errorf("unable to create kube client: %w", err) + } + if err := kcli.Create(ctx, cm); err != nil && !errors.IsAlreadyExists(err) { + return fmt.Errorf("unable to create config map: %w", err) + } + return nil +} + // waitForDRComponent waits for a disaster recovery component to be restored. 
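Velero resource modifiers apply RFC 6902 JSON patches to each matching resource before it is created, so a rule like the kotsadm-rqlite one in resource-modifiers.yaml behaves as in the standalone sketch below. The sketch uses the evanphx/json-patch library directly to show the same semantics; Velero performs the equivalent step internally:

package main

import (
	"fmt"

	jsonpatch "github.com/evanphx/json-patch"
)

func main() {
	// A statefulset's replica count, reduced to 1 just as the
	// kotsadm-rqlite rule does during restore.
	doc := []byte(`{"spec":{"replicas":3}}`)
	patch, err := jsonpatch.DecodePatch([]byte(`[{"op":"replace","path":"/spec/replicas","value":1}]`))
	if err != nil {
		panic(err)
	}
	out, err := patch.Apply(doc)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out)) // {"spec":{"replicas":1}}
}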
func waitForDRComponent(ctx context.Context, drComponent disasterRecoveryComponent, restoreName string) error { loading := spinner.Start() defer loading.Close() switch drComponent { - case disasterRecoveryComponentInfra: - loading.Infof("Restoring infrastructure") case disasterRecoveryComponentECInstall: loading.Infof("Restoring cluster state") - case disasterRecoveryComponentApp: - loading.Infof("Restoring application") + case disasterRecoveryComponentAdminConsole: + loading.Infof("Restoring the Admin Console") + case disasterRecoveryComponentSeaweedFS: + loading.Infof("Restoring registry data") case disasterRecoveryComponentRegistry: loading.Infof("Restoring registry") + case disasterRecoveryComponentECO: + loading.Infof("Restoring embedded cluster operator") + case disasterRecoveryComponentApp: + loading.Infof("Restoring application") } - // wait for restore to complete + // wait for velero restore to complete restore, err := waitForVeleroRestoreCompleted(ctx, restoreName) if err != nil { if restore != nil { @@ -521,40 +621,57 @@ func waitForDRComponent(ctx context.Context, drComponent disasterRecoveryCompone return fmt.Errorf("unable to wait for velero restore to complete: %w", err) } - if drComponent == disasterRecoveryComponentECInstall { - // wait for embedded cluster installation to reconcile + if drComponent == disasterRecoveryComponentAdminConsole { + // wait for admin console to be ready kcli, err := kubeutils.KubeClient() if err != nil { return fmt.Errorf("unable to create kube client: %w", err) } - if err := kubeutils.WaitForInstallation(ctx, kcli, loading); err != nil { - return fmt.Errorf("unable to wait for installation to be ready: %w", err) + if err := adminconsole.WaitForReady(ctx, kcli, defaults.KotsadmNamespace, loading); err != nil { + return fmt.Errorf("unable to wait for admin console: %w", err) + } + } else if drComponent == disasterRecoveryComponentSeaweedFS { + // wait for seaweedfs to be ready + kcli, err := kubeutils.KubeClient() + if err != nil { + return fmt.Errorf("unable to create kube client: %w", err) + } + if err := seaweedfs.WaitForReady(ctx, kcli, defaults.SeaweedFSNamespace, nil); err != nil { + return fmt.Errorf("unable to wait for seaweedfs to be ready: %w", err) } } else if drComponent == disasterRecoveryComponentRegistry { - // delete the `registry` service to allow the helm chart reconciliation to re-create it with the desired clusterIP + // wait for registry to be ready kcli, err := kubeutils.KubeClient() if err != nil { return fmt.Errorf("unable to create kube client: %w", err) } - if err := kcli.Delete(ctx, &corev1.Service{ - ObjectMeta: metav1.ObjectMeta{ - Name: "registry", - Namespace: defaults.RegistryNamespace, - }, - }); err != nil && !errors.IsNotFound(err) { - return fmt.Errorf("unable to delete registry service: %w", err) + if err := kubeutils.WaitForDeployment(ctx, kcli, defaults.RegistryNamespace, "registry"); err != nil { + return fmt.Errorf("unable to wait for registry to be ready: %w", err) + } + } else if drComponent == disasterRecoveryComponentECO { + // wait for embedded cluster operator to reconcile the installation + kcli, err := kubeutils.KubeClient() + if err != nil { + return fmt.Errorf("unable to create kube client: %w", err) + } + if err := kubeutils.WaitForInstallation(ctx, kcli, loading); err != nil { + return fmt.Errorf("unable to wait for installation to be ready: %w", err) } } switch drComponent { - case disasterRecoveryComponentInfra: - loading.Infof("Infrastructure restored!") case 
disasterRecoveryComponentECInstall: loading.Infof("Cluster state restored!") - case disasterRecoveryComponentApp: - loading.Infof("Application restored!") + case disasterRecoveryComponentAdminConsole: + loading.Infof("Admin Console restored!") + case disasterRecoveryComponentSeaweedFS: + loading.Infof("Registry data restored!") case disasterRecoveryComponentRegistry: loading.Infof("Registry restored!") + case disasterRecoveryComponentECO: + loading.Infof("Embedded cluster operator restored!") + case disasterRecoveryComponentApp: + loading.Infof("Application restored!") } return nil @@ -582,19 +699,18 @@ func restoreFromBackup(ctx context.Context, backup *velerov1.Backup, drComponent // create a new restore object if it doesn't exist if errors.IsNotFound(err) { - var restoreLabelSelector *metav1.LabelSelector - if drComponent == disasterRecoveryComponentRegistry { - restoreLabelSelector = &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": "docker-registry", - }, - } - } else { - restoreLabelSelector = &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "replicated.com/disaster-recovery": string(drComponent), - }, - } + restoreLabels := map[string]string{} + switch drComponent { + case disasterRecoveryComponentAdminConsole, disasterRecoveryComponentECO: + restoreLabels["replicated.com/disaster-recovery-chart"] = string(drComponent) + case disasterRecoveryComponentECInstall, disasterRecoveryComponentApp: + restoreLabels["replicated.com/disaster-recovery"] = string(drComponent) + case disasterRecoveryComponentSeaweedFS: + restoreLabels["app.kubernetes.io/name"] = "seaweedfs" + case disasterRecoveryComponentRegistry: + restoreLabels["app"] = "docker-registry" + default: + return fmt.Errorf("unknown disaster recovery component: %q", drComponent) } restore := &velerov1.Restore{ @@ -606,12 +722,24 @@ func restoreFromBackup(ctx context.Context, backup *velerov1.Backup, drComponent }, }, Spec: velerov1.RestoreSpec{ - BackupName: backup.Name, - LabelSelector: restoreLabelSelector, + BackupName: backup.Name, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: restoreLabels, + }, RestorePVs: ptr.To(true), IncludeClusterResources: ptr.To(true), + ResourceModifier: &corev1.TypedLocalObjectReference{ + Kind: "ConfigMap", + Name: resourceModifiersCMName, + }, }, } + + // ensure restore resource modifiers first + if err := ensureRestoreResourceModifiers(ctx, backup); err != nil { + return fmt.Errorf("unable to ensure restore resource modifiers: %w", err) + } + _, err := veleroClient.Restores(defaults.VeleroNamespace).Create(ctx, restore, metav1.CreateOptions{}) if err != nil { return fmt.Errorf("unable to create restore: %w", err) @@ -623,37 +751,39 @@ func restoreFromBackup(ctx context.Context, backup *velerov1.Backup, drComponent } // waitForAdditionalNodes waits for for user to add additional nodes to the cluster. 
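Several hunks in this patch, including the function below, call kubeutils.NumOfControlPlaneNodes, whose implementation in pkg/kubeutils/kubeutils.go is not part of this excerpt. Judging by the label-selector code it replaces in uninstall.go, a plausible sketch of the helper is:

package kubeutils

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/labels"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// NumOfControlPlaneNodes counts the nodes carrying the control-plane role label.
func NumOfControlPlaneNodes(ctx context.Context, kcli client.Client) (int, error) {
	var nodes corev1.NodeList
	opts := &client.ListOptions{
		LabelSelector: labels.SelectorFromSet(
			labels.Set{"node-role.kubernetes.io/control-plane": "true"},
		),
	}
	if err := kcli.List(ctx, &nodes, opts); err != nil {
		return 0, fmt.Errorf("unable to list nodes: %w", err)
	}
	return len(nodes.Items), nil
}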
-func waitForAdditionalNodes(ctx context.Context) error { +func waitForAdditionalNodes(ctx context.Context, highAvailability bool) error { kcli, err := kubeutils.KubeClient() if err != nil { return fmt.Errorf("unable to create kube client: %w", err) } - loading := spinner.Start() - loading.Infof("Waiting for Admin Console to deploy") - if err := adminconsole.WaitForReady(ctx, kcli, defaults.KotsadmNamespace, loading); err != nil { - loading.Close() - return fmt.Errorf("unable to wait for admin console: %w", err) - } - loading.Closef("Admin Console is ready!") - successColor := "\033[32m" colorReset := "\033[0m" - joinNodesMsg := fmt.Sprintf("\nVisit the admin console if you need to add nodes to the cluster: %s%s%s\n", + joinNodesMsg := fmt.Sprintf("\nVisit the Admin Console if you need to add nodes to the cluster: %s%s%s\n", successColor, adminconsole.GetURL(), colorReset, ) logrus.Info(joinNodesMsg) for { p := prompts.New().Input("Type 'continue' when you are done adding nodes:", "", false) - if p == "continue" { - logrus.Info("") - break + if p != "continue" { + logrus.Info("Please type 'continue' to proceed") + continue + } + if highAvailability { + ncps, err := kubeutils.NumOfControlPlaneNodes(ctx, kcli) + if err != nil { + return fmt.Errorf("unable to check control plane nodes: %w", err) + } + if ncps < 3 { + logrus.Infof("You are restoring a high-availability cluster, which requires at least 3 controller nodes. You currently have %d. Please add more controller nodes.", ncps) + continue + } } - logrus.Info("Please type 'continue' to proceed") + break } - loading = spinner.Start() + loading := spinner.Start() loading.Infof("Waiting for all nodes to be ready") if err := kubeutils.WaitForNodes(ctx, kcli); err != nil { loading.Close() @@ -827,60 +957,90 @@ var restoreCommand = &cli.Command{ } fallthrough - case ecRestoreStateRestoreInfra: - logrus.Debugf("setting restore state to %q", ecRestoreStateRestoreInfra) - if err := setECRestoreState(c.Context, ecRestoreStateRestoreInfra, backupToRestore.Name); err != nil { + case ecRestoreStateRestoreECInstall: + logrus.Debugf("setting restore state to %q", ecRestoreStateRestoreECInstall) + if err := setECRestoreState(c.Context, ecRestoreStateRestoreECInstall, backupToRestore.Name); err != nil { + return fmt.Errorf("unable to set restore state: %w", err) + } + logrus.Debugf("restoring embedded cluster installation from backup %q", backupToRestore.Name) + if err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentECInstall); err != nil { + return err + } + fallthrough + + case ecRestoreStateRestoreAdminConsole: + logrus.Debugf("setting restore state to %q", ecRestoreStateRestoreAdminConsole) + if err := setECRestoreState(c.Context, ecRestoreStateRestoreAdminConsole, backupToRestore.Name); err != nil { + return fmt.Errorf("unable to set restore state: %w", err) + } + logrus.Debugf("restoring admin console from backup %q", backupToRestore.Name) + if err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentAdminConsole); err != nil { + return err + } + fallthrough + + case ecRestoreStateWaitForNodes: + logrus.Debugf("setting restore state to %q", ecRestoreStateWaitForNodes) + if err := setECRestoreState(c.Context, ecRestoreStateWaitForNodes, backupToRestore.Name); err != nil { return fmt.Errorf("unable to set restore state: %w", err) } + logrus.Debugf("checking if backup is high availability") + highAvailability, err := isHighAvailabilityBackup(backupToRestore) + if err != nil { + return err + } + 
logrus.Debugf("waiting for additional nodes to be added") + if err := waitForAdditionalNodes(c.Context, highAvailability); err != nil { + return err + } + fallthrough - logrus.Debugf("restoring infra from backup %q", backupToRestore.Name) - if err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentInfra); err != nil { + case ecRestoreStateRestoreSeaweedFS: + // only restore seaweedfs in case of high availability and airgap + highAvailability, err := isHighAvailabilityBackup(backupToRestore) + if err != nil { return err } + if highAvailability && c.String("airgap-bundle") != "" { + logrus.Debugf("setting restore state to %q", ecRestoreStateRestoreSeaweedFS) + if err := setECRestoreState(c.Context, ecRestoreStateRestoreSeaweedFS, backupToRestore.Name); err != nil { + return fmt.Errorf("unable to set restore state: %w", err) + } + logrus.Debugf("restoring seaweedfs from backup %q", backupToRestore.Name) + if err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentSeaweedFS); err != nil { + return err + } + } fallthrough case ecRestoreStateRestoreRegistry: + // only restore registry in case of airgap if c.String("airgap-bundle") != "" { logrus.Debugf("setting restore state to %q", ecRestoreStateRestoreRegistry) if err := setECRestoreState(c.Context, ecRestoreStateRestoreRegistry, backupToRestore.Name); err != nil { return fmt.Errorf("unable to set restore state: %w", err) } - logrus.Debugf("restoring embedded cluster registry from backup %q", backupToRestore.Name) - err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentRegistry) - if err != nil { + if err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentRegistry); err != nil { return err } - registryAddress, ok := backupToRestore.Annotations["kots.io/embedded-registry"] if !ok { return fmt.Errorf("unable to read registry address from backup") } - if err := airgap.AddInsecureRegistry(registryAddress); err != nil { return fmt.Errorf("failed to add insecure registry: %w", err) } } fallthrough - case ecRestoreStateRestoreECInstall: - logrus.Debugf("setting restore state to %q", ecRestoreStateRestoreECInstall) - if err := setECRestoreState(c.Context, ecRestoreStateRestoreECInstall, backupToRestore.Name); err != nil { - return fmt.Errorf("unable to set restore state: %w", err) - } - logrus.Debugf("restoring embedded cluster installation from backup %q", backupToRestore.Name) - if err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentECInstall); err != nil { - return err - } - fallthrough - - case ecRestoreStateWaitForNodes: - logrus.Debugf("setting restore state to %q", ecRestoreStateWaitForNodes) - if err := setECRestoreState(c.Context, ecRestoreStateWaitForNodes, backupToRestore.Name); err != nil { + case ecRestoreStateRestoreECO: + logrus.Debugf("setting restore state to %q", ecRestoreStateRestoreECO) + if err := setECRestoreState(c.Context, ecRestoreStateRestoreECO, backupToRestore.Name); err != nil { return fmt.Errorf("unable to set restore state: %w", err) } - logrus.Debugf("waiting for additional nodes to be added") - if err := waitForAdditionalNodes(c.Context); err != nil { + logrus.Debugf("restoring embedded cluster operator from backup %q", backupToRestore.Name) + if err := restoreFromBackup(c.Context, backupToRestore, disasterRecoveryComponentECO); err != nil { return err } fallthrough diff --git a/cmd/embedded-cluster/uninstall.go b/cmd/embedded-cluster/uninstall.go index bbfa2c1c2..9f8822040 100644 --- 
a/cmd/embedded-cluster/uninstall.go +++ b/cmd/embedded-cluster/uninstall.go @@ -15,7 +15,6 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/labels" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/replicatedhq/embedded-cluster/pkg/defaults" @@ -312,16 +311,11 @@ func maybePrintHAWarning(c *cli.Context) error { return nil } - opts := &client.ListOptions{ - LabelSelector: labels.SelectorFromSet( - labels.Set{"node-role.kubernetes.io/control-plane": "true"}, - ), - } - var nodes corev1.NodeList - if err := kubecli.List(c.Context, &nodes, opts); err != nil { - return fmt.Errorf("unable to list nodes: %w", err) + ncps, err := kubeutils.NumOfControlPlaneNodes(c.Context, kubecli) + if err != nil { + return fmt.Errorf("unable to check control plane nodes: %w", err) } - if len(nodes.Items) == 3 { + if ncps == 3 { logrus.Warn(haWarningMessage) logrus.Info("") } diff --git a/e2e/cluster/cluster.go b/e2e/cluster/cluster.go index 9b480d2b7..031d71ceb 100644 --- a/e2e/cluster/cluster.go +++ b/e2e/cluster/cluster.go @@ -406,7 +406,7 @@ func CopyFilesToNode(in *Input, node string) { files := []File{ { SourcePath: in.LicensePath, - DestPath: "/tmp/license.yaml", + DestPath: "/assets/license.yaml", Mode: 0644, }, { @@ -421,12 +421,12 @@ func CopyFilesToNode(in *Input, node string) { }, { SourcePath: in.AirgapInstallBundlePath, - DestPath: "/tmp/ec-release.tgz", + DestPath: "/assets/ec-release.tgz", Mode: 0755, }, { SourcePath: in.AirgapUpgradeBundlePath, - DestPath: "/tmp/ec-release-upgrade.tgz", + DestPath: "/assets/ec-release-upgrade.tgz", Mode: 0755, }, } @@ -444,7 +444,7 @@ func CopyDirsToNode(in *Input, node string) { }, { SourcePath: "playwright", - DestPath: "/tmp/playwright", + DestPath: "/automation/playwright", }, } for _, dir := range dirs { @@ -489,7 +489,7 @@ func CopyFileToNode(in *Input, node string, file File) { } { RunCommandOnNode(in, cmd, node) } - in.T.Logf("Copying `%s` to node %s", file.DestPath, node) + in.T.Logf("Copying `%s` to `%s` on node %s", file.SourcePath, file.DestPath, node) client, err := lxd.ConnectLXDUnix(lxdSocket, nil) if err != nil { in.T.Fatalf("Failed to connect to LXD: %v", err) @@ -505,7 +505,7 @@ func CopyFileToNode(in *Input, node string, file File) { Type: "file", } if err := client.CreateContainerFile(node, file.DestPath, req); err != nil { - in.T.Fatalf("Failed to copy file %s: %v", file.SourcePath, err) + in.T.Fatalf("Failed to copy file `%s` to `%s` on node %s: %v", file.SourcePath, file.DestPath, node, err) } } diff --git a/e2e/install_test.go b/e2e/install_test.go index 506db5efd..c892b5883 100644 --- a/e2e/install_test.go +++ b/e2e/install_test.go @@ -587,7 +587,7 @@ func TestSingleNodeAirgapUpgrade(t *testing.T) { t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) } // remove the airgap bundle after installation - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } @@ -611,7 +611,7 @@ func TestSingleNodeAirgapUpgrade(t *testing.T) { t.Fatalf("fail to run airgap update: %v", err) } // remove the airgap bundle after upgrade - line = []string{"rm", "/tmp/upgrade/release.airgap"} + line = []string{"rm", "/assets/upgrade/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } 
@@ -680,7 +680,7 @@ func TestMultiNodeAirgapUpgradeSameK0s(t *testing.T) { } // upgrade airgap bundle is only needed on the first node - line := []string{"rm", "/tmp/ec-release-upgrade.tgz"} + line := []string{"rm", "/assets/ec-release-upgrade.tgz"} if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { t.Fatalf("fail to remove upgrade airgap bundle on node %s: %v", tc.Nodes[1], err) } @@ -697,7 +697,7 @@ func TestMultiNodeAirgapUpgradeSameK0s(t *testing.T) { t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) } // remove artifacts after installation to save space - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } @@ -740,7 +740,7 @@ func TestMultiNodeAirgapUpgradeSameK0s(t *testing.T) { t.Fatalf("fail to join worker node to the cluster: %v", err) } // remove artifacts after joining to save space - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { t.Fatalf("fail to remove airgap bundle on worker node: %v", err) } @@ -773,7 +773,7 @@ func TestMultiNodeAirgapUpgradeSameK0s(t *testing.T) { t.Fatalf("fail to run airgap update: %v", err) } // remove the airgap bundle and binary after upgrade - line = []string{"rm", "/tmp/upgrade/release.airgap"} + line = []string{"rm", "/assets/upgrade/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } @@ -846,7 +846,7 @@ func TestMultiNodeAirgapUpgrade(t *testing.T) { } // upgrade airgap bundle is only needed on the first node - line := []string{"rm", "/tmp/ec-release-upgrade.tgz"} + line := []string{"rm", "/assets/ec-release-upgrade.tgz"} if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { t.Fatalf("fail to remove upgrade airgap bundle on node %s: %v", tc.Nodes[1], err) } @@ -863,7 +863,7 @@ func TestMultiNodeAirgapUpgrade(t *testing.T) { t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) } // remove the airgap bundle and binary after installation - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } @@ -902,7 +902,7 @@ func TestMultiNodeAirgapUpgrade(t *testing.T) { t.Fatalf("fail to join worker node to the cluster: %v", err) } // remove the airgap bundle and binary after joining - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { t.Fatalf("fail to remove airgap bundle on worker node: %v", err) } @@ -931,7 +931,7 @@ func TestMultiNodeAirgapUpgrade(t *testing.T) { t.Fatalf("fail to run airgap update: %v", err) } // remove the airgap bundle and binary after upgrade - line = []string{"rm", "/tmp/upgrade/release.airgap"} + line = []string{"rm", "/assets/upgrade/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } @@ -1082,7 +1082,7 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) { WithProxy: true, AirgapInstallBundlePath: airgapInstallBundlePath, }) - // defer cleanupCluster(t, tc) + defer cleanupCluster(t, 
tc) // delete airgap bundles once they've been copied to the nodes if err := os.Remove(airgapInstallBundlePath); err != nil { @@ -1129,7 +1129,7 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) { t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) } // remove artifacts after installation to save space - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } @@ -1171,7 +1171,7 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) { t.Fatalf("fail to join node 1 as a controller: %v", err) } // remove the airgap bundle and binary after joining - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { t.Fatalf("fail to remove airgap bundle on node 1: %v", err) } @@ -1201,7 +1201,7 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) { t.Fatalf("fail to join node 2 as a controller in ha mode: %v", err) } // remove the airgap bundle and binary after joining - line = []string{"rm", "/tmp/release.airgap"} + line = []string{"rm", "/assets/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil { t.Fatalf("fail to remove airgap bundle on node 2: %v", err) } @@ -1371,7 +1371,7 @@ func generateAndCopySupportBundle(t *testing.T, tc *cluster.Output) { } func copyPlaywrightReport(t *testing.T, tc *cluster.Output) { - line := []string{"tar", "-czf", "playwright-report.tar.gz", "-C", "/tmp/playwright/playwright-report", "."} + line := []string{"tar", "-czf", "playwright-report.tar.gz", "-C", "/automation/playwright/playwright-report", "."} if tc.Proxy != "" { t.Logf("%s: compressing playwright report on proxy node", time.Now().Format(time.RFC3339)) if _, _, err := RunCommandOnProxyNode(t, tc, line); err != nil { diff --git a/e2e/playwright/tests/create-backup/test.spec.ts b/e2e/playwright/tests/create-backup/test.spec.ts index c5e4d943b..744124cac 100644 --- a/e2e/playwright/tests/create-backup/test.spec.ts +++ b/e2e/playwright/tests/create-backup/test.spec.ts @@ -4,7 +4,6 @@ import { login, deployApp } from '../shared'; test('create backup', async ({ page }) => { test.setTimeout(5 * 60 * 1000); // 5 minutes await login(page); - await deployApp(page, expect); await page.locator('.NavItem').getByText('Disaster Recovery', { exact: true }).click(); await expect(page.getByText('Backup settings')).toBeVisible(); await page.getByPlaceholder('Bucket name').click(); @@ -23,7 +22,7 @@ test('create backup', async ({ page }) => { await expect(page.locator('.Loader')).toBeVisible(); await expect(page.getByRole('button', { name: 'Updating', exact: true })).toBeDisabled(); await expect(page.getByRole('button', { name: 'Update storage settings' })).not.toBeVisible(); - await expect(page.locator('form')).toContainText('Settings updated', { timeout: 60000 }); + await expect(page.locator('form')).toContainText('Settings updated', { timeout: 90000 }); await expect(page.locator('.Loader')).not.toBeVisible(); await expect(page.getByRole('button', { name: 'Update storage settings' })).toBeEnabled(); await page.locator('.subnav-item').getByText('Backups', { exact: true }).click(); diff --git a/e2e/restore_test.go b/e2e/restore_test.go index 383b61776..7c3490ad0 100644 --- a/e2e/restore_test.go +++ b/e2e/restore_test.go @@ -3,6 +3,7 @@ package e2e import ( "fmt" "os" + "strings" "sync" "testing" 
"time" @@ -59,6 +60,9 @@ func TestSingleNodeDisasterRecovery(t *testing.T) { if err := setupPlaywright(t, tc); err != nil { t.Fatalf("fail to setup playwright: %v", err) } + if _, _, err := runPlaywrightTest(t, tc, "deploy-app"); err != nil { + t.Fatalf("fail to run playwright test deploy-app: %v", err) + } if _, _, err := runPlaywrightTest(t, tc, "create-backup", testArgs...); err != nil { t.Fatalf("fail to run playwright test create-backup: %v", err) } @@ -127,6 +131,9 @@ func TestSingleNodeResumeDisasterRecovery(t *testing.T) { if err := setupPlaywright(t, tc); err != nil { t.Fatalf("fail to setup playwright: %v", err) } + if _, _, err := runPlaywrightTest(t, tc, "deploy-app"); err != nil { + t.Fatalf("fail to run playwright test deploy-app: %v", err) + } if _, _, err := runPlaywrightTest(t, tc, "create-backup", testArgs...); err != nil { t.Fatalf("fail to run playwright test create-backup: %v", err) } @@ -227,6 +234,9 @@ func TestSingleNodeAirgapDisasterRecovery(t *testing.T) { if err := setupPlaywright(t, tc); err != nil { t.Fatalf("fail to setup playwright: %v", err) } + if _, _, err := runPlaywrightTest(t, tc, "deploy-app"); err != nil { + t.Fatalf("fail to run playwright test deploy-app: %v", err) + } if _, _, err := runPlaywrightTest(t, tc, "create-backup", testArgs...); err != nil { t.Fatalf("fail to run playwright test create-backup: %v", err) } @@ -274,7 +284,7 @@ func TestSingleNodeAirgapDisasterRecovery(t *testing.T) { t.Fatalf("fail to run airgap update: %v", err) } // remove the airgap bundle after upgrade - line = []string{"rm", "/tmp/upgrade/release.airgap"} + line = []string{"rm", "/assets/upgrade/release.airgap"} if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err) } @@ -291,3 +301,431 @@ func TestSingleNodeAirgapDisasterRecovery(t *testing.T) { t.Logf("%s: test complete", time.Now().Format(time.RFC3339)) } + +func TestMultiNodeHADisasterRecovery(t *testing.T) { + t.Parallel() + + requiredEnvVars := []string{ + "DR_AWS_S3_ENDPOINT", + "DR_AWS_S3_REGION", + "DR_AWS_S3_BUCKET", + "DR_AWS_S3_PREFIX", + "DR_AWS_ACCESS_KEY_ID", + "DR_AWS_SECRET_ACCESS_KEY", + } + for _, envVar := range requiredEnvVars { + if os.Getenv(envVar) == "" { + t.Fatalf("missing required environment variable: %s", envVar) + } + } + + testArgs := []string{} + for _, envVar := range requiredEnvVars { + testArgs = append(testArgs, os.Getenv(envVar)) + } + + tc := cluster.NewTestCluster(&cluster.Input{ + T: t, + Nodes: 3, + Image: "debian/12", + LicensePath: "snapshot-license.yaml", + EmbeddedClusterPath: "../output/bin/embedded-cluster", + }) + defer cleanupCluster(t, tc) + + // install "expect" dependency on node 0 as that's where the restore process will be initiated. + // install "expect" dependency on node 2 as that's where the HA join command will run. 
+ t.Logf("%s: installing test dependencies on node 2", time.Now().Format(time.RFC3339)) + commands := [][]string{ + {"apt-get", "update", "-y"}, + {"apt-get", "install", "expect", "-y"}, + } + if err := RunCommandsOnNode(t, tc, 0, commands); err != nil { + t.Fatalf("fail to install test dependencies on node %s: %v", tc.Nodes[0], err) + } + if err := RunCommandsOnNode(t, tc, 2, commands); err != nil { + t.Fatalf("fail to install test dependencies on node %s: %v", tc.Nodes[2], err) + } + + t.Logf("%s: installing embedded-cluster on node 0", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 0, []string{"single-node-install.sh", "ui"}); err != nil { + t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) + } + + if err := setupPlaywright(t, tc); err != nil { + t.Fatalf("fail to setup playwright: %v", err) + } + if _, _, err := runPlaywrightTest(t, tc, "deploy-app"); err != nil { + t.Fatalf("fail to run playwright test deploy-app: %v", err) + } + + // join a controller + t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339)) + stdout, stderr, err := runPlaywrightTest(t, tc, "get-join-controller-command") + if err != nil { + t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr) + } + command, err := findJoinCommandInOutput(stdout) + if err != nil { + t.Fatalf("fail to find the join command in the output: %v", err) + } + t.Log("controller join token command:", command) + t.Logf("%s: joining node 1 to the cluster (controller)", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 1, strings.Split(command, " ")); err != nil { + t.Fatalf("fail to join node 1 as a controller: %v", err) + } + + // join another controller in HA mode + t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339)) + stdout, stderr, err = runPlaywrightTest(t, tc, "get-join-controller-command") + if err != nil { + t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr) + } + command, err = findJoinCommandInOutput(stdout) + if err != nil { + t.Fatalf("fail to find the join command in the output: %v", err) + } + t.Log("controller join token command:", command) + t.Logf("%s: joining node 2 to the cluster (controller) in ha mode", time.Now().Format(time.RFC3339)) + line := append([]string{"join-ha.exp"}, []string{command}...) + if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil { + t.Fatalf("fail to join node 2 as a controller in ha mode: %v", err) + } + + // wait for the nodes to report as ready. 
+ t.Logf("%s: all nodes joined, waiting for them to be ready", time.Now().Format(time.RFC3339)) + stdout, _, err = RunCommandOnNode(t, tc, 0, []string{"wait-for-ready-nodes.sh", "3"}) + if err != nil { + t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) + } + t.Log(stdout) + + t.Logf("%s: checking installation state after enabling high availability", time.Now().Format(time.RFC3339)) + line = []string{"check-post-ha-state.sh", os.Getenv("SHORT_SHA")} + if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { + t.Fatalf("fail to check post ha state: %v", err) + } + + if _, _, err := runPlaywrightTest(t, tc, "create-backup", testArgs...); err != nil { + t.Fatalf("fail to run playwright test create-backup: %v", err) + } + + // reset the cluster + line = []string{"reset-installation.sh", "--force", "--reboot"} + t.Logf("%s: resetting the installation on node 2", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil { + t.Fatalf("fail to reset the installation: %v", err) + } + t.Logf("%s: resetting the installation on node 1", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { + t.Fatalf("fail to reset the installation: %v", err) + } + t.Logf("%s: resetting the installation on node 0", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { + t.Fatalf("fail to reset the installation: %v", err) + } + + // wait for reboot + t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339)) + time.Sleep(60 * time.Second) + + // begin restoring the cluster + t.Logf("%s: restoring the installation: phase 1", time.Now().Format(time.RFC3339)) + line = append([]string{"restore-multi-node-phase1.exp"}, testArgs...) + if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { + t.Fatalf("fail to restore phase 1 of the installation: %v", err) + } + + // restore phase 1 completes when the prompt for adding nodes is reached. + // add the expected nodes to the cluster, then continue to phase 2. 
+
+	// join a controller
+	t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339))
+	stdout, stderr, err = runPlaywrightTest(t, tc, "get-join-controller-command")
+	if err != nil {
+		t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr)
+	}
+	command, err = findJoinCommandInOutput(stdout)
+	if err != nil {
+		t.Fatalf("fail to find the join command in the output: %v", err)
+	}
+	t.Log("controller join token command:", command)
+	t.Logf("%s: joining node 1 to the cluster (controller)", time.Now().Format(time.RFC3339))
+	if _, _, err := RunCommandOnNode(t, tc, 1, strings.Split(command, " ")); err != nil {
+		t.Fatalf("fail to join node 1 as a controller: %v", err)
+	}
+
+	// join another controller in non-HA mode
+	t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339))
+	stdout, stderr, err = runPlaywrightTest(t, tc, "get-join-controller-command")
+	if err != nil {
+		t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr)
+	}
+	command, err = findJoinCommandInOutput(stdout)
+	if err != nil {
+		t.Fatalf("fail to find the join command in the output: %v", err)
+	}
+	t.Log("controller join token command:", command)
+	t.Logf("%s: joining node 2 to the cluster (controller)", time.Now().Format(time.RFC3339))
+	if _, _, err := RunCommandOnNode(t, tc, 2, strings.Split(command, " ")); err != nil {
+		t.Fatalf("fail to join node 2 as a controller: %v", err)
+	}
+
+	// wait for the nodes to report as ready.
+	t.Logf("%s: all nodes joined, waiting for them to be ready", time.Now().Format(time.RFC3339))
+	stdout, _, err = RunCommandOnNode(t, tc, 0, []string{"wait-for-ready-nodes.sh", "3", "true"})
+	if err != nil {
+		t.Fatalf("fail to wait for ready nodes on node %s: %v", tc.Nodes[0], err)
+	}
+	t.Log(stdout)
+
+	t.Logf("%s: restoring the installation: phase 2", time.Now().Format(time.RFC3339))
+	if _, _, err := RunCommandOnNode(t, tc, 0, []string{"restore-multi-node-phase2.exp"}); err != nil {
+		t.Fatalf("fail to restore phase 2 of the installation: %v", err)
+	}
+
+	t.Logf("%s: checking installation state after restoring the high availability backup", time.Now().Format(time.RFC3339))
+	line = []string{"check-post-ha-state.sh", os.Getenv("SHORT_SHA"), "true"}
+	if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil {
+		t.Fatalf("fail to check post ha state: %v", err)
+	}
+
+	t.Logf("%s: test complete", time.Now().Format(time.RFC3339))
+}
+
+func TestMultiNodeAirgapHADisasterRecovery(t *testing.T) {
+	t.Parallel()
+
+	requiredEnvVars := []string{
+		"DR_AWS_S3_ENDPOINT",
+		"DR_AWS_S3_REGION",
+		"DR_AWS_S3_BUCKET",
+		"DR_AWS_S3_PREFIX",
+		"DR_AWS_ACCESS_KEY_ID",
+		"DR_AWS_SECRET_ACCESS_KEY",
+	}
+	for _, envVar := range requiredEnvVars {
+		if os.Getenv(envVar) == "" {
+			t.Fatalf("missing required environment variable: %s", envVar)
+		}
+	}
+
+	testArgs := []string{}
+	for _, envVar := range requiredEnvVars {
+		testArgs = append(testArgs, os.Getenv(envVar))
+	}
+
+	t.Logf("%s: downloading airgap file", time.Now().Format(time.RFC3339))
+	airgapInstallBundlePath := "/tmp/airgap-install-bundle.tar.gz"
+	downloadAirgapBundle(t, fmt.Sprintf("appver-%s", os.Getenv("SHORT_SHA")), airgapInstallBundlePath, os.Getenv("AIRGAP_SNAPSHOT_LICENSE_ID"))
+
+	tc := cluster.NewTestCluster(&cluster.Input{
+		T:                       t,
+		Nodes:                   3,
+		Image:                   "debian/12",
+		WithProxy:               true,
+		AirgapInstallBundlePath: airgapInstallBundlePath,
+	})
+	defer cleanupCluster(t, tc)
+
+	// install "expect"
dependency on node 0 as that's where the restore process will be initiated. + // install "expect" dependency on node 2 as that's where the HA join command will run. + t.Logf("%s: installing test dependencies", time.Now().Format(time.RFC3339)) + commands := [][]string{ + {"apt-get", "update", "-y"}, + {"apt-get", "install", "expect", "curl", "-y"}, + } + withEnv := WithEnv(map[string]string{ + "http_proxy": cluster.HTTPProxy, + "https_proxy": cluster.HTTPProxy, + }) + if err := RunCommandsOnNode(t, tc, 0, commands, withEnv); err != nil { + t.Fatalf("fail to install test dependencies on node %s: %v", tc.Nodes[0], err) + } + if err := RunCommandsOnNode(t, tc, 2, commands, withEnv); err != nil { + t.Fatalf("fail to install test dependencies on node %s: %v", tc.Nodes[2], err) + } + + // delete airgap bundles once they've been copied to the nodes + if err := os.Remove(airgapInstallBundlePath); err != nil { + t.Logf("failed to remove airgap install bundle: %v", err) + } + + t.Logf("%s: preparing embedded cluster airgap files", time.Now().Format(time.RFC3339)) + line := []string{"airgap-prepare.sh"} + if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { + t.Fatalf("fail to prepare airgap files on node %s: %v", tc.Nodes[0], err) + } + + t.Logf("%s: installing embedded-cluster on node 0", time.Now().Format(time.RFC3339)) + line = []string{"single-node-airgap-install.sh", "--proxy"} + withEnv = WithEnv(map[string]string{ + "HTTP_PROXY": cluster.HTTPProxy, + "HTTPS_PROXY": cluster.HTTPProxy, + "NO_PROXY": "localhost,127.0.0.1,10.96.0.0/12,.svc,.local,.default,kubernetes,kotsadm-rqlite,kotsadm-api-node", + }) + if _, _, err := RunCommandOnNode(t, tc, 0, line, withEnv); err != nil { + t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) + } + + if err := setupPlaywright(t, tc); err != nil { + t.Fatalf("fail to setup playwright: %v", err) + } + if _, _, err := runPlaywrightTest(t, tc, "deploy-app"); err != nil { + t.Fatalf("fail to run playwright test deploy-app: %v", err) + } + + // join a controller + t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339)) + stdout, stderr, err := runPlaywrightTest(t, tc, "get-join-controller-command") + if err != nil { + t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr) + } + command, err := findJoinCommandInOutput(stdout) + if err != nil { + t.Fatalf("fail to find the join command in the output: %v", err) + } + t.Log("controller join token command:", command) + t.Logf("%s: preparing embedded cluster airgap files on node 1", time.Now().Format(time.RFC3339)) + line = []string{"airgap-prepare.sh"} + if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { + t.Fatalf("fail to prepare airgap files on node 1: %v", err) + } + t.Logf("%s: joining node 1 to the cluster (controller)", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 1, strings.Split(command, " ")); err != nil { + t.Fatalf("fail to join node 1 as a controller: %v", err) + } + + // join another controller in HA mode + t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339)) + stdout, stderr, err = runPlaywrightTest(t, tc, "get-join-controller-command") + if err != nil { + t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr) + } + command, err = findJoinCommandInOutput(stdout) + if err != nil { + t.Fatalf("fail to find the join command in the output: %v", err) + } + t.Log("controller join token 
command:", command) + t.Logf("%s: preparing embedded cluster airgap files on node 2", time.Now().Format(time.RFC3339)) + line = []string{"airgap-prepare.sh"} + if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil { + t.Fatalf("fail to prepare airgap files on node 2: %v", err) + } + t.Logf("%s: joining node 2 to the cluster (controller) in ha mode", time.Now().Format(time.RFC3339)) + line = append([]string{"join-ha.exp"}, []string{command}...) + if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil { + t.Fatalf("fail to join node 2 as a controller in ha mode: %v", err) + } + + // wait for the nodes to report as ready. + t.Logf("%s: all nodes joined, waiting for them to be ready", time.Now().Format(time.RFC3339)) + stdout, _, err = RunCommandOnNode(t, tc, 0, []string{"wait-for-ready-nodes.sh", "3"}) + if err != nil { + t.Fatalf("fail to install embedded-cluster on node %s: %v", tc.Nodes[0], err) + } + t.Log(stdout) + + t.Logf("%s: checking installation state after enabling high availability", time.Now().Format(time.RFC3339)) + line = []string{"check-airgap-post-ha-state.sh", os.Getenv("SHORT_SHA")} + if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { + t.Fatalf("fail to check post ha state: %v", err) + } + + if _, _, err := runPlaywrightTest(t, tc, "create-backup", testArgs...); err != nil { + t.Fatalf("fail to run playwright test create-backup: %v", err) + } + + // reset the cluster + line = []string{"reset-installation.sh", "--force", "--reboot"} + t.Logf("%s: resetting the installation on node 2", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 2, line); err != nil { + t.Fatalf("fail to reset the installation: %v", err) + } + t.Logf("%s: resetting the installation on node 1", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 1, line); err != nil { + t.Fatalf("fail to reset the installation: %v", err) + } + t.Logf("%s: resetting the installation on node 0", time.Now().Format(time.RFC3339)) + if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil { + t.Fatalf("fail to reset the installation: %v", err) + } + + // wait for reboot + t.Logf("%s: waiting for nodes to reboot", time.Now().Format(time.RFC3339)) + time.Sleep(60 * time.Second) + + // begin restoring the cluster + t.Logf("%s: restoring the installation: phase 1", time.Now().Format(time.RFC3339)) + line = append([]string{"restore-multi-node-airgap-phase1.exp"}, testArgs...) + withEnv = WithEnv(map[string]string{ + "HTTP_PROXY": cluster.HTTPProxy, + "HTTPS_PROXY": cluster.HTTPProxy, + "NO_PROXY": "localhost,127.0.0.1,10.96.0.0/12,.svc,.local,.default,kubernetes,kotsadm-rqlite,kotsadm-api-node", + }) + if _, _, err := RunCommandOnNode(t, tc, 0, line, withEnv); err != nil { + t.Fatalf("fail to restore phase 1 of the installation: %v", err) + } + + // restore phase 1 completes when the prompt for adding nodes is reached. + // add the expected nodes to the cluster, then continue to phase 2. 
+
+	// join a controller
+	t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339))
+	stdout, stderr, err = runPlaywrightTest(t, tc, "get-join-controller-command")
+	if err != nil {
+		t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr)
+	}
+	command, err = findJoinCommandInOutput(stdout)
+	if err != nil {
+		t.Fatalf("fail to find the join command in the output: %v", err)
+	}
+	t.Log("controller join token command:", command)
+	t.Logf("%s: joining node 1 to the cluster (controller)", time.Now().Format(time.RFC3339))
+	if _, _, err := RunCommandOnNode(t, tc, 1, strings.Split(command, " ")); err != nil {
+		t.Fatalf("fail to join node 1 as a controller: %v", err)
+	}
+
+	// join another controller in non-HA mode
+	t.Logf("%s: generating a new controller token command", time.Now().Format(time.RFC3339))
+	stdout, stderr, err = runPlaywrightTest(t, tc, "get-join-controller-command")
+	if err != nil {
+		t.Fatalf("fail to generate controller join token:\nstdout: %s\nstderr: %s", stdout, stderr)
+	}
+	command, err = findJoinCommandInOutput(stdout)
+	if err != nil {
+		t.Fatalf("fail to find the join command in the output: %v", err)
+	}
+	t.Log("controller join token command:", command)
+	t.Logf("%s: joining node 2 to the cluster (controller)", time.Now().Format(time.RFC3339))
+	if _, _, err := RunCommandOnNode(t, tc, 2, strings.Split(command, " ")); err != nil {
+		t.Fatalf("fail to join node 2 as a controller: %v", err)
+	}
+
+	// wait for the nodes to report as ready.
+	t.Logf("%s: all nodes joined, waiting for them to be ready", time.Now().Format(time.RFC3339))
+	stdout, _, err = RunCommandOnNode(t, tc, 0, []string{"wait-for-ready-nodes.sh", "3", "true"})
+	if err != nil {
+		t.Fatalf("fail to wait for ready nodes on node %s: %v", tc.Nodes[0], err)
+	}
+	t.Log(stdout)
+
+	t.Logf("%s: restoring the installation: phase 2", time.Now().Format(time.RFC3339))
+	line = []string{"restore-multi-node-airgap-phase2.exp"}
+	withEnv = WithEnv(map[string]string{
+		"HTTP_PROXY":  cluster.HTTPProxy,
+		"HTTPS_PROXY": cluster.HTTPProxy,
+		"NO_PROXY":    "localhost,127.0.0.1,10.96.0.0/12,.svc,.local,.default,kubernetes,kotsadm-rqlite,kotsadm-api-node",
+	})
+	if _, _, err := RunCommandOnNode(t, tc, 0, line, withEnv); err != nil {
+		t.Fatalf("fail to restore phase 2 of the installation: %v", err)
+	}
+
+	t.Logf("%s: checking installation state after restoring the high availability backup", time.Now().Format(time.RFC3339))
+	line = []string{"check-airgap-post-ha-state.sh", os.Getenv("SHORT_SHA"), "true"}
+	if _, _, err := RunCommandOnNode(t, tc, 0, line); err != nil {
+		t.Fatalf("fail to check post ha state: %v", err)
+	}
+
+	t.Logf("%s: test complete", time.Now().Format(time.RFC3339))
+}
diff --git a/e2e/scripts/airgap-prepare.sh b/e2e/scripts/airgap-prepare.sh
index abde44cbe..69b88ed36 100755
--- a/e2e/scripts/airgap-prepare.sh
+++ b/e2e/scripts/airgap-prepare.sh
@@ -2,57 +2,57 @@ set -euox pipefail
 
 main() {
-    tar xzf /tmp/ec-release.tgz
+    tar xzf /assets/ec-release.tgz
     mv embedded-cluster-smoke-test-staging-app /usr/local/bin/embedded-cluster
-    mv license.yaml /tmp/license.yaml
+    mv license.yaml /assets/license.yaml
     for file in *.airgap; do
         if [ -e "$file" ]
         then
-            mv "$file" /tmp/release.airgap
+            mv "$file" /assets/release.airgap
             break
         fi
     done
 
     # delete the ec airgap release
-    rm /tmp/ec-release.tgz
+    rm /assets/ec-release.tgz
 
-    # if there is no file at /tmp/release.airgap, this is an error
-    if [ !
-e /tmp/release.airgap ] + # if there is no file at /assets/release.airgap, this is an error + if [ ! -e /assets/release.airgap ] then echo "No airgap file found" exit 1 fi - if [ -e /tmp/ec-release-upgrade.tgz ] + if [ -e /assets/ec-release-upgrade.tgz ] then mkdir -p upgrade - tar xzf /tmp/ec-release-upgrade.tgz -C upgrade + tar xzf /assets/ec-release-upgrade.tgz -C upgrade mv upgrade/embedded-cluster-smoke-test-staging-app /usr/local/bin/embedded-cluster-upgrade - mkdir -p /tmp/upgrade - mv upgrade/license.yaml /tmp/upgrade/license.yaml + mkdir -p /assets/upgrade + mv upgrade/license.yaml /assets/upgrade/license.yaml for file in upgrade/*.airgap; do if [ -e "$file" ] then - mv "$file" /tmp/upgrade/release.airgap + mv "$file" /assets/upgrade/release.airgap break fi done - # if there is no file at /tmp/upgrade/release.airgap, this is an error - if [ ! -e /tmp/upgrade/release.airgap ] + # if there is no file at /assets/upgrade/release.airgap, this is an error + if [ ! -e /assets/upgrade/release.airgap ] then echo "No upgrade airgap file found" exit 1 fi # delete the ec upgrade airgap release - rm /tmp/ec-release-upgrade.tgz + rm /assets/ec-release-upgrade.tgz fi } diff --git a/e2e/scripts/airgap-update.sh b/e2e/scripts/airgap-update.sh index 67f47d3c2..4bdbe47a1 100755 --- a/e2e/scripts/airgap-update.sh +++ b/e2e/scripts/airgap-update.sh @@ -3,7 +3,7 @@ set -euox pipefail main() { echo "upgrading from airgap bundle" - embedded-cluster-upgrade update --airgap-bundle /tmp/upgrade/release.airgap + embedded-cluster-upgrade update --airgap-bundle /assets/upgrade/release.airgap } export KUBECONFIG=/var/lib/k0s/pki/admin.conf diff --git a/e2e/scripts/bypass-kurl-proxy.sh b/e2e/scripts/bypass-kurl-proxy.sh index a2e0d83ec..66e66181e 100755 --- a/e2e/scripts/bypass-kurl-proxy.sh +++ b/e2e/scripts/bypass-kurl-proxy.sh @@ -11,6 +11,7 @@ main() { namespace: kotsadm labels: replicated.com/disaster-recovery: infra + replicated.com/disaster-recovery-chart: admin-console spec: type: NodePort ports: diff --git a/e2e/scripts/check-airgap-post-ha-state.sh b/e2e/scripts/check-airgap-post-ha-state.sh index c875b7896..c0e951b9a 100755 --- a/e2e/scripts/check-airgap-post-ha-state.sh +++ b/e2e/scripts/check-airgap-post-ha-state.sh @@ -61,6 +61,7 @@ ensure_app_not_upgraded() { main() { local version="appver-$1" + local from_restore="${2:-}" sleep 10 # wait for kubectl to become available echo "pods" @@ -83,6 +84,14 @@ main() { kubectl get statefulset -n seaweedfs seaweedfs-volume -o jsonpath='{.status.readyReplicas}' | grep -q 3 kubectl get statefulset -n seaweedfs seaweedfs-master -o jsonpath='{.status.readyReplicas}' | grep -q 1 + if [ "$from_restore" == "true" ]; then + # ensure volumes were restored + kubectl get podvolumerestore -n velero | grep kotsadm | grep -c backup | grep -q 1 + kubectl get podvolumerestore -n velero | grep seaweedfs-filer | grep -c data-filer | grep -q 3 + kubectl get podvolumerestore -n velero | grep seaweedfs-filer | grep -c seaweedfs-filer-log-volume | grep -q 3 + kubectl get podvolumerestore -n velero | grep seaweedfs-volume | grep -c data | grep -q 3 + fi + if ! 
wait_for_nginx_pods; then echo "Failed waiting for the application's nginx pods" exit 1 diff --git a/e2e/scripts/check-post-ha-state.sh b/e2e/scripts/check-post-ha-state.sh index edb79dc4d..8332a9876 100755 --- a/e2e/scripts/check-post-ha-state.sh +++ b/e2e/scripts/check-post-ha-state.sh @@ -43,6 +43,7 @@ ensure_app_not_upgraded() { main() { local version="$1" + local from_restore="${2:-}" sleep 10 # wait for kubectl to become available echo "pods" @@ -57,6 +58,11 @@ main() { # ensure rqlite is running in HA mode kubectl get sts -n kotsadm kotsadm-rqlite -o jsonpath='{.status.readyReplicas}' | grep -q 3 + if [ "$from_restore" == "true" ]; then + # ensure volumes were restored + kubectl get podvolumerestore -n velero | grep kotsadm | grep -c backup | grep -q 1 + fi + if ! wait_for_nginx_pods; then echo "Failed waiting for the application's nginx pods" exit 1 diff --git a/e2e/scripts/default-install.sh b/e2e/scripts/default-install.sh index 2b8009f3b..5ba3ca291 100755 --- a/e2e/scripts/default-install.sh +++ b/e2e/scripts/default-install.sh @@ -56,7 +56,7 @@ check_openebs_storage_class() { } main() { - if embedded-cluster install --no-prompt --license /tmp/license.yaml 2>&1 | tee /tmp/log ; then + if embedded-cluster install --no-prompt --license /assets/license.yaml 2>&1 | tee /tmp/log ; then echo "Expected installation to fail with a license provided" exit 1 fi diff --git a/e2e/scripts/install-playwright.sh b/e2e/scripts/install-playwright.sh index 81536793b..5005b48f7 100755 --- a/e2e/scripts/install-playwright.sh +++ b/e2e/scripts/install-playwright.sh @@ -13,7 +13,7 @@ main() { echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list apt-get update && apt-get install nodejs -y - cd /tmp/playwright + cd /automation/playwright npm ci npx playwright install --with-deps } diff --git a/e2e/scripts/playwright.sh b/e2e/scripts/playwright.sh index 004a9b352..1fbcd93ef 100755 --- a/e2e/scripts/playwright.sh +++ b/e2e/scripts/playwright.sh @@ -25,7 +25,7 @@ main() { fi export BASE_URL="http://10.0.0.2:30001" - cd /tmp/playwright + cd /automation/playwright npx playwright test "$test_name" } diff --git a/e2e/scripts/pre-minio-removal-install.sh b/e2e/scripts/pre-minio-removal-install.sh index 0d1765c7f..f9fb8d07d 100755 --- a/e2e/scripts/pre-minio-removal-install.sh +++ b/e2e/scripts/pre-minio-removal-install.sh @@ -210,7 +210,7 @@ main() { exit 1 fi - if ! embedded-cluster install --no-prompt --license /tmp/license.yaml 2>&1 | tee /tmp/log ; then + if ! embedded-cluster install --no-prompt --license /assets/license.yaml 2>&1 | tee /tmp/log ; then echo "Failed to install embedded-cluster" exit 1 fi diff --git a/e2e/scripts/reset-installation.sh b/e2e/scripts/reset-installation.sh index 8b2e8ba00..f595cf8b2 100755 --- a/e2e/scripts/reset-installation.sh +++ b/e2e/scripts/reset-installation.sh @@ -2,7 +2,9 @@ set -euox pipefail main() { - if ! embedded-cluster reset --no-prompt | tee /tmp/log ; then + local additional_flags=("$@") + + if ! 
embedded-cluster reset --no-prompt "${additional_flags[@]}" | tee /tmp/log ; then echo "Failed to uninstall embedded-cluster" exit 1 fi @@ -16,4 +18,4 @@ main() { export EMBEDDED_CLUSTER_METRICS_BASEURL="https://staging.replicated.app" export KUBECONFIG=/var/lib/k0s/pki/admin.conf export PATH=$PATH:/var/lib/embedded-cluster/bin -main +main "$@" diff --git a/e2e/scripts/restore-installation-airgap.exp b/e2e/scripts/restore-installation-airgap.exp index c56715a29..ba0a70622 100755 --- a/e2e/scripts/restore-installation-airgap.exp +++ b/e2e/scripts/restore-installation-airgap.exp @@ -12,7 +12,7 @@ set dr_aws_s3_prefix [lindex $argv 3] set dr_aws_access_key_id [lindex $argv 4] set dr_aws_secret_access_key [lindex $argv 5] -spawn embedded-cluster restore --airgap-bundle /tmp/release.airgap --proxy +spawn embedded-cluster restore --airgap-bundle /assets/release.airgap --proxy expect { "Enter information to configure access to your backup storage location." {} @@ -117,23 +117,7 @@ expect { } expect { - -timeout 60 "Infrastructure restored!" {} - timeout { - puts "\n\nFailed to restore infrastructure." - exit 1 - } -} - -expect { - -timeout 150 "Registry restored!" {} - timeout { - puts "\n\nFailed to restore registry." - exit 1 - } -} - -expect { - -timeout 150 "Cluster state restored!" {} + -timeout 30 "Cluster state restored!" {} timeout { puts "\n\nFailed to restore cluster state." exit 1 @@ -141,15 +125,15 @@ expect { } expect { - -timeout 60 "Admin Console is ready!" {} + -timeout 150 "Admin Console restored!" {} timeout { - puts "\n\nFailed to wait for admin console to be ready." + puts "\n\nFailed to restore admin console." exit 1 } } expect { - "Visit the admin console if you need to add nodes to the cluster" {} + "Visit the Admin Console if you need to add nodes to the cluster" {} timeout { puts "\n\nFailed to find admin console URL." exit 1 @@ -184,6 +168,22 @@ expect { } } +expect { + -timeout 150 "Registry restored!" {} + timeout { + puts "\n\nFailed to restore registry." + exit 1 + } +} + +expect { + -timeout 240 "Embedded cluster operator restored!" {} + timeout { + puts "\n\nFailed to restore embedded cluster operator." + exit 1 + } +} + expect { -timeout 60 "Application restored!" { exit 0 diff --git a/e2e/scripts/restore-installation.exp b/e2e/scripts/restore-installation.exp index 03d642ffe..4546dc1ab 100755 --- a/e2e/scripts/restore-installation.exp +++ b/e2e/scripts/restore-installation.exp @@ -117,15 +117,7 @@ expect { } expect { - -timeout 60 "Infrastructure restored!" {} - timeout { - puts "\n\nFailed to restore infrastructure." - exit 1 - } -} - -expect { - -timeout 150 "Cluster state restored!" {} + -timeout 30 "Cluster state restored!" {} timeout { puts "\n\nFailed to restore cluster state." exit 1 @@ -133,15 +125,15 @@ expect { } expect { - -timeout 60 "Admin Console is ready!" {} + -timeout 150 "Admin Console restored!" {} timeout { - puts "\n\nFailed to wait for admin console to be ready." + puts "\n\nFailed to restore admin console." exit 1 } } expect { - "Visit the admin console if you need to add nodes to the cluster" {} + "Visit the Admin Console if you need to add nodes to the cluster" {} timeout { puts "\n\nFailed to find admin console URL." exit 1 @@ -176,6 +168,14 @@ expect { } } +expect { + -timeout 240 "Embedded cluster operator restored!" {} + timeout { + puts "\n\nFailed to restore embedded cluster operator." + exit 1 + } +} + expect { -timeout 60 "Application restored!" 
{ exit 0 diff --git a/e2e/scripts/restore-multi-node-airgap-phase1.exp b/e2e/scripts/restore-multi-node-airgap-phase1.exp new file mode 100755 index 000000000..300d17344 --- /dev/null +++ b/e2e/scripts/restore-multi-node-airgap-phase1.exp @@ -0,0 +1,172 @@ +#!/usr/bin/env expect + +set env(EMBEDDED_CLUSTER_PLAIN_PROMPTS) "true" +set env(EMBEDDED_CLUSTER_METRICS_BASEURL) "https://staging.replicated.app" +set env(KUBECONFIG) "/var/lib/k0s/pki/admin.conf" +set env(PATH) "$env(PATH):/var/lib/embedded-cluster/bin" + +set dr_aws_s3_endpoint [lindex $argv 0] +set dr_aws_s3_region [lindex $argv 1] +set dr_aws_s3_bucket [lindex $argv 2] +set dr_aws_s3_prefix [lindex $argv 3] +set dr_aws_access_key_id [lindex $argv 4] +set dr_aws_secret_access_key [lindex $argv 5] + +spawn embedded-cluster restore --airgap-bundle /assets/release.airgap --proxy + +expect { + "Enter information to configure access to your backup storage location." {} + timeout { + puts "\n\nFailed to find introduction." + exit 1 + } +} + +expect { + "S3 endpoint:" { + send "$dr_aws_s3_endpoint\r" + } + timeout { + puts "\n\nFailed to find 'S3 endpoint' prompt." + exit 1 + } +} + +expect { + "Region:" { + send "$dr_aws_s3_region\r" + } + timeout { + puts "\n\nFailed to find 'Region' prompt." + exit 1 + } +} + +expect { + "Bucket:" { + send "$dr_aws_s3_bucket\r" + } + timeout { + puts "\n\nFailed to find 'Bucket' prompt." + exit 1 + } +} + +expect { + "Prefix (press Enter to skip):" { + send "$dr_aws_s3_prefix\r" + } + timeout { + puts "\n\nFailed to find 'Prefix' prompt." + exit 1 + } +} + +expect { + "Access key ID:" { + send "$dr_aws_access_key_id\r" + } + timeout { + puts "\n\nFailed to find 'Access key ID' prompt." + exit 1 + } +} + +expect { + "Secret access key:" { + send "$dr_aws_secret_access_key\r" + } + timeout { + puts "\n\nFailed to find 'Secret access key' prompt." + exit 1 + } +} + +expect { + -timeout 210 "Velero is ready!" {} + timeout { + puts "\n\nFailed to wait for Velero to be ready." + exit 1 + } +} + +expect { + -timeout 30 "Backup storage location configured!" {} + timeout { + puts "\n\nFailed to configure backup storage location." + exit 1 + } +} + +expect { + -timeout 30 "Found 1 restorable backup!" {} + timeout { + puts "\n\nFailed to find a restorable backup." + exit 1 + } +} + +expect { + -re "Restore from backup.*\?" { + send "Y\r" + } + timeout { + puts "\n\nFailed to find 'Restore from backup' prompt." + exit 1 + } +} + +expect { + -timeout 30 "Cluster state restored!" {} + timeout { + puts "\n\nFailed to restore cluster state." + exit 1 + } +} + +expect { + -timeout 150 "Admin Console restored!" {} + timeout { + puts "\n\nFailed to restore admin console." + exit 1 + } +} + +expect { + "Visit the Admin Console if you need to add nodes to the cluster" {} + timeout { + puts "\n\nFailed to find admin console URL." + exit 1 + } +} + +expect { + "Type 'continue' when you are done adding nodes" { + send "continue\r" + } + timeout { + puts "\n\nFailed to find 'done adding nodes' prompt." + exit 1 + } +} + +expect { + "You are restoring a high-availability cluster, which requires at least 3 controller nodes. You currently have 1. Please add more controller nodes." {} + timeout { + puts "\n\nFailed to find '3 controllers required' warning" + exit 1 + } +} + +expect { + "Type 'continue' when you are done adding nodes" { + exit 0 + } + timeout { + puts "\n\nFailed to find 'done adding nodes' prompt." + exit 1 + } +} + +puts "\n\nCommand exited before finishing all validations." 
+exit 1 diff --git a/e2e/scripts/restore-multi-node-airgap-phase2.exp b/e2e/scripts/restore-multi-node-airgap-phase2.exp new file mode 100755 index 000000000..41f5ad42b --- /dev/null +++ b/e2e/scripts/restore-multi-node-airgap-phase2.exp @@ -0,0 +1,99 @@ +#!/usr/bin/env expect + +set env(EMBEDDED_CLUSTER_PLAIN_PROMPTS) "true" +set env(EMBEDDED_CLUSTER_METRICS_BASEURL) "https://staging.replicated.app" +set env(KUBECONFIG) "/var/lib/k0s/pki/admin.conf" +set env(PATH) "$env(PATH):/var/lib/embedded-cluster/bin" + +spawn embedded-cluster restore --airgap-bundle /assets/release.airgap --proxy + +expect { + "A previous restore operation was detected. Would you like to resume?" { + send "Y\r" + } + timeout { + puts "\n\nFailed to find 'previous restore operation was detected' prompt." + exit 1 + } +} + +expect { + "Resuming restore from backup" {} + timeout { + puts "\n\nFailed to find 'resuming restore from backup' message." + exit 1 + } +} + +expect { + "Visit the Admin Console if you need to add nodes to the cluster" {} + timeout { + puts "\n\nFailed to find admin console URL." + exit 1 + } +} + +expect { + "Type 'continue' when you are done adding nodes" { + send "\r" + } + timeout { + puts "\n\nFailed to find 'done adding nodes' prompt." + exit 1 + } +} + +expect { + "Please type 'continue' to proceed" { + send "continue\r" + } + timeout { + puts "\n\nFailed to find 'type continue to proceed' prompt." + exit 1 + } +} + +expect { + -timeout 30 "All nodes are ready!" {} + timeout { + puts "\n\nFailed to wait for nodes." + exit 1 + } +} + +expect { + -timeout 210 "Registry data restored!" {} + timeout { + puts "\n\nFailed to restore seaweedfs." + exit 1 + } +} + +expect { + -timeout 90 "Registry restored!" {} + timeout { + puts "\n\nFailed to restore registry." + exit 1 + } +} + +expect { + -timeout 240 "Embedded cluster operator restored!" {} + timeout { + puts "\n\nFailed to restore embedded cluster operator." + exit 1 + } +} + +expect { + -timeout 60 "Application restored!" { + exit 0 + } + timeout { + puts "\n\nFailed to restore application." + exit 1 + } +} + +puts "\n\nCommand exited before finishing all validations." +exit 1 diff --git a/e2e/scripts/restore-multi-node-phase1.exp b/e2e/scripts/restore-multi-node-phase1.exp new file mode 100755 index 000000000..46d657810 --- /dev/null +++ b/e2e/scripts/restore-multi-node-phase1.exp @@ -0,0 +1,172 @@ +#!/usr/bin/env expect + +set env(EMBEDDED_CLUSTER_PLAIN_PROMPTS) "true" +set env(EMBEDDED_CLUSTER_METRICS_BASEURL) "https://staging.replicated.app" +set env(KUBECONFIG) "/var/lib/k0s/pki/admin.conf" +set env(PATH) "$env(PATH):/var/lib/embedded-cluster/bin" + +set dr_aws_s3_endpoint [lindex $argv 0] +set dr_aws_s3_region [lindex $argv 1] +set dr_aws_s3_bucket [lindex $argv 2] +set dr_aws_s3_prefix [lindex $argv 3] +set dr_aws_access_key_id [lindex $argv 4] +set dr_aws_secret_access_key [lindex $argv 5] + +spawn embedded-cluster restore + +expect { + "Enter information to configure access to your backup storage location." {} + timeout { + puts "\n\nFailed to find introduction." + exit 1 + } +} + +expect { + "S3 endpoint:" { + send "$dr_aws_s3_endpoint\r" + } + timeout { + puts "\n\nFailed to find 'S3 endpoint' prompt." + exit 1 + } +} + +expect { + "Region:" { + send "$dr_aws_s3_region\r" + } + timeout { + puts "\n\nFailed to find 'Region' prompt." + exit 1 + } +} + +expect { + "Bucket:" { + send "$dr_aws_s3_bucket\r" + } + timeout { + puts "\n\nFailed to find 'Bucket' prompt." 
+ exit 1 + } +} + +expect { + "Prefix (press Enter to skip):" { + send "$dr_aws_s3_prefix\r" + } + timeout { + puts "\n\nFailed to find 'Prefix' prompt." + exit 1 + } +} + +expect { + "Access key ID:" { + send "$dr_aws_access_key_id\r" + } + timeout { + puts "\n\nFailed to find 'Access key ID' prompt." + exit 1 + } +} + +expect { + "Secret access key:" { + send "$dr_aws_secret_access_key\r" + } + timeout { + puts "\n\nFailed to find 'Secret access key' prompt." + exit 1 + } +} + +expect { + -timeout 210 "Velero is ready!" {} + timeout { + puts "\n\nFailed to wait for Velero to be ready." + exit 1 + } +} + +expect { + -timeout 30 "Backup storage location configured!" {} + timeout { + puts "\n\nFailed to configure backup storage location." + exit 1 + } +} + +expect { + -timeout 30 "Found 1 restorable backup!" {} + timeout { + puts "\n\nFailed to find a restorable backup." + exit 1 + } +} + +expect { + -re "Restore from backup.*\?" { + send "Y\r" + } + timeout { + puts "\n\nFailed to find 'Restore from backup' prompt." + exit 1 + } +} + +expect { + -timeout 30 "Cluster state restored!" {} + timeout { + puts "\n\nFailed to restore cluster state." + exit 1 + } +} + +expect { + -timeout 150 "Admin Console restored!" {} + timeout { + puts "\n\nFailed to restore admin console." + exit 1 + } +} + +expect { + "Visit the Admin Console if you need to add nodes to the cluster" {} + timeout { + puts "\n\nFailed to find admin console URL." + exit 1 + } +} + +expect { + "Type 'continue' when you are done adding nodes" { + send "continue\r" + } + timeout { + puts "\n\nFailed to find 'done adding nodes' prompt." + exit 1 + } +} + +expect { + "You are restoring a high-availability cluster, which requires at least 3 controller nodes. You currently have 1. Please add more controller nodes." {} + timeout { + puts "\n\nFailed to find '3 controllers required' warning" + exit 1 + } +} + +expect { + "Type 'continue' when you are done adding nodes" { + exit 0 + } + timeout { + puts "\n\nFailed to find 'done adding nodes' prompt." + exit 1 + } +} + +puts "\n\nCommand exited before finishing all validations." +exit 1 diff --git a/e2e/scripts/restore-multi-node-phase2.exp b/e2e/scripts/restore-multi-node-phase2.exp new file mode 100755 index 000000000..1ec98c3ae --- /dev/null +++ b/e2e/scripts/restore-multi-node-phase2.exp @@ -0,0 +1,83 @@ +#!/usr/bin/env expect + +set env(EMBEDDED_CLUSTER_PLAIN_PROMPTS) "true" +set env(EMBEDDED_CLUSTER_METRICS_BASEURL) "https://staging.replicated.app" +set env(KUBECONFIG) "/var/lib/k0s/pki/admin.conf" +set env(PATH) "$env(PATH):/var/lib/embedded-cluster/bin" + +spawn embedded-cluster restore + +expect { + "A previous restore operation was detected. Would you like to resume?" { + send "Y\r" + } + timeout { + puts "\n\nFailed to find 'previous restore operation was detected' prompt." + exit 1 + } +} + +expect { + "Resuming restore from backup" {} + timeout { + puts "\n\nFailed to find 'resuming restore from backup' message." + exit 1 + } +} + +expect { + "Visit the Admin Console if you need to add nodes to the cluster" {} + timeout { + puts "\n\nFailed to find admin console URL." + exit 1 + } +} + +expect { + "Type 'continue' when you are done adding nodes" { + send "\r" + } + timeout { + puts "\n\nFailed to find 'done adding nodes' prompt." + exit 1 + } +} + +expect { + "Please type 'continue' to proceed" { + send "continue\r" + } + timeout { + puts "\n\nFailed to find 'type continue to proceed' prompt." + exit 1 + } +} + +expect { + -timeout 30 "All nodes are ready!" 
{} + timeout { + puts "\n\nFailed to wait for nodes." + exit 1 + } +} + +expect { + -timeout 240 "Embedded cluster operator restored!" {} + timeout { + puts "\n\nFailed to restore embedded cluster operator." + exit 1 + } +} + +expect { + -timeout 60 "Application restored!" { + exit 0 + } + timeout { + puts "\n\nFailed to restore application." + exit 1 + } +} + +puts "\n\nCommand exited before finishing all validations." +exit 1 diff --git a/e2e/scripts/resume-restore.exp b/e2e/scripts/resume-restore.exp index 05e6ac67c..52e89651c 100755 --- a/e2e/scripts/resume-restore.exp +++ b/e2e/scripts/resume-restore.exp @@ -167,10 +167,11 @@ expect { } } + expect { - "Restoring infrastructure" {} + "Restoring cluster state" {} timeout { - puts "\n\nFailed to find 'restoring infrastructure' spinner." + puts "\n\nFailed to find 'restoring cluster state' spinner." exit 1 } } @@ -197,25 +198,25 @@ expect { } expect { - "Restoring infrastructure" {} + "Restoring cluster state" {} timeout { - puts "\n\nFailed to find 'restoring infrastructure' spinner." + puts "\n\nFailed to find 'restoring cluster state' spinner." exit 1 } } expect { - -timeout 60 "Infrastructure restored!" {} + -timeout 150 "Cluster state restored!" {} timeout { - puts "\n\nFailed to restore infrastructure." + puts "\n\nFailed to restore cluster state." exit 1 } } expect { - "Restoring cluster state" {} + "Restoring the Admin Console" {} timeout { - puts "\n\nFailed to find 'restoring cluster state' spinner." + puts "\n\nFailed to find 'restoring the admin console' spinner." exit 1 } } @@ -242,31 +243,23 @@ expect { } expect { - "Restoring cluster state" {} - timeout { - puts "\n\nFailed to find 'restoring cluster state' spinner." - exit 1 - } -} - -expect { - -timeout 150 "Cluster state restored!" {} + "Restoring the Admin Console" {} timeout { - puts "\n\nFailed to restore cluster state." + puts "\n\nFailed to find 'restoring the admin console' spinner." exit 1 } } expect { - -timeout 60 "Admin Console is ready!" {} + -timeout 150 "Admin Console restored!" {} timeout { - puts "\n\nFailed to wait for admin console to be ready." + puts "\n\nFailed to restore admin console." exit 1 } } expect { - "Visit the admin console if you need to add nodes to the cluster" {} + "Visit the Admin Console if you need to add nodes to the cluster" {} timeout { puts "\n\nFailed to find admin console URL." exit 1 @@ -305,15 +298,7 @@ expect { } expect { - -timeout 60 "Admin Console is ready!" {} - timeout { - puts "\n\nFailed to wait for admin console to be ready." - exit 1 - } -} - -expect { - "Visit the admin console if you need to add nodes to the cluster" {} + "Visit the Admin Console if you need to add nodes to the cluster" {} timeout { puts "\n\nFailed to find admin console URL." exit 1 @@ -348,6 +333,51 @@ expect { } } +expect { + "Restoring embedded cluster operator" {} + timeout { + puts "\n\nFailed to find 'restoring embedded cluster operator' spinner." + exit 1 + } +} + +send_interrupt +spawn embedded-cluster restore + +expect { + "A previous restore operation was detected. Would you like to resume?" { + send "Y\r" + } + timeout { + puts "\n\nFailed to find 'previous restore operation was detected' prompt." + exit 1 + } +} + +expect { + "Resuming restore from backup" {} + timeout { + puts "\n\nFailed to find 'resuming restore from backup' message." + exit 1 + } +} + +expect { + "Restoring embedded cluster operator" {} + timeout { + puts "\n\nFailed to find 'restoring embedded cluster operator' spinner." 
+ exit 1 + } +} + +expect { + -timeout 240 "Embedded cluster operator restored!" {} + timeout { + puts "\n\nFailed to restore embedded cluster operator." + exit 1 + } +} + expect { "Restoring application" {} timeout { diff --git a/e2e/scripts/single-node-airgap-install.sh b/e2e/scripts/single-node-airgap-install.sh index 3d5c2aa06..7665ba6c1 100755 --- a/e2e/scripts/single-node-airgap-install.sh +++ b/e2e/scripts/single-node-airgap-install.sh @@ -130,7 +130,7 @@ main() { additional_args="$1" echo "Running install with additional args: $additional_args" fi - if ! embedded-cluster install --no-prompt --license /tmp/license.yaml --airgap-bundle /tmp/release.airgap $additional_args 2>&1 | tee /tmp/log ; then + if ! embedded-cluster install --no-prompt --license /assets/license.yaml --airgap-bundle /assets/release.airgap $additional_args 2>&1 | tee /tmp/log ; then echo "Failed to install embedded-cluster" exit 1 fi diff --git a/e2e/scripts/single-node-install.sh b/e2e/scripts/single-node-install.sh index f222ef997..d892f1709 100755 --- a/e2e/scripts/single-node-install.sh +++ b/e2e/scripts/single-node-install.sh @@ -270,7 +270,7 @@ main() { exit 1 fi - if ! embedded-cluster install --no-prompt --license /tmp/license.yaml 2>&1 | tee /tmp/log ; then + if ! embedded-cluster install --no-prompt --license /assets/license.yaml 2>&1 | tee /tmp/log ; then echo "Failed to install embedded-cluster" kubectl get pods -A kubectl get storageclass -A diff --git a/e2e/scripts/unsupported-overrides.sh b/e2e/scripts/unsupported-overrides.sh index 450f982f3..6a310c93f 100755 --- a/e2e/scripts/unsupported-overrides.sh +++ b/e2e/scripts/unsupported-overrides.sh @@ -46,12 +46,12 @@ spec: name: embedded-cluster-operator namespace: embedded-cluster order: 2 - version: 0.34.3 + version: 0.34.9 - chartname: oci://registry.replicated.com/library/admin-console name: admin-console namespace: kotsadm order: 3 - version: 1.109.9-build.1 + version: 1.109.12 values: | isHA: false isHelmManaged: false diff --git a/e2e/scripts/vandoor-prepare.sh b/e2e/scripts/vandoor-prepare.sh index 14b40ac81..11f762148 100755 --- a/e2e/scripts/vandoor-prepare.sh +++ b/e2e/scripts/vandoor-prepare.sh @@ -6,8 +6,6 @@ main() { app_version_label="$1" local license_id= license_id="$2" - local is_airgap= - is_airgap="$3" apt-get update apt-get install curl ca-certificates -y @@ -16,8 +14,9 @@ main() { curl "https://staging.replicated.app/embedded/embedded-cluster-smoke-test-staging-app/ci/appver-${app_version_label}" -H "Authorization: ${license_id}" -o ec-release.tgz tar xzf ec-release.tgz + mkdir -p /assets mv embedded-cluster-smoke-test-staging-app /usr/local/bin/embedded-cluster - mv license.yaml /tmp/license.yaml + mv license.yaml /assets/license.yaml } main "$@" diff --git a/e2e/scripts/wait-for-ready-nodes.sh b/e2e/scripts/wait-for-ready-nodes.sh index 8edcb5c7a..f01201742 100755 --- a/e2e/scripts/wait-for-ready-nodes.sh +++ b/e2e/scripts/wait-for-ready-nodes.sh @@ -4,6 +4,8 @@ set -euox pipefail main() { expected_nodes="$1" + is_restore="${2:-}" + ready=$(kubectl get nodes | grep -v NotReady | grep -c Ready || true) counter=0 while [ "$ready" -lt "$expected_nodes" ]; do @@ -19,6 +21,11 @@ main() { done echo "All nodes are ready" + if [ "$is_restore" == "true" ]; then + # this is a restore operation where the app hasn't been restored yet, so goldpinger won't exist + exit 0 + fi + echo "checking that goldpinger has run on all nodes" kubectl get pods -n goldpinger local goldpinger_running_count= diff --git a/e2e/utils.go b/e2e/utils.go index 
6a25af33f..636990450 100644 --- a/e2e/utils.go +++ b/e2e/utils.go @@ -135,6 +135,6 @@ func findJoinCommandInOutput(stdout string) (string, error) { // root and the embedded-cluster binary is on the PATH. command := strings.TrimPrefix(r.Command, "sudo ./") // replace the airgap bundle path (if any) with the local path. - command = strings.ReplaceAll(command, "embedded-cluster.airgap", "/tmp/release.airgap") + command = strings.ReplaceAll(command, "embedded-cluster.airgap", "/assets/release.airgap") return command, nil } diff --git a/go.mod b/go.mod index edb40307c..1241b29d4 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/k0sproject/dig v0.2.0 github.com/k0sproject/k0s v1.29.6-0.20240527072442-22f6a125e881 github.com/replicatedhq/embedded-cluster-kinds v1.3.4 - github.com/replicatedhq/embedded-cluster-operator v0.34.6 + github.com/replicatedhq/embedded-cluster-operator v0.34.9 github.com/replicatedhq/embedded-cluster-utils v1.0.0 github.com/replicatedhq/kotskinds v0.0.0-20240523174825-f4d441adb453 github.com/replicatedhq/troubleshoot v0.93.1 diff --git a/go.sum b/go.sum index c48d0bb76..c2b5a903b 100644 --- a/go.sum +++ b/go.sum @@ -245,8 +245,8 @@ github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/replicatedhq/embedded-cluster-kinds v1.3.4 h1:0PFElzdpjoNEhoFL44b2U+pLxm8n2e9V6I5TY05oa4A= github.com/replicatedhq/embedded-cluster-kinds v1.3.4/go.mod h1:YognvIhVsE5CevfCU0XLTMUCIAiXhWyYhwbU0EwCnvA= -github.com/replicatedhq/embedded-cluster-operator v0.34.6 h1:KSPTUmkaIie0neeb5agDTDrwNdgYHZG6h2iNZee9b/M= -github.com/replicatedhq/embedded-cluster-operator v0.34.6/go.mod h1:att9M2KlEdxXl1OHT6f3sVSE772IJEYzooCMVjJQFTI= +github.com/replicatedhq/embedded-cluster-operator v0.34.9 h1:P8bTz3bUYEi4enN9Tymk5JMqJtYmhf9kVo2WbaZuqEg= +github.com/replicatedhq/embedded-cluster-operator v0.34.9/go.mod h1:cG+or88M0qPkl/MbR9a+1aPKSyZU0J+4jSKlaJobtFQ= github.com/replicatedhq/embedded-cluster-utils v1.0.0 h1:Axdni1nYfl5zeOP9g5U79yvN8cRdClyU6hz0wV1Hmdc= github.com/replicatedhq/embedded-cluster-utils v1.0.0/go.mod h1:4JmMC2CwMCLxq05GEW3XSPPVotqyamAF/omrbB3pH+c= github.com/replicatedhq/kotskinds v0.0.0-20240523174825-f4d441adb453 h1:g8CQQ9R4gjIdoHuBX1LN1hmF3Omq2JfA040JfpfNVC8= diff --git a/pkg/addons/adminconsole/adminconsole.go b/pkg/addons/adminconsole/adminconsole.go index 9398945c2..3b3fbc94a 100644 --- a/pkg/addons/adminconsole/adminconsole.go +++ b/pkg/addons/adminconsole/adminconsole.go @@ -9,7 +9,6 @@ import ( "regexp" "time" - "github.com/k0sproject/dig" "github.com/k0sproject/k0s/pkg/apis/k0s/v1beta1" "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" "github.com/sirupsen/logrus" @@ -65,7 +64,7 @@ var helmValues = map[string]interface{}{ "embeddedClusterVersion": defaults.Version, "labels": map[string]interface{}{ "replicated.com/disaster-recovery": "infra", - "replicated.com/disaster-recovery-chart": "kotsadm", + "replicated.com/disaster-recovery-chart": "admin-console", }, "passwordSecretRef": map[string]interface{}{ "name": "kotsadm-password", @@ -136,21 +135,6 @@ func (a *AdminConsole) HostPreflights() (*v1beta2.HostPreflightSpec, error) { return release.GetHostPreflights() } -// getPasswordFromConfig returns the adminconsole password from the provided chart config. 
-func getPasswordFromConfig(chart *v1beta1.Chart) (string, error) { - if chart.Values == "" { - return "", nil - } - values := dig.Mapping{} - if err := yaml.Unmarshal([]byte(chart.Values), &values); err != nil { - return "", fmt.Errorf("unable to unmarshal values: %w", err) - } - if password, ok := values["password"].(string); ok { - return password, nil - } - return "", nil -} - // GetCurrentChartConfig returns the current adminconsole chart config from the cluster config. func (a *AdminConsole) GetCurrentChartConfig() *v1beta1.Chart { if a.config.Spec == nil || a.config.Spec.Extensions == nil { @@ -218,7 +202,7 @@ func (a *AdminConsole) GetAdditionalImages() []string { // Outro waits for the adminconsole to be ready. func (a *AdminConsole) Outro(ctx context.Context, cli client.Client) error { loading := spinner.Start() - loading.Infof("Waiting for Admin Console to deploy") + loading.Infof("Waiting for the Admin Console to deploy") defer loading.Close() if err := createKotsPasswordSecret(ctx, cli, a.namespace, Password); err != nil { @@ -292,7 +276,7 @@ func WaitForReady(ctx context.Context, cli client.Client, ns string, writer *spi count++ } if writer != nil { - writer.Infof("Waiting for Admin Console to deploy: %d/2 ready", count) + writer.Infof("Waiting for the Admin Console to deploy: %d/2 ready", count) } return count == 2, nil }); err != nil { @@ -335,8 +319,9 @@ func createRegistrySecret(ctx context.Context, cli client.Client, namespace stri Name: "registry-creds", Namespace: namespace, Labels: map[string]string{ - "kots.io/kotsadm": "true", - "replicated.com/disaster-recovery": "infra", + "kots.io/kotsadm": "true", + "replicated.com/disaster-recovery": "infra", + "replicated.com/disaster-recovery-chart": "admin-console", }, }, StringData: map[string]string{ @@ -372,8 +357,9 @@ func createKotsPasswordSecret(ctx context.Context, cli client.Client, namespace Name: "kotsadm-password", Namespace: namespace, Labels: map[string]string{ - "kots.io/kotsadm": "true", - "replicated.com/disaster-recovery": "infra", + "kots.io/kotsadm": "true", + "replicated.com/disaster-recovery": "infra", + "replicated.com/disaster-recovery-chart": "admin-console", }, }, Data: map[string][]byte{ diff --git a/pkg/addons/applier.go b/pkg/addons/applier.go index 4ae7297e9..61063313a 100644 --- a/pkg/addons/applier.go +++ b/pkg/addons/applier.go @@ -383,11 +383,11 @@ func printKotsadmLinkMessage(licenseFile string) error { colorReset := "\033[0m" var successMessage string if license != nil { - successMessage = fmt.Sprintf("Visit the admin console to configure and install %s: %s%s%s", + successMessage = fmt.Sprintf("Visit the Admin Console to configure and install %s: %s%s%s", license.Spec.AppSlug, successColor, adminconsole.GetURL(), colorReset, ) } else { - successMessage = fmt.Sprintf("Visit the admin console to configure and install your application: %s%s%s", + successMessage = fmt.Sprintf("Visit the Admin Console to configure and install your application: %s%s%s", successColor, adminconsole.GetURL(), colorReset, ) } diff --git a/pkg/addons/seaweedfs/seaweedfs.go b/pkg/addons/seaweedfs/seaweedfs.go index 9e8c7f517..acab51bd9 100644 --- a/pkg/addons/seaweedfs/seaweedfs.go +++ b/pkg/addons/seaweedfs/seaweedfs.go @@ -3,10 +3,14 @@ package seaweedfs import ( "context" "fmt" + "time" "github.com/k0sproject/k0s/pkg/apis/k0s/v1beta1" + "github.com/replicatedhq/embedded-cluster/pkg/kubeutils" + "github.com/replicatedhq/embedded-cluster/pkg/spinner" "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" 
"gopkg.in/yaml.v2" + "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -97,6 +101,49 @@ func New(namespace string, config v1beta1.ClusterConfig, isAirgap bool) (*Seawee return &SeaweedFS{namespace: namespace, config: config, isAirgap: isAirgap}, nil } +// WaitForReady waits for SeaweedFS to be ready. +func WaitForReady(ctx context.Context, cli client.Client, ns string, writer *spinner.MessageWriter) error { + backoff := wait.Backoff{Steps: 60, Duration: 5 * time.Second, Factor: 1.0, Jitter: 0.1} + var lasterr error + if err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) { + var count int + ready, err := kubeutils.IsStatefulSetReady(ctx, cli, ns, "seaweedfs-filer") + if err != nil { + lasterr = fmt.Errorf("error checking status of seaweedfs-filer: %v", err) + return false, nil + } + if ready { + count++ + } + ready, err = kubeutils.IsStatefulSetReady(ctx, cli, ns, "seaweedfs-master") + if err != nil { + lasterr = fmt.Errorf("error checking status of seaweedfs-master: %v", err) + return false, nil + } + if ready { + count++ + } + ready, err = kubeutils.IsStatefulSetReady(ctx, cli, ns, "seaweedfs-volume") + if err != nil { + lasterr = fmt.Errorf("error checking status of seaweedfs-volume: %v", err) + return false, nil + } + if ready { + count++ + } + if writer != nil { + writer.Infof("Waiting for SeaweedFS to deploy: %d/3 ready", count) + } + return count == 3, nil + }); err != nil { + if lasterr == nil { + lasterr = err + } + return fmt.Errorf("error waiting for admin console: %v", lasterr) + } + return nil +} + func init() { helmValues = make(map[string]interface{}) if err := yaml.Unmarshal(helmValuesYAML, &helmValues); err != nil { diff --git a/pkg/addons/seaweedfs/values.yaml b/pkg/addons/seaweedfs/values.yaml index 3b6e6d671..a1e8b6781 100644 --- a/pkg/addons/seaweedfs/values.yaml +++ b/pkg/addons/seaweedfs/values.yaml @@ -19,7 +19,7 @@ master: ec.rebuild -force ec.balance -force volume.balance -force - volume.configure.replication -replication 002 -collectionPattern * + volume.configure.replication -replication 001 -collectionPattern * volume.fix.replication fs.meta.save -o filer-backup.meta fs.meta.load filer-backup.meta @@ -28,6 +28,8 @@ master: volume: # replicas must be at least replication (2) + 1 = 3 replicas: 3 + podAnnotations: + backup.velero.io/backup-volumes: data affinity: | # schedule on control-plane nodes nodeAffinity: @@ -60,17 +62,22 @@ volume: maxVolumes: 50 filer: replicas: 3 + podAnnotations: + backup.velero.io/backup-volumes: data-filer,seaweedfs-filer-log-volume data: type: "persistentVolumeClaim" # openebs-hostpath storage does not limit the size of the volume size: "1Gi" storageClass: "openebs-hostpath" logs: - hostPathPrefix: "/var/lib/embedded-cluster/seaweedfs/storage" + type: "persistentVolumeClaim" + # openebs-hostpath storage does not limit the size of the volume + size: "1Gi" + storageClass: "openebs-hostpath" s3: enabled: true enableAuth: true existingConfigSecret: secret-seaweedfs-s3 createBuckets: - name: registry - anonymousRead: false + anonymousRead: false \ No newline at end of file diff --git a/pkg/config/testdata/builtin-extensions-overrides-override-admin-console.yaml b/pkg/config/testdata/builtin-extensions-overrides-override-admin-console.yaml index aa0ed6c56..a570147f7 100644 --- a/pkg/config/testdata/builtin-extensions-overrides-override-admin-console.yaml +++ b/pkg/config/testdata/builtin-extensions-overrides-override-admin-console.yaml @@ -30,7 +30,7 @@ 
clusterConfig: | nodePort: 30000 labels: replicated.com/disaster-recovery: infra - replicated.com/disaster-recovery-chart: kotsadm + replicated.com/disaster-recovery-chart: admin-console minimalRBAC: false service: enabled: false @@ -53,7 +53,7 @@ expected: | nodePort: 40000 labels: replicated.com/disaster-recovery: infra - replicated.com/disaster-recovery-chart: kotsadm + replicated.com/disaster-recovery-chart: admin-console minimalRBAC: false service: enabled: false diff --git a/pkg/config/testdata/builtin-extensions-overrides-override-multiple-charts.yaml b/pkg/config/testdata/builtin-extensions-overrides-override-multiple-charts.yaml index 9b44bb483..b7cfc3e61 100644 --- a/pkg/config/testdata/builtin-extensions-overrides-override-multiple-charts.yaml +++ b/pkg/config/testdata/builtin-extensions-overrides-override-multiple-charts.yaml @@ -38,7 +38,7 @@ clusterConfig: | nodePort: 30000 labels: replicated.com/disaster-recovery: infra - replicated.com/disaster-recovery-chart: kotsadm + replicated.com/disaster-recovery-chart: admin-console minimalRBAC: false service: enabled: false @@ -66,7 +66,7 @@ expected: | nodePort: 30000 labels: replicated.com/disaster-recovery: infra - replicated.com/disaster-recovery-chart: kotsadm + replicated.com/disaster-recovery-chart: admin-console test: test minimalRBAC: false service: diff --git a/pkg/config/testdata/builtin-extensions-overrides-override-unknown.yaml b/pkg/config/testdata/builtin-extensions-overrides-override-unknown.yaml index 6f0fc29a8..8ac26d724 100644 --- a/pkg/config/testdata/builtin-extensions-overrides-override-unknown.yaml +++ b/pkg/config/testdata/builtin-extensions-overrides-override-unknown.yaml @@ -29,7 +29,7 @@ clusterConfig: | nodePort: 30000 labels: replicated.com/disaster-recovery: infra - replicated.com/disaster-recovery-chart: kotsadm + replicated.com/disaster-recovery-chart: admin-console minimalRBAC: false service: enabled: false @@ -57,7 +57,7 @@ expected: | nodePort: 30000 labels: replicated.com/disaster-recovery: infra - replicated.com/disaster-recovery-chart: kotsadm + replicated.com/disaster-recovery-chart: admin-console minimalRBAC: false service: enabled: false diff --git a/pkg/kotscli/kotscli.go b/pkg/kotscli/kotscli.go index 8d79596b1..14abe7e22 100644 --- a/pkg/kotscli/kotscli.go +++ b/pkg/kotscli/kotscli.go @@ -199,7 +199,7 @@ func MaskKotsOutputForAirgap() spinner.MaskFn { current = message case strings.Contains(message, "Pushing embedded cluster artifacts"): current = message - case strings.Contains(message, "Waiting for Admin Console"): + case strings.Contains(message, "Waiting for the Admin Console"): current = "Finalizing Admin Console" case strings.Contains(message, "Finished!"): current = message diff --git a/pkg/kubeutils/kubeutils.go b/pkg/kubeutils/kubeutils.go index 55f44c0d7..c89664ec2 100644 --- a/pkg/kubeutils/kubeutils.go +++ b/pkg/kubeutils/kubeutils.go @@ -10,6 +10,7 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/client" @@ -388,3 +389,16 @@ func WaitForKubernetes(ctx context.Context, cli client.Client) <-chan error { return errch } + +func NumOfControlPlaneNodes(ctx context.Context, cli client.Client) (int, error) { + opts := &client.ListOptions{ + LabelSelector: labels.SelectorFromSet( + labels.Set{"node-role.kubernetes.io/control-plane": "true"}, + ), + } + var nodes corev1.NodeList + if 
err := cli.List(ctx, &nodes, opts); err != nil { + return 0, err + } + return len(nodes.Items), nil +} From c5e0dd312da64bcbfd8b4d7b7216d501a0e465d4 Mon Sep 17 00:00:00 2001 From: Salah Al Saleh Date: Tue, 11 Jun 2024 08:16:30 -0700 Subject: [PATCH 5/5] Pass is-large-runner to release-dev workflow (#695) --- .github/workflows/release-dev.yaml | 1 + e2e/restore_test.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release-dev.yaml b/.github/workflows/release-dev.yaml index f2ec529a7..ba60c52d9 100644 --- a/.github/workflows/release-dev.yaml +++ b/.github/workflows/release-dev.yaml @@ -167,6 +167,7 @@ jobs: - uses: ./.github/actions/e2e with: test-name: '${{ matrix.test }}' + is-large-runner: ${{ matrix.runner == 'embedded-cluster' }} airgap-license-id: ${{ secrets.STAGING_EMBEDDED_CLUSTER_AIRGAP_LICENSE_ID }} snapshot-license-id: ${{ secrets.STAGING_EMBEDDED_CLUSTER_SNAPSHOT_LICENSE_ID }} snapshot-license: ${{ secrets.STAGING_EMBEDDED_CLUSTER_SNAPSHOT_LICENSE }} diff --git a/e2e/restore_test.go b/e2e/restore_test.go index 7c3490ad0..b6885e224 100644 --- a/e2e/restore_test.go +++ b/e2e/restore_test.go @@ -500,7 +500,7 @@ func TestMultiNodeAirgapHADisasterRecovery(t *testing.T) { "DR_AWS_S3_ENDPOINT", "DR_AWS_S3_REGION", "DR_AWS_S3_BUCKET", - "DR_AWS_S3_PREFIX", + "DR_AWS_S3_PREFIX_AIRGAP", "DR_AWS_ACCESS_KEY_ID", "DR_AWS_SECRET_ACCESS_KEY", }
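For reference, below is a minimal, self-contained sketch of the control-plane counting technique that the NumOfControlPlaneNodes helper above implements: listing nodes by the node-role.kubernetes.io/control-plane label through a controller-runtime client. The kubeconfig-based client construction and the main wrapper are illustrative assumptions, not part of this patch series.

package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/labels"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/config"
)

// countControlPlaneNodes mirrors the helper added to pkg/kubeutils: it lists
// only the nodes carrying the control-plane role label and returns the count.
func countControlPlaneNodes(ctx context.Context, cli client.Client) (int, error) {
	opts := &client.ListOptions{
		LabelSelector: labels.SelectorFromSet(
			labels.Set{"node-role.kubernetes.io/control-plane": "true"},
		),
	}
	var nodes corev1.NodeList
	if err := cli.List(ctx, &nodes, opts); err != nil {
		return 0, err
	}
	return len(nodes.Items), nil
}

func main() {
	// assumption: a kubeconfig is discoverable via the usual rules (KUBECONFIG,
	// in-cluster config, or ~/.kube/config).
	cfg, err := config.GetConfig()
	if err != nil {
		panic(err)
	}
	cli, err := client.New(cfg, client.Options{})
	if err != nil {
		panic(err)
	}
	count, err := countControlPlaneNodes(context.Background(), cli)
	if err != nil {
		panic(err)
	}
	// a caller could, for example, warn before a reset would drop an HA
	// cluster below three controllers.
	fmt.Printf("control plane nodes: %d\n", count)
}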