Skip to content
This repository is currently being migrated. It's locked while the migration is in progress.

Commit

Permalink
Work on the move command (#222)
Browse files Browse the repository at this point in the history
Overhaul many of the user facing messages to improve readability and visibility of what's happening during the process.
Now also starts by reporting what type of scenario was detected to be running, I.E. Moving master, Moving replica or Promoting master.

Improved code comments in some places.

Added more information to the command description.

Removed the node IDs from messages due to not adding much. These are internal IDs known to CP, not k8s IDs the user can refer to.

Changed replica sync message to format the bytes into a human readable unit first (i.e. GiB, MiB, etc).
These updates now happen every 8sec (+3 sec) as these were happening slightly too often.

Force a timeout of 30 minutes when the existing one is smaller than that value. This happens only when a new replica has to be created.

Removed unused variable (timeout) from struct that holds all the vars for the move cmd.
  • Loading branch information
Ricardo-Osorio committed Jan 23, 2023
1 parent 6c877ad commit fb29e14
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 20 deletions.
74 changes: 57 additions & 17 deletions cmd/update/volume_move.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ type volumeMoveCommand struct {
// inputs
namespace string
volumeID string
timeout time.Duration
fromNode string
toNode string

Expand All @@ -48,7 +47,6 @@ func (c *volumeMoveCommand) runWithCtx(ctx context.Context, cmd *cobra.Command,
return err
}

// fetch IDs
nsID := id.Namespace(c.namespace)

if !useIDs {
Expand All @@ -71,7 +69,6 @@ func (c *volumeMoveCommand) runWithCtx(ctx context.Context, cmd *cobra.Command,
c.toNode = toNode.ID.String()
}

// fetch volume
vol, err := c.getVolume(ctx, nsID)
if err != nil {
return err
Expand Down Expand Up @@ -108,14 +105,43 @@ func (c *volumeMoveCommand) runWithCtx(ctx context.Context, cmd *cobra.Command,
}

if !isMovingMaster && hasReplicaOnTargetNode {
c.display.TextMessage(ctx, c.writer, "volume already has a replica on both [source] and [destination] nodes, nothing to do")
c.display.TextMessage(ctx, c.writer, "Volume already has a replica on both [source] and [destination] nodes, nothing to do")
return nil
}

// after we know a valid scenario has been triggered we return what will
// happen back to the user in a user-friendly msg
if isMovingMaster {
if hasReplicaOnTargetNode {
c.display.TextMessage(ctx, c.writer, "Performing action: promoting replica")
} else {
c.display.TextMessage(ctx, c.writer, "Performing action: moving master")
}
} else {
c.display.TextMessage(ctx, c.writer, "Performing action: moving replica")
}

originalReplicaCount := len(vol.Replicas)

// create a new replica on the [destination] node if there isn't one yet
if !hasReplicaOnTargetNode {
c.display.TextMessage(ctx, c.writer, `Creating a new replica on [destination] node.
The duration of syncing operations depend on the rate of data transfers and network bandwidth.
A default timeout of 30min is in place but it should be adjusted with the global --timeout flag to adapt to both volume and environment.
`)

// when we need to sync a new replica, force the use of a timeout with
// a minimum of 30 minutes (if not already set) to avoid unecessary
// errors due to not using a long enough timeout.
//
// most users will be moving pretty large volumes of data anyways.
if deadline, _ := ctx.Deadline(); time.Until(deadline) < time.Minute*30 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(context.Background(), time.Minute*30)
defer cancel()
}

params := &apiclient.AddDeploymentOnNodeRequestParams{
NodeID: id.Node(c.toNode),
}
Expand All @@ -133,17 +159,21 @@ func (c *volumeMoveCommand) runWithCtx(ctx context.Context, cmd *cobra.Command,
// Failing to do so is not a big problem as api-manager will eventually
// sync the labels and bring the number down to match the pvc but it may
// take a while to act (up to 1h).
//
// function defers the variable values from when created and not when it
// gets called thus timeout, original replica count, etc need to be
// up-to-date here or obtained from the API when needed.
defer func() {
updatedVol, err := c.client.GetVolume(ctx, nsID, vol.ID)
updatedVol, err := c.getVolume(ctx, nsID)
if err != nil {
c.display.TextMessage(ctx, c.writer, "failed to get updated volume config")
c.display.TextMessage(ctx, c.writer, fmt.Sprintf("Failed to restore original replica count: failed to get updated volume config: %s", err.Error()))
return
}

currentReplicaCount := len(updatedVol.Replicas)

if currentReplicaCount-originalReplicaCount != 1 {
c.display.TextMessage(ctx, c.writer, "replica count of volume has been changed outside of move operation")
c.display.TextMessage(ctx, c.writer, "Replica count has been changed outside of move operation, aborting automatic deletion. Volume replicas needs to be managed manually.")
return
}

Expand All @@ -153,20 +183,23 @@ func (c *volumeMoveCommand) runWithCtx(ctx context.Context, cmd *cobra.Command,
}
err = c.client.SetReplicas(ctx, nsID, vol.ID, uint64(originalReplicaCount), setReplicasParam)
if err != nil {
c.display.TextMessage(ctx, c.writer, "failed to restore original replica count")
c.display.TextMessage(ctx, c.writer, fmt.Sprintf("Failed to restore original replica count: %s. Can be addressed manually or wait for it to be re-synced with PVC.", err.Error()))
}

c.display.TextMessage(ctx, c.writer, "Restored volume original replica count.")
}()

// wait for the a new synced replica on [destination] node
c.display.TextMessage(ctx, c.writer, "Replica syncing...")
err = c.waitForSyncedReplicaOnNode(ctx, nsID)
if err != nil {
return err
}

c.display.TextMessage(ctx, c.writer, fmt.Sprintf("replica on [destination] node %s fully synced", c.toNode))
c.display.TextMessage(ctx, c.writer, "Replica on [destination] node fully synced")
}

// moving a replica is done here, only preced if we are handling a master
// moving a replica is done here
if !isMovingMaster {
return nil
}
Expand All @@ -177,7 +210,7 @@ func (c *volumeMoveCommand) runWithCtx(ctx context.Context, cmd *cobra.Command,
return fmt.Errorf("failed to get updated volume config: %s", err)
}

// find the deployment IDs of the newly created replica and the master being moved
// find the deployment ID of the newly added replica
var newReplicaID id.Deployment
for _, rep := range vol.Replicas {
if rep.Node.String() == c.toNode {
Expand All @@ -187,17 +220,18 @@ func (c *volumeMoveCommand) runWithCtx(ctx context.Context, cmd *cobra.Command,

err = c.client.AttemptPromotion(ctx, string(nsID), vol.ID.String(), newReplicaID.String())
if err != nil {
c.display.TextMessage(ctx, c.writer, "Failed to promote the new replica on [destination] node")
return err
}

c.display.TextMessage(ctx, c.writer, fmt.Sprintf("replica on [destination] node %s attempting promotion to master", c.toNode))
c.display.TextMessage(ctx, c.writer, "Replica on [destination] node attempting promotion to master")

err = c.waitForMasterPromote(ctx, nsID)
if err != nil {
return err
}

c.display.TextMessage(ctx, c.writer, fmt.Sprintf("replica on [destination] node %s successfully promoted to master", c.toNode))
c.display.TextMessage(ctx, c.writer, "Replica on [destination] node has now become the master")
return nil
}

Expand All @@ -207,7 +241,7 @@ func (c *volumeMoveCommand) waitForSyncedReplicaOnNode(ctx context.Context, nsID
case <-ctx.Done():
return fmt.Errorf("timed out while watching for desired state but the operation may still be running (server side) - %w", ctx.Err())

case <-time.After(time.Second * 5):
case <-time.After(time.Second * 8):
vol, err := c.getVolume(ctx, nsID)
if err != nil {
return fmt.Errorf("failed to get updated volume config from api: %s", err)
Expand Down Expand Up @@ -276,10 +310,16 @@ func newMoveCmd(w io.Writer, client interfaces.Client, config interfaces.ConfigP

cobraCommand := &cobra.Command{
Use: "move [volume name] [source node] [destination node]",
Short: "Updates a volume's placement",
Short: "Moves volume instances between nodes",
Long: `Moves a volume instance, primary of replica, between two nodes.
When moving a primary instance into a node that already hosts one of the replicas, it will instead try to move the roles, promoting the replica to be the new primary instance.
Moving a replica into a primary instance or swapping two replicas is not allowed.
`,
Example: `
+$ storageos update volume move my-volume-name source-node-id destination-node-id --namespace my-namespace-name --timeout 10m
+$ storageos update volume move pvc-6d531e0e-fc5a-4b35-b752-bf75fc5dc690 38b7f402-c602-49f9-96b7-bc213d02a737 d4e910ac-394b-4fdb-be51-27822c15b682 --namespace c65e6116-ed59-4869-bf97-4fef5b42ac8e --timeout 30m
+$ storageos update volume move my-volume-name source-node-id destination-node-id --namespace my-namespace-name --timeout 45m
+$ storageos update volume move pvc-6d531e0e-fc5a-4b35-b752-bf75fc5dc690 38b7f402-c602-49f9-96b7-bc213d02a737 d4e910ac-394b-4fdb-be51-27822c15b682 --namespace c65e6116-ed59-4869-bf97-4fef5b42ac8e --timeout 1h
`,
Args: argwrappers.WrapInvalidArgsError(func(cmd *cobra.Command, args []string) error {
if len(args) != 3 {
Expand Down
9 changes: 6 additions & 3 deletions output/textformat/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,13 @@ func (d *Displayer) SetFailureMode(ctx context.Context, w io.Writer, updatedVol
}

// ReplicaSyncProgress prints a user friendly message showing the sync progress
// of a deployment. Meant to be used as a standalone and repeated message thus
// any metadata about the deployment should come beforehand on its own message
// of a deployment. Meant to be used as a standalone message that keeps being
// repeated giving the user insight about the state of the operation thus any
// metadata about the deployment or process where it is being executed should be
// on its own message.
func (d *Displayer) ReplicaSyncProgress(ctx context.Context, w io.Writer, syncProgress model.SyncProgress) error {
if _, err := fmt.Fprintf(w, "\tProgress (bytes left to sync): %d\n", syncProgress.BytesRemaining); err != nil {
bytesRemainingHumanFormat := size.Format(syncProgress.BytesRemaining)
if _, err := fmt.Fprintf(w, "\tThere are %s of data left to sync\n", bytesRemainingHumanFormat); err != nil {
return err
}
return nil
Expand Down

0 comments on commit fb29e14

Please sign in to comment.