Skip to content

Commit

Permalink
Trid 11175 delay clone split deletesnap
Browse files Browse the repository at this point in the history
* Delay clone split during 'snapshot delete' workflow
Co-authored-by: Clinton Knight <[email protected]>
  • Loading branch information
mravi-na authored Jan 29, 2024
1 parent f1d7e12 commit 769863e
Show file tree
Hide file tree
Showing 12 changed files with 194 additions and 11 deletions.
78 changes: 78 additions & 0 deletions storage_drivers/ontap/ontap_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -1223,6 +1223,7 @@ const (
DefaultNfsMountOptionsDocker = "-o nfsvers=3"
DefaultNfsMountOptionsKubernetes = ""
DefaultSplitOnClone = "false"
DefaultCloneSplitDelay = 10
DefaultLuksEncryption = "false"
DefaultMirroring = "false"
DefaultLimitAggregateUsage = ""
Expand Down Expand Up @@ -1313,6 +1314,12 @@ func PopulateConfigurationDefaults(ctx context.Context, config *drivers.OntapSto
}
}

if config.CloneSplitDelay == "" {
config.CloneSplitDelay = strconv.FormatInt(DefaultCloneSplitDelay, 10)
} else if v, err := strconv.ParseInt(config.CloneSplitDelay, 10, 0); err != nil || v <= 0 {
return fmt.Errorf("invalid value for cloneSplitDelay: %v", config.CloneSplitDelay)
}

if config.FileSystemType == "" {
config.FileSystemType = drivers.DefaultFileSystemType
}
Expand Down Expand Up @@ -1389,6 +1396,7 @@ func PopulateConfigurationDefaults(ctx context.Context, config *drivers.OntapSto
"SecurityStyle": config.SecurityStyle,
"NfsMountOptions": config.NfsMountOptions,
"SplitOnClone": config.SplitOnClone,
"CloneSplitDelay": config.CloneSplitDelay,
"FileSystemType": config.FileSystemType,
"Encryption": config.Encryption,
"LUKSEncryption": config.LUKSEncryption,
Expand Down Expand Up @@ -1636,6 +1644,76 @@ func SplitVolumeFromBusySnapshot(
return nil
}

func SplitVolumeFromBusySnapshotWithDelay(
ctx context.Context, snapConfig *storage.SnapshotConfig, config *drivers.OntapStorageDriverConfig,
client api.OntapAPI, cloneSplitStart func(ctx context.Context, cloneName string) error,
cloneSplitTimers map[string]time.Time,
) {
snapshotID := snapConfig.ID()

// Find the configured value for cloneSplitDelay
delay, err := strconv.ParseInt(config.CloneSplitDelay, 10, 0)
if err != nil || delay < 0 {
// Should not come here, but just in case.
delay = DefaultCloneSplitDelay
}
cloneSplitDelay := time.Duration(delay) * time.Second

// If this is the first delete, just log the time and return.
firstDeleteTime, ok := cloneSplitTimers[snapshotID]
if !ok {
cloneSplitTimers[snapshotID] = time.Now()

Logc(ctx).WithFields(LogFields{
"snapshot": snapshotID,
"secondsBeforeSplit": fmt.Sprintf("%3.2f", cloneSplitDelay.Seconds()),
}).Warning("Initial locked snapshot delete, starting clone split timer.")

return
}

// This isn't the first delete, and the split is still running, so there is nothing to do.
if time.Now().Sub(firstDeleteTime) < 0 {

Logc(ctx).WithFields(LogFields{
"snapshot": snapshotID,
}).Warning("Retried locked snapshot delete, clone split still running.")

return
}

// This isn't the first delete, and the delay has not expired, so there is nothing to do.
if time.Now().Sub(firstDeleteTime) < cloneSplitDelay {

Logc(ctx).WithFields(LogFields{
"snapshot": snapshotID,
"secondsBeforeSplit": fmt.Sprintf("%3.2f", time.Now().Sub(firstDeleteTime).Seconds()),
}).Warning("Retried locked snapshot delete, clone split timer not yet expired.")

return
}

// The delay has expired, so start the split
splitErr := SplitVolumeFromBusySnapshot(ctx, snapConfig, config, client, cloneSplitStart)
if splitErr != nil {

// The split start failed, so reset the timer so we start again after another brief delay.
cloneSplitTimers[snapshotID] = time.Now()

Logc(ctx).WithFields(LogFields{
"snapshot": snapshotID,
"secondsBeforeSplit": fmt.Sprintf("%3.2f", cloneSplitDelay.Seconds()),
}).Warning("Retried locked snapshot delete, clone split failed, restarted timer.")

} else {

// The split start succeeded, so add enough time to the timer that we don't try to start it again.
cloneSplitTimers[snapshotID] = time.Now().Add(1 * time.Hour)

Logc(ctx).WithField("snapshot", snapshotID).Warning("Retried locked snapshot delete, clone split started.")
}
}

// getVolumeExternal is a method that accepts info about a volume
// as returned by the storage backend and formats it as a VolumeExternal
// object.
Expand Down
42 changes: 42 additions & 0 deletions storage_drivers/ontap/ontap_common_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4734,6 +4734,42 @@ func TestSplitVolumeFromBusySnapshot(t *testing.T) {
assert.NoError(t, err)
}

func TestSplitVolumeFromBusySnapshotWithDelay(t *testing.T) {
ctx := context.Background()
mockCtrl := gomock.NewController(t)
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
commonConfig := &drivers.CommonStorageDriverConfig{
DebugTraceFlags: map[string]bool{"method": true},
}
config := &drivers.OntapStorageDriverConfig{
CommonStorageDriverConfig: commonConfig,
}
snapConfig := &storage.SnapshotConfig{
VolumeInternalName: "fakeVolInternalName",
InternalName: "fakeInternalName",
}

// Test 1: No snap found
cloneSplitTimers := make(map[string]time.Time)
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers)

// Test 2: time difference is < 0
cloneSplitTimers[snapConfig.ID()] = time.Now().Add(10 * time.Millisecond)
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers)

time.Sleep(50 * time.Millisecond)
// Test 3: time difference < config.CloneSplitDelay, which is defaulted to 10 seconds.
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers)

// Test 4: SplitVolumeFromBusySnapshot returning error
// Add the time for first delete in past so that SplitVolume is called.
cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second)
mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, snapConfig.InternalName,
snapConfig.VolumeInternalName).Return(api.VolumeNameList{},
fmt.Errorf("error returned by VolumeListBySnapshotParent"))
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers)
}

func TestGetVolumeExternalCommon(t *testing.T) {
volume := api.Volume{
Name: "fakeVolume",
Expand Down Expand Up @@ -5122,6 +5158,12 @@ func TestPopulateConfigurationDefaults(t *testing.T) {
err = PopulateConfigurationDefaults(ctx, config)

assert.Error(t, err)

// Test 8 - Invalid value for cloneSplitDelay
config.SplitOnClone = "true"
config.CloneSplitDelay = "-123"
err = PopulateConfigurationDefaults(ctx, config)
assert.Error(t, err)
}

func TestNewOntapTelemetry(t *testing.T) {
Expand Down
14 changes: 12 additions & 2 deletions storage_drivers/ontap/ontap_nas.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ type NASStorageDriver struct {

physicalPools map[string]storage.Pool
virtualPools map[string]storage.Pool

cloneSplitTimers map[string]time.Time
}

func (d *NASStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig {
Expand Down Expand Up @@ -146,6 +148,9 @@ func (d *NASStorageDriver) Initialize(
d.telemetry.TridentBackendUUID = backendUUID
d.telemetry.Start(ctx)

// Set up the clone split timers
d.cloneSplitTimers = make(map[string]time.Time)

d.initialized = true
return nil
}
Expand Down Expand Up @@ -926,12 +931,17 @@ func (d *NASStorageDriver) DeleteSnapshot(
if err := d.API.VolumeSnapshotDelete(ctx, snapConfig.InternalName, snapConfig.VolumeInternalName); err != nil {
if api.IsSnapshotBusyError(err) {
// Start a split here before returning the error so a subsequent delete attempt may succeed.
_ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart)
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API,
d.API.VolumeCloneSplitStart, d.cloneSplitTimers)
}
// we must return the err, even if we started a split, so the snapshot delete is retried

// We must return the error, even if we started a split, so the snapshot delete is retried.
return err
}

// Clean up any split timer
delete(d.cloneSplitTimers, snapConfig.ID())

Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.")
return nil
}
Expand Down
15 changes: 12 additions & 3 deletions storage_drivers/ontap/ontap_nas_flexgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ type NASFlexGroupStorageDriver struct {
physicalPool storage.Pool
virtualPools map[string]storage.Pool
timeout time.Duration

cloneSplitTimers map[string]time.Time
}

func (d *NASFlexGroupStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig {
Expand Down Expand Up @@ -132,6 +134,9 @@ func (d *NASFlexGroupStorageDriver) Initialize(
d.telemetry.TridentBackendUUID = backendUUID
d.telemetry.Start(ctx)

// Set up the clone split timers
d.cloneSplitTimers = make(map[string]time.Time)

d.initialized = true
return nil
}
Expand Down Expand Up @@ -1309,13 +1314,17 @@ func (d *NASFlexGroupStorageDriver) DeleteSnapshot(
if err := d.API.FlexgroupSnapshotDelete(ctx, snapConfig.InternalName, snapConfig.VolumeInternalName); err != nil {
if api.IsSnapshotBusyError(err) {
// Start a split here before returning the error so a subsequent delete attempt may succeed.
_ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API,
d.API.FlexgroupCloneSplitStart)
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API,
d.API.FlexgroupCloneSplitStart, d.cloneSplitTimers)
}
// we must return the err, even if we started a split, so the snapshot delete is retried

// We must return the error, even if we started a split, so the snapshot delete is retried.
return err
}

// Clean up any split timer
delete(d.cloneSplitTimers, snapConfig.ID())

Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.")
return nil
}
Expand Down
3 changes: 3 additions & 0 deletions storage_drivers/ontap/ontap_nas_flexgroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2724,6 +2724,9 @@ func TestOntapNasFlexgroupStorageDriverVolumeDeleteSnapshot_Failure(t *testing.T
mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, "snap1", "vol1").Return(childVols, nil)
mockAPI.EXPECT().FlexgroupCloneSplitStart(ctx, "vol1").Return(nil)

driver.cloneSplitTimers = make(map[string]time.Time)
// Use DefaultCloneSplitDelay to set time to past. It is defaulted to 10 seconds.
driver.cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second)
result := driver.DeleteSnapshot(ctx, snapConfig, volConfig)

assert.Error(t, result, "FlexgroupSnapshotDelete sucessfully executed")
Expand Down
14 changes: 12 additions & 2 deletions storage_drivers/ontap/ontap_nas_qtree.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ type NASQtreeStorageDriver struct {

physicalPools map[string]storage.Pool
virtualPools map[string]storage.Pool

cloneSplitTimers map[string]time.Time
}

func (d *NASQtreeStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig {
Expand Down Expand Up @@ -220,6 +222,9 @@ func (d *NASQtreeStorageDriver) Initialize(
d.telemetry.TridentBackendUUID = backendUUID
d.telemetry.Start(ctx)

// Set up the clone split timers.
d.cloneSplitTimers = make(map[string]time.Time)

d.initialized = true
return nil
}
Expand Down Expand Up @@ -969,12 +974,17 @@ func (d *NASQtreeStorageDriver) DeleteSnapshot(
if err = d.API.VolumeSnapshotDelete(ctx, snapConfig.InternalName, flexvol); err != nil {
if api.IsSnapshotBusyError(err) {
// Start a split here before returning the error so a subsequent delete attempt may succeed.
_ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart)
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API,
d.API.VolumeCloneSplitStart, d.cloneSplitTimers)
}
// We must return the err, even if we started a split, so the snapshot delete is retried

// We must return the error, even if we started a split, so the snapshot delete is retried.
return err
}

// Clean up any split timer.
delete(d.cloneSplitTimers, snapConfig.ID())

Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.")
return nil
}
Expand Down
3 changes: 3 additions & 0 deletions storage_drivers/ontap/ontap_nas_qtree_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4293,6 +4293,9 @@ func TestDeleteSnapshot_FailureSnapshotBusy(t *testing.T) {
mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, "snap1", flexvol).Return(childVols, nil)
mockAPI.EXPECT().VolumeCloneSplitStart(ctx, flexvol).Return(nil)

driver.cloneSplitTimers = make(map[string]time.Time)
// Use DefaultCloneSplitDelay to set time to past. It is defaulted to 10 seconds.
driver.cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second)
result := driver.DeleteSnapshot(ctx, snapConfig, volConfig)

assert.NotNil(t, result, "result is nil")
Expand Down
3 changes: 3 additions & 0 deletions storage_drivers/ontap/ontap_nas_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1561,6 +1561,9 @@ func TestOntapNasStorageDriverVolumeDeleteSnapshot_Failure(t *testing.T) {
mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, "snap1", "vol1").Return(childVols, nil)
mockAPI.EXPECT().VolumeCloneSplitStart(ctx, "vol1").Return(nil)

driver.cloneSplitTimers = make(map[string]time.Time)
// Use DefaultCloneSplitDelay to set time to past. It is defaulted to 10 seconds.
driver.cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second)
result := driver.DeleteSnapshot(ctx, snapConfig, volConfig)

assert.Error(t, result)
Expand Down
14 changes: 12 additions & 2 deletions storage_drivers/ontap/ontap_san.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ type SANStorageDriver struct {

physicalPools map[string]storage.Pool
virtualPools map[string]storage.Pool

cloneSplitTimers map[string]time.Time
}

func (d *SANStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig {
Expand Down Expand Up @@ -153,6 +155,9 @@ func (d *SANStorageDriver) Initialize(
d.telemetry.TridentBackendUUID = backendUUID
d.telemetry.Start(ctx)

// Set up the clone split timers
d.cloneSplitTimers = make(map[string]time.Time)

d.initialized = true
return nil
}
Expand Down Expand Up @@ -959,12 +964,17 @@ func (d *SANStorageDriver) DeleteSnapshot(
if err != nil {
if api.IsSnapshotBusyError(err) {
// Start a split here before returning the error so a subsequent delete attempt may succeed.
_ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart)
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API,
d.API.VolumeCloneSplitStart, d.cloneSplitTimers)
}
// we must return the err, even if we started a split, so the snapshot delete is retried

// We must return the error, even if we started a split, so the snapshot delete is retried.
return err
}

// Clean up any split timer
delete(d.cloneSplitTimers, snapConfig.ID())

Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.")
return nil
}
Expand Down
15 changes: 13 additions & 2 deletions storage_drivers/ontap/ontap_san_nvme.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"regexp"
"strconv"
"strings"
"time"

"github.com/RoaringBitmap/roaring"

Expand Down Expand Up @@ -40,6 +41,8 @@ type NVMeStorageDriver struct {

physicalPools map[string]storage.Pool
virtualPools map[string]storage.Pool

cloneSplitTimers map[string]time.Time
}

const (
Expand Down Expand Up @@ -170,6 +173,9 @@ func (d *NVMeStorageDriver) Initialize(
d.telemetry.TridentBackendUUID = backendUUID
d.telemetry.Start(ctx)

// Set up the clone split timers
d.cloneSplitTimers = make(map[string]time.Time)

d.initialized = true
return nil
}
Expand Down Expand Up @@ -938,12 +944,17 @@ func (d *NVMeStorageDriver) DeleteSnapshot(
if err != nil {
if api.IsSnapshotBusyError(err) {
// Start a split here before returning the error so a subsequent delete attempt may succeed.
_ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart)
SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API,
d.API.VolumeCloneSplitStart, d.cloneSplitTimers)
}
// we must return the err, even if we started a split, so the snapshot delete is retried

// we must return the error, even if we started a split, so the snapshot delete is retried.
return err
}

// Clean up any split timer.
delete(d.cloneSplitTimers, snapConfig.ID())

Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.")
return nil
}
Expand Down
Loading

0 comments on commit 769863e

Please sign in to comment.