diff --git a/storage_drivers/ontap/ontap_common.go b/storage_drivers/ontap/ontap_common.go index 9efffbf7b..26f5fd1c3 100644 --- a/storage_drivers/ontap/ontap_common.go +++ b/storage_drivers/ontap/ontap_common.go @@ -1223,6 +1223,7 @@ const ( DefaultNfsMountOptionsDocker = "-o nfsvers=3" DefaultNfsMountOptionsKubernetes = "" DefaultSplitOnClone = "false" + DefaultCloneSplitDelay = 10 DefaultLuksEncryption = "false" DefaultMirroring = "false" DefaultLimitAggregateUsage = "" @@ -1313,6 +1314,12 @@ func PopulateConfigurationDefaults(ctx context.Context, config *drivers.OntapSto } } + if config.CloneSplitDelay == "" { + config.CloneSplitDelay = strconv.FormatInt(DefaultCloneSplitDelay, 10) + } else if v, err := strconv.ParseInt(config.CloneSplitDelay, 10, 0); err != nil || v <= 0 { + return fmt.Errorf("invalid value for cloneSplitDelay: %v", config.CloneSplitDelay) + } + if config.FileSystemType == "" { config.FileSystemType = drivers.DefaultFileSystemType } @@ -1389,6 +1396,7 @@ func PopulateConfigurationDefaults(ctx context.Context, config *drivers.OntapSto "SecurityStyle": config.SecurityStyle, "NfsMountOptions": config.NfsMountOptions, "SplitOnClone": config.SplitOnClone, + "CloneSplitDelay": config.CloneSplitDelay, "FileSystemType": config.FileSystemType, "Encryption": config.Encryption, "LUKSEncryption": config.LUKSEncryption, @@ -1636,6 +1644,76 @@ func SplitVolumeFromBusySnapshot( return nil } +func SplitVolumeFromBusySnapshotWithDelay( + ctx context.Context, snapConfig *storage.SnapshotConfig, config *drivers.OntapStorageDriverConfig, + client api.OntapAPI, cloneSplitStart func(ctx context.Context, cloneName string) error, + cloneSplitTimers map[string]time.Time, +) { + snapshotID := snapConfig.ID() + + // Find the configured value for cloneSplitDelay + delay, err := strconv.ParseInt(config.CloneSplitDelay, 10, 0) + if err != nil || delay < 0 { + // Should not come here, but just in case. 
+ delay = DefaultCloneSplitDelay + } + cloneSplitDelay := time.Duration(delay) * time.Second + + // If this is the first delete, just log the time and return. + firstDeleteTime, ok := cloneSplitTimers[snapshotID] + if !ok { + cloneSplitTimers[snapshotID] = time.Now() + + Logc(ctx).WithFields(LogFields{ + "snapshot": snapshotID, + "secondsBeforeSplit": fmt.Sprintf("%3.2f", cloneSplitDelay.Seconds()), + }).Warning("Initial locked snapshot delete, starting clone split timer.") + + return + } + + // This isn't the first delete, and the split is still running, so there is nothing to do. + if time.Now().Sub(firstDeleteTime) < 0 { + + Logc(ctx).WithFields(LogFields{ + "snapshot": snapshotID, + }).Warning("Retried locked snapshot delete, clone split still running.") + + return + } + + // This isn't the first delete, and the delay has not expired, so there is nothing to do. + if time.Now().Sub(firstDeleteTime) < cloneSplitDelay { + + Logc(ctx).WithFields(LogFields{ + "snapshot": snapshotID, + "secondsBeforeSplit": fmt.Sprintf("%3.2f", time.Now().Sub(firstDeleteTime).Seconds()), + }).Warning("Retried locked snapshot delete, clone split timer not yet expired.") + + return + } + + // The delay has expired, so start the split + splitErr := SplitVolumeFromBusySnapshot(ctx, snapConfig, config, client, cloneSplitStart) + if splitErr != nil { + + // The split start failed, so reset the timer so we start again after another brief delay. + cloneSplitTimers[snapshotID] = time.Now() + + Logc(ctx).WithFields(LogFields{ + "snapshot": snapshotID, + "secondsBeforeSplit": fmt.Sprintf("%3.2f", cloneSplitDelay.Seconds()), + }).Warning("Retried locked snapshot delete, clone split failed, restarted timer.") + + } else { + + // The split start succeeded, so add enough time to the timer that we don't try to start it again. 
+ cloneSplitTimers[snapshotID] = time.Now().Add(1 * time.Hour) + + Logc(ctx).WithField("snapshot", snapshotID).Warning("Retried locked snapshot delete, clone split started.") + } +} + // getVolumeExternal is a method that accepts info about a volume // as returned by the storage backend and formats it as a VolumeExternal // object. diff --git a/storage_drivers/ontap/ontap_common_test.go b/storage_drivers/ontap/ontap_common_test.go index 3402e8694..4db0624fd 100644 --- a/storage_drivers/ontap/ontap_common_test.go +++ b/storage_drivers/ontap/ontap_common_test.go @@ -4734,6 +4734,42 @@ func TestSplitVolumeFromBusySnapshot(t *testing.T) { assert.NoError(t, err) } +func TestSplitVolumeFromBusySnapshotWithDelay(t *testing.T) { + ctx := context.Background() + mockCtrl := gomock.NewController(t) + mockAPI := mockapi.NewMockOntapAPI(mockCtrl) + commonConfig := &drivers.CommonStorageDriverConfig{ + DebugTraceFlags: map[string]bool{"method": true}, + } + config := &drivers.OntapStorageDriverConfig{ + CommonStorageDriverConfig: commonConfig, + } + snapConfig := &storage.SnapshotConfig{ + VolumeInternalName: "fakeVolInternalName", + InternalName: "fakeInternalName", + } + + // Test 1: No snap found + cloneSplitTimers := make(map[string]time.Time) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers) + + // Test 2: time difference is < 0 + cloneSplitTimers[snapConfig.ID()] = time.Now().Add(10 * time.Millisecond) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers) + + time.Sleep(50 * time.Millisecond) + // Test 3: time difference < config.CloneSplitDelay, which is defaulted to 10 seconds. + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers) + + // Test 4: SplitVolumeFromBusySnapshot returning error + // Add the time for first delete in past so that SplitVolume is called. 
+ cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second) + mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, snapConfig.InternalName, + snapConfig.VolumeInternalName).Return(api.VolumeNameList{}, + fmt.Errorf("error returned by VolumeListBySnapshotParent")) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, config, mockAPI, mockCloneSplitStart, cloneSplitTimers) +} + func TestGetVolumeExternalCommon(t *testing.T) { volume := api.Volume{ Name: "fakeVolume", @@ -5122,6 +5158,12 @@ func TestPopulateConfigurationDefaults(t *testing.T) { err = PopulateConfigurationDefaults(ctx, config) assert.Error(t, err) + + // Test 8 - Invalid value for cloneSplitDelay + config.SplitOnClone = "true" + config.CloneSplitDelay = "-123" + err = PopulateConfigurationDefaults(ctx, config) + assert.Error(t, err) } func TestNewOntapTelemetry(t *testing.T) { diff --git a/storage_drivers/ontap/ontap_nas.go b/storage_drivers/ontap/ontap_nas.go index cb944e923..5f08cb4ed 100644 --- a/storage_drivers/ontap/ontap_nas.go +++ b/storage_drivers/ontap/ontap_nas.go @@ -53,6 +53,8 @@ type NASStorageDriver struct { physicalPools map[string]storage.Pool virtualPools map[string]storage.Pool + + cloneSplitTimers map[string]time.Time } func (d *NASStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig { @@ -146,6 +148,9 @@ func (d *NASStorageDriver) Initialize( d.telemetry.TridentBackendUUID = backendUUID d.telemetry.Start(ctx) + // Set up the clone split timers + d.cloneSplitTimers = make(map[string]time.Time) + d.initialized = true return nil } @@ -926,12 +931,17 @@ func (d *NASStorageDriver) DeleteSnapshot( if err := d.API.VolumeSnapshotDelete(ctx, snapConfig.InternalName, snapConfig.VolumeInternalName); err != nil { if api.IsSnapshotBusyError(err) { // Start a split here before returning the error so a subsequent delete attempt may succeed. 
- _ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API, + d.API.VolumeCloneSplitStart, d.cloneSplitTimers) } - // we must return the err, even if we started a split, so the snapshot delete is retried + + // We must return the error, even if we started a split, so the snapshot delete is retried. return err } + // Clean up any split timer + delete(d.cloneSplitTimers, snapConfig.ID()) + Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.") return nil } diff --git a/storage_drivers/ontap/ontap_nas_flexgroup.go b/storage_drivers/ontap/ontap_nas_flexgroup.go index 6aa085bb5..3e5f17b72 100644 --- a/storage_drivers/ontap/ontap_nas_flexgroup.go +++ b/storage_drivers/ontap/ontap_nas_flexgroup.go @@ -39,6 +39,8 @@ type NASFlexGroupStorageDriver struct { physicalPool storage.Pool virtualPools map[string]storage.Pool timeout time.Duration + + cloneSplitTimers map[string]time.Time } func (d *NASFlexGroupStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig { @@ -132,6 +134,9 @@ func (d *NASFlexGroupStorageDriver) Initialize( d.telemetry.TridentBackendUUID = backendUUID d.telemetry.Start(ctx) + // Set up the clone split timers + d.cloneSplitTimers = make(map[string]time.Time) + d.initialized = true return nil } @@ -1309,13 +1314,17 @@ func (d *NASFlexGroupStorageDriver) DeleteSnapshot( if err := d.API.FlexgroupSnapshotDelete(ctx, snapConfig.InternalName, snapConfig.VolumeInternalName); err != nil { if api.IsSnapshotBusyError(err) { // Start a split here before returning the error so a subsequent delete attempt may succeed. 
- _ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, - d.API.FlexgroupCloneSplitStart) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API, + d.API.FlexgroupCloneSplitStart, d.cloneSplitTimers) } - // we must return the err, even if we started a split, so the snapshot delete is retried + + // We must return the error, even if we started a split, so the snapshot delete is retried. return err } + // Clean up any split timer + delete(d.cloneSplitTimers, snapConfig.ID()) + Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.") return nil } diff --git a/storage_drivers/ontap/ontap_nas_flexgroup_test.go b/storage_drivers/ontap/ontap_nas_flexgroup_test.go index 32b7901a0..de7f76ba7 100644 --- a/storage_drivers/ontap/ontap_nas_flexgroup_test.go +++ b/storage_drivers/ontap/ontap_nas_flexgroup_test.go @@ -2724,6 +2724,9 @@ func TestOntapNasFlexgroupStorageDriverVolumeDeleteSnapshot_Failure(t *testing.T mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, "snap1", "vol1").Return(childVols, nil) mockAPI.EXPECT().FlexgroupCloneSplitStart(ctx, "vol1").Return(nil) + driver.cloneSplitTimers = make(map[string]time.Time) + // Use DefaultCloneSplitDelay to set time to past. It is defaulted to 10 seconds. 
+ driver.cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second) result := driver.DeleteSnapshot(ctx, snapConfig, volConfig) assert.Error(t, result, "FlexgroupSnapshotDelete sucessfully executed") diff --git a/storage_drivers/ontap/ontap_nas_qtree.go b/storage_drivers/ontap/ontap_nas_qtree.go index 48fca4299..23926a5a2 100644 --- a/storage_drivers/ontap/ontap_nas_qtree.go +++ b/storage_drivers/ontap/ontap_nas_qtree.go @@ -62,6 +62,8 @@ type NASQtreeStorageDriver struct { physicalPools map[string]storage.Pool virtualPools map[string]storage.Pool + + cloneSplitTimers map[string]time.Time } func (d *NASQtreeStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig { @@ -220,6 +222,9 @@ func (d *NASQtreeStorageDriver) Initialize( d.telemetry.TridentBackendUUID = backendUUID d.telemetry.Start(ctx) + // Set up the clone split timers. + d.cloneSplitTimers = make(map[string]time.Time) + d.initialized = true return nil } @@ -969,12 +974,17 @@ func (d *NASQtreeStorageDriver) DeleteSnapshot( if err = d.API.VolumeSnapshotDelete(ctx, snapConfig.InternalName, flexvol); err != nil { if api.IsSnapshotBusyError(err) { // Start a split here before returning the error so a subsequent delete attempt may succeed. - _ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API, + d.API.VolumeCloneSplitStart, d.cloneSplitTimers) } - // We must return the err, even if we started a split, so the snapshot delete is retried + + // We must return the error, even if we started a split, so the snapshot delete is retried. return err } + // Clean up any split timer. 
+ delete(d.cloneSplitTimers, snapConfig.ID()) + Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.") return nil } diff --git a/storage_drivers/ontap/ontap_nas_qtree_test.go b/storage_drivers/ontap/ontap_nas_qtree_test.go index da9ed7093..8a7d186ce 100644 --- a/storage_drivers/ontap/ontap_nas_qtree_test.go +++ b/storage_drivers/ontap/ontap_nas_qtree_test.go @@ -4293,6 +4293,9 @@ func TestDeleteSnapshot_FailureSnapshotBusy(t *testing.T) { mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, "snap1", flexvol).Return(childVols, nil) mockAPI.EXPECT().VolumeCloneSplitStart(ctx, flexvol).Return(nil) + driver.cloneSplitTimers = make(map[string]time.Time) + // Use DefaultCloneSplitDelay to set time to past. It is defaulted to 10 seconds. + driver.cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second) result := driver.DeleteSnapshot(ctx, snapConfig, volConfig) assert.NotNil(t, result, "result is nil") diff --git a/storage_drivers/ontap/ontap_nas_test.go b/storage_drivers/ontap/ontap_nas_test.go index b94e84554..bb9625977 100644 --- a/storage_drivers/ontap/ontap_nas_test.go +++ b/storage_drivers/ontap/ontap_nas_test.go @@ -1561,6 +1561,9 @@ func TestOntapNasStorageDriverVolumeDeleteSnapshot_Failure(t *testing.T) { mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, "snap1", "vol1").Return(childVols, nil) mockAPI.EXPECT().VolumeCloneSplitStart(ctx, "vol1").Return(nil) + driver.cloneSplitTimers = make(map[string]time.Time) + // Use DefaultCloneSplitDelay to set time to past. It is defaulted to 10 seconds. 
+ driver.cloneSplitTimers[snapConfig.ID()] = time.Now().Add(-10 * time.Second) result := driver.DeleteSnapshot(ctx, snapConfig, volConfig) assert.Error(t, result) diff --git a/storage_drivers/ontap/ontap_san.go b/storage_drivers/ontap/ontap_san.go index 4352cb974..6f1aae00f 100644 --- a/storage_drivers/ontap/ontap_san.go +++ b/storage_drivers/ontap/ontap_san.go @@ -38,6 +38,8 @@ type SANStorageDriver struct { physicalPools map[string]storage.Pool virtualPools map[string]storage.Pool + + cloneSplitTimers map[string]time.Time } func (d *SANStorageDriver) GetConfig() *drivers.OntapStorageDriverConfig { @@ -153,6 +155,9 @@ func (d *SANStorageDriver) Initialize( d.telemetry.TridentBackendUUID = backendUUID d.telemetry.Start(ctx) + // Set up the clone split timers + d.cloneSplitTimers = make(map[string]time.Time) + d.initialized = true return nil } @@ -959,12 +964,17 @@ func (d *SANStorageDriver) DeleteSnapshot( if err != nil { if api.IsSnapshotBusyError(err) { // Start a split here before returning the error so a subsequent delete attempt may succeed. - _ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API, + d.API.VolumeCloneSplitStart, d.cloneSplitTimers) } - // we must return the err, even if we started a split, so the snapshot delete is retried + + // We must return the error, even if we started a split, so the snapshot delete is retried. 
return err } + // Clean up any split timer + delete(d.cloneSplitTimers, snapConfig.ID()) + Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.") return nil } diff --git a/storage_drivers/ontap/ontap_san_nvme.go b/storage_drivers/ontap/ontap_san_nvme.go index 14f18493e..1cb160fbc 100644 --- a/storage_drivers/ontap/ontap_san_nvme.go +++ b/storage_drivers/ontap/ontap_san_nvme.go @@ -11,6 +11,7 @@ import ( "regexp" "strconv" "strings" + "time" "github.com/RoaringBitmap/roaring" @@ -40,6 +41,8 @@ type NVMeStorageDriver struct { physicalPools map[string]storage.Pool virtualPools map[string]storage.Pool + + cloneSplitTimers map[string]time.Time } const ( @@ -170,6 +173,9 @@ func (d *NVMeStorageDriver) Initialize( d.telemetry.TridentBackendUUID = backendUUID d.telemetry.Start(ctx) + // Set up the clone split timers + d.cloneSplitTimers = make(map[string]time.Time) + d.initialized = true return nil } @@ -938,12 +944,17 @@ func (d *NVMeStorageDriver) DeleteSnapshot( if err != nil { if api.IsSnapshotBusyError(err) { // Start a split here before returning the error so a subsequent delete attempt may succeed. - _ = SplitVolumeFromBusySnapshot(ctx, snapConfig, &d.Config, d.API, d.API.VolumeCloneSplitStart) + SplitVolumeFromBusySnapshotWithDelay(ctx, snapConfig, &d.Config, d.API, + d.API.VolumeCloneSplitStart, d.cloneSplitTimers) } - // we must return the err, even if we started a split, so the snapshot delete is retried + + // we must return the error, even if we started a split, so the snapshot delete is retried. return err } + // Clean up any split timer. 
+ delete(d.cloneSplitTimers, snapConfig.ID()) + Logc(ctx).WithField("snapshotName", snapConfig.InternalName).Debug("Deleted snapshot.") return nil } diff --git a/storage_drivers/ontap/ontap_san_test.go b/storage_drivers/ontap/ontap_san_test.go index 840f6f1ae..693d2d74e 100644 --- a/storage_drivers/ontap/ontap_san_test.go +++ b/storage_drivers/ontap/ontap_san_test.go @@ -2089,6 +2089,9 @@ func TestOntapSanVolumeSnapshotDelete_fail(t *testing.T) { mockAPI.EXPECT().VolumeListBySnapshotParent(ctx, snapshotConfig.InternalName, snapshotConfig.VolumeInternalName).Return(nil, nil) + driver.cloneSplitTimers = make(map[string]time.Time) + // Use DefaultCloneSplitDelay to set time to past. It is defaulted to 10 seconds. + driver.cloneSplitTimers[snapshotConfig.ID()] = time.Now().Add(-10 * time.Second) err := driver.DeleteSnapshot(ctx, snapshotConfig, volConfig) assert.Error(t, err, "Snapshot destroyed, expected an error") diff --git a/storage_drivers/types.go b/storage_drivers/types.go index 8371a2e40..1a057b057 100644 --- a/storage_drivers/types.go +++ b/storage_drivers/types.go @@ -119,6 +119,7 @@ type OntapStorageDriverConfig struct { QtreesPerFlexvol string `json:"qtreesPerFlexvol"` // default to 200 LUNsPerFlexvol string `json:"lunsPerFlexvol"` // default to 100 EmptyFlexvolDeferredDeletePeriod string `json:"emptyFlexvolDeferredDeletePeriod"` // in seconds, default to 28800 + CloneSplitDelay string `json:"cloneSplitDelay,omitempty"` // in seconds, default to 10 NfsMountOptions string `json:"nfsMountOptions"` LimitAggregateUsage string `json:"limitAggregateUsage"` AutoExportPolicy bool `json:"autoExportPolicy"`