Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: skip evict-leader-scheduler when setting schedule deny label #8303

Merged
merged 31 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions client/http/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
/* Scheduler-related interfaces */
GetSchedulers(context.Context) ([]string, error)
CreateScheduler(ctx context.Context, name string, storeID uint64) error
DeleteScheduler(ctx context.Context, name string) error
SetSchedulerDelay(context.Context, string, int64) error
/* Rule-related interfaces */
GetAllPlacementRuleBundles(context.Context) ([]*GroupBundle, error)
Expand All @@ -81,6 +82,10 @@
DeletePlacementRuleGroupByID(context.Context, string) error
GetAllRegionLabelRules(context.Context) ([]*LabelRule, error)
GetRegionLabelRulesByIDs(context.Context, []string) ([]*LabelRule, error)
// `SetRegionLabelRule` sets the label rule for a region.
// When a label rule (deny scheduler) is set,
// 1. All schedulers will be disabled except for the evict-leader-scheduler.
// 2. The merge-checker will be disabled, preventing these regions from being merged.
SetRegionLabelRule(context.Context, *LabelRule) error
PatchRegionLabelRules(context.Context, *LabelRulePatch) error
/* Scheduling-related interfaces */
Expand Down Expand Up @@ -762,6 +767,13 @@
WithBody(inputJSON))
}

// DeleteScheduler issues an HTTP DELETE request against the URI built by
// SchedulerByName(name) and returns whatever error the request yields.
func (c *client) DeleteScheduler(ctx context.Context, name string) error {
return c.request(ctx, newRequestInfo().
WithName(deleteSchedulerName).
WithURI(SchedulerByName(name)).
WithMethod(http.MethodDelete))

Check warning on line 774 in client/http/interface.go

View check run for this annotation

Codecov / codecov/patch

client/http/interface.go#L770-L774

Added lines #L770 - L774 were not covered by tests
}

// AccelerateSchedule accelerates the scheduling of the regions within the given key range.
// The keys in the key range should be encoded in the hex bytes format (without encoding to the UTF-8 bytes).
func (c *client) AccelerateSchedule(ctx context.Context, keyRange *KeyRange) error {
Expand Down
1 change: 1 addition & 0 deletions client/http/request_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ const (
getReplicateConfigName = "GetReplicateConfig"
getSchedulersName = "GetSchedulers"
createSchedulerName = "CreateScheduler"
deleteSchedulerName = "DeleteScheduler"
setSchedulerDelayName = "SetSchedulerDelay"
getAllPlacementRuleBundlesName = "GetAllPlacementRuleBundles"
getPlacementRuleBundleByGroupName = "GetPlacementRuleBundleByGroup"
Expand Down
6 changes: 5 additions & 1 deletion pkg/schedule/schedulers/scheduler_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,7 @@ func (s *ScheduleController) Stop() {

// Schedule tries to create some operators.
func (s *ScheduleController) Schedule(diagnosable bool) []*operator.Operator {
_, isEvictLeaderScheduler := s.Scheduler.(*evictLeaderScheduler)
retry:
for i := 0; i < maxScheduleRetries; i++ {
// no need to retry if schedule should stop to speed exit
Expand Down Expand Up @@ -486,7 +487,10 @@ retry:
if labelMgr == nil {
continue
}
if labelMgr.ScheduleDisabled(region) {

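// The evict-leader-scheduler is exempt from the schedule-deny label: if it were blocked as well,
// leaders could not be evicted from a TiKV store before it is restarted, obstructing graceful restarts driven by TiDB Operator.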
// Refer: https://docs.pingcap.com/tidb-in-kubernetes/stable/restart-a-tidb-cluster#perform-a-graceful-restart-to-a-single-tikv-pod
if labelMgr.ScheduleDisabled(region) && !isEvictLeaderScheduler {
okJiang marked this conversation as resolved.
Show resolved Hide resolved
denySchedulersByLabelerCounter.Inc()
continue retry
}
Expand Down
187 changes: 187 additions & 0 deletions tests/integrations/realcluster/scheduler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// Copyright 2023 TiKV Authors
okJiang marked this conversation as resolved.
Show resolved Hide resolved
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package realcluster

import (
"context"
"fmt"
"sort"
"testing"
"time"

"github.com/stretchr/testify/require"
pd "github.com/tikv/pd/client/http"
"github.com/tikv/pd/client/testutil"
"github.com/tikv/pd/pkg/schedule/labeler"
"github.com/tikv/pd/pkg/schedule/schedulers"
)

// TestTransferLeader checks that the list of schedulers is unchanged after the
// PD leader is transferred to another member and then back to the original one.
//
// See https://github.com/tikv/pd/issues/6988#issuecomment-1694924611
// and https://github.com/tikv/pd/issues/6897.
func TestTransferLeader(t *testing.T) {
	re := require.New(t)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	resp, err := pdHTTPCli.GetLeader(ctx)
	re.NoError(err)
	oldLeader := resp.Name

	// Pick a member other than the current leader as the transfer target.
	// NOTE(review): assumes the cluster members are named "pd-0" and "pd-1".
	var newLeader string
	for i := 0; i < 2; i++ {
		if candidate := fmt.Sprintf("pd-%d", i); candidate != oldLeader {
			newLeader = candidate
		}
	}

	// waitForLeader polls until the given member is reported as the leader.
	// Polling replaces a fixed one-second sleep, which is racy when the
	// election takes longer than expected. Errors are treated as "not yet"
	// because the leader endpoint may be briefly unavailable mid-transfer.
	waitForLeader := func(name string) {
		testutil.Eventually(re, func() bool {
			leader, err := pdHTTPCli.GetLeader(ctx)
			return err == nil && leader.Name == name
		})
	}

	// Add a scheduler and record how many schedulers exist before any transfer.
	re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.EvictLeaderName, 1))
	defer func() {
		re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName))
	}()
	res, err := pdHTTPCli.GetSchedulers(ctx)
	re.NoError(err)
	oldSchedulersLen := len(res)

	// Transfer the leader to the new member; the scheduler count must not change.
	re.NoError(pdHTTPCli.TransferLeader(ctx, newLeader))
	waitForLeader(newLeader)

	res, err = pdHTTPCli.GetSchedulers(ctx)
	re.NoError(err)
	re.Len(res, oldSchedulersLen)

	// Transfer the leader back to the old member and check again.
	re.NoError(pdHTTPCli.TransferLeader(ctx, oldLeader))
	waitForLeader(oldLeader)

	res, err = pdHTTPCli.GetSchedulers(ctx)
	re.NoError(err)
	re.Len(res, oldSchedulersLen)
}

func TestRegionLabel_DenyScheduler(t *testing.T) {
okJiang marked this conversation as resolved.
Show resolved Hide resolved
re := require.New(t)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

regions, err := pdHTTPCli.GetRegions(ctx)
re.NoError(err)
re.True(len(regions.Regions) >= 2)
region1, region2 := regions.Regions[0], regions.Regions[1]

re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.ShuffleLeaderName, 0))
defer func() {
re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.ShuffleLeaderName))
}()

// wait leader transfer
testutil.Eventually(re, func() bool {
regions, err := pdHTTPCli.GetRegions(ctx)
re.NoError(err)
re.True(len(regions.Regions) > 0)
for _, region := range regions.Regions {
if region.ID == region1.ID && region.Leader.StoreID != region1.ID {
return true
}
if region.ID == region2.ID && region.Leader.StoreID != region2.ID {
return true
}
}
return false
})

// disable schedule for region1
labelRule := &pd.LabelRule{
ID: "rule1",
Labels: []pd.RegionLabel{{Key: "schedule", Value: "deny"}},
RuleType: "key-range",
Data: labeler.MakeKeyRanges(region1.StartKey, region1.EndKey),
}
err = pdHTTPCli.SetRegionLabelRule(ctx, labelRule)
re.NoError(err)
time.Sleep(time.Second)
labelRules, err := pdHTTPCli.GetAllRegionLabelRules(ctx)
re.NoError(err)
re.Len(labelRules, 2)
sort.Slice(labelRules, func(i, j int) bool {
return labelRules[i].ID < labelRules[j].ID
})
re.Equal(labelRule.ID, labelRules[1].ID)
re.Equal(labelRule.Labels, labelRules[1].Labels)
re.Equal(labelRule.RuleType, labelRules[1].RuleType)

regions, err = pdHTTPCli.GetRegions(ctx)
re.NoError(err)
for _, region := range regions.Regions {
if region.ID == region1.ID {
region1 = region
}
if region.ID == region2.ID {
region2 = region
}
}
// check shuffle leader scheduler of region1 has been disabled
for i := 0; i < 10; i++ {
regions, err := pdHTTPCli.GetRegions(ctx)
re.NoError(err)
for _, region := range regions.Regions {
if region.ID == region1.ID {
re.True(region.Leader.StoreID == region1.Leader.StoreID)
}
}
time.Sleep(50 * time.Millisecond)
}
// check shuffle leader scheduler of region2 has not been disabled
testutil.Eventually(re, func() bool {
regions, err := pdHTTPCli.GetRegions(ctx)
re.NoError(err)
for _, region := range regions.Regions {
if region.ID == region2.ID && region.Leader.StoreID != region2.Leader.StoreID {
return true
}
}
return false
})

oldRegions, err := pdHTTPCli.GetRegionsByStoreID(ctx, uint64(region1.Leader.StoreID))
re.NoError(err)
oldRegionMap := make(map[int64]bool, len(oldRegions.Regions))
for _, region := range oldRegions.Regions {
oldRegionMap[region.ID] = true
}
// enable evict leader scheduler, and check it works
re.NoError(pdHTTPCli.CreateScheduler(ctx, schedulers.EvictLeaderName, uint64(region1.Leader.StoreID)))
defer func() {
re.NoError(pdHTTPCli.DeleteScheduler(ctx, schedulers.EvictLeaderName))
}()
testutil.Eventually(re, func() bool {
regions, err := pdHTTPCli.GetRegions(ctx)
re.NoError(err)
for _, region := range regions.Regions {
if oldRegionMap[region.ID] && region.Leader.StoreID == region1.Leader.StoreID {
return false
}
}
return true
})
}
73 changes: 0 additions & 73 deletions tests/integrations/realcluster/transfer_leader_test.go

This file was deleted.

Loading