Skip to content

Commit

Permalink
Improve matching simulator isolation group metrics
Browse files Browse the repository at this point in the history
Record isolation group information for additional events and use it to calculate the median, mean, and max latency of events per task list and isolation group. Additionally, record the percentage of tasks that are dispatched to a poller in the same isolation group, per task list and isolation group. With the current implementation, no scenario leaks tasks to another isolation group.

Additionally provide a definition of getAllIsolationGroups so that the matching simulator doesn't deadlock due to panics in task list manager initialization.

Create 6 new scenarios for zonal isolation. The first three (few_pollers, many_pollers, and single_partition) test a scenario where the total task throughput is easily manageable with any number of pollers, but the number of pollers/partitions significantly impacts performance. The next two (zonal_isolation, zonal_isolation_skew) show a higher-throughput scenario which should still be manageable by the specified pollers for each isolation group. The latter of the two skews the tasks to the maximum that the pollers from a single group should be able to process (64/12/12/12 instead of 25/25/25/25).

The final scenario, zonal_isolation_skew_extreme, has the tasks heavily skewed (90/3/3/3) beyond what a single group can handle.
  • Loading branch information
natemort committed Nov 21, 2024
1 parent 925e64d commit 2099c28
Show file tree
Hide file tree
Showing 12 changed files with 377 additions and 15 deletions.
21 changes: 21 additions & 0 deletions host/onebox.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ import (
"github.com/uber/cadence/common/domain"
"github.com/uber/cadence/common/dynamicconfig"
"github.com/uber/cadence/common/elasticsearch"
"github.com/uber/cadence/common/isolationgroup/isolationgroupapi"
"github.com/uber/cadence/common/log"
"github.com/uber/cadence/common/log/tag"
"github.com/uber/cadence/common/membership"
Expand Down Expand Up @@ -577,6 +578,7 @@ func (c *cadenceImpl) startFrontend(hosts map[string][]membership.HostInfo, star
params.ESClient = c.esClient
params.PinotConfig = c.pinotConfig
params.PinotClient = c.pinotClient
params.GetIsolationGroups = getFromDynamicConfig(params)
var err error
authorizer, err := authorization.NewAuthorizer(c.authorizationConfig, params.Logger, nil)
if err != nil {
Expand Down Expand Up @@ -661,6 +663,7 @@ func (c *cadenceImpl) startHistory(hosts map[string][]membership.HostInfo, start
params.ESConfig = c.esConfig
params.ESClient = c.esClient
params.PinotConfig = c.pinotConfig
params.GetIsolationGroups = getFromDynamicConfig(params)

var err error
params.PersistenceConfig, err = copyPersistenceConfig(c.persistenceConfig)
Expand Down Expand Up @@ -733,6 +736,7 @@ func (c *cadenceImpl) startMatching(hosts map[string][]membership.HostInfo, star
params.DynamicConfig = newIntegrationConfigClient(dynamicconfig.NewNopClient(), c.matchingDynCfgOverrides)
params.ArchivalMetadata = c.archiverMetadata
params.ArchiverProvider = c.archiverProvider
params.GetIsolationGroups = getFromDynamicConfig(params)

var err error
params.PersistenceConfig, err = copyPersistenceConfig(c.persistenceConfig)
Expand Down Expand Up @@ -793,6 +797,7 @@ func (c *cadenceImpl) startWorker(hosts map[string][]membership.HostInfo, startW
params.DynamicConfig = newIntegrationConfigClient(dynamicconfig.NewNopClient(), c.workerDynCfgOverrides)
params.ArchivalMetadata = c.archiverMetadata
params.ArchiverProvider = c.archiverProvider
params.GetIsolationGroups = getFromDynamicConfig(params)

var err error
params.PersistenceConfig, err = copyPersistenceConfig(c.persistenceConfig)
Expand Down Expand Up @@ -1096,3 +1101,19 @@ func (vm *versionMiddleware) Handle(ctx context.Context, req *transport.Request,

return h.Handle(ctx, req, resw)
}

// getFromDynamicConfig returns a function that looks up the list of all
// isolation groups from the dynamic config client held by params.
//
// The lookup happens each time the returned function is invoked, not when
// getFromDynamicConfig itself is called, so params.DynamicConfig and
// params.Logger only need to be usable by the time the result is first used.
//
// The returned function yields nil (after logging an error through
// params.Logger) when the config value cannot be read or cannot be mapped
// into a list of isolation group names.
func getFromDynamicConfig(params *resource.Params) func() []string {
	fetch := func() []string {
		// Read the raw list value stored under the AllIsolationGroups key.
		raw, getErr := params.DynamicConfig.GetListValue(dynamicconfig.AllIsolationGroups, nil)
		if getErr != nil {
			params.Logger.Error("failed to get isolation groups from config", tag.Error(getErr))
			return nil
		}

		// Convert the raw list into isolation group names.
		groups, mapErr := isolationgroupapi.MapAllIsolationGroupsResponse(raw)
		if mapErr == nil {
			return groups
		}
		params.Logger.Error("failed to map isolation groups from config", tag.Error(mapErr))
		return nil
	}
	return fetch
}
28 changes: 18 additions & 10 deletions host/testdata/matching_simulation_zonal_isolation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,35 @@ historyconfig:
matchingconfig:
nummatchinghosts: 4
simulationconfig:
tasklistwritepartitions: 2
tasklistreadpartitions: 2
tasklistwritepartitions: 4
tasklistreadpartitions: 4
forwardermaxoutstandingpolls: 1
forwardermaxoutstandingtasks: 1
forwardermaxratepersecond: 10
forwardermaxchildrenpernode: 20
localpollwaittime: 10ms
localtaskwaittime: 10ms
tasks:
- numtaskgenerators: 2
taskspersecond: 80
maxtasktogenerate: 3000
isolationgroups: ['a', 'b']
- numtaskgenerators: 30
taskspersecond: 500
maxtasktogenerate: 5000
isolationgroups: ['a', 'b', 'c', 'd']
pollers:
- isolationgroup: 'a'
taskprocesstime: 1ms
numpollers: 4
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'b'
taskprocesstime: 1ms
numpollers: 4
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'c'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'd'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
workerconfig:
enableasyncwfconsumer: false
42 changes: 42 additions & 0 deletions host/testdata/matching_simulation_zonal_isolation_few_pollers.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
enablearchival: false
clusterno: 1
messagingclientconfig:
usemock: true
historyconfig:
numhistoryshards: 4
numhistoryhosts: 1
matchingconfig:
nummatchinghosts: 4
simulationconfig:
tasklistwritepartitions: 4
tasklistreadpartitions: 4
forwardermaxoutstandingpolls: 1
forwardermaxoutstandingtasks: 1
forwardermaxratepersecond: 10
forwardermaxchildrenpernode: 20
localpollwaittime: 10ms
localtaskwaittime: 10ms
tasks:
- numtaskgenerators: 30
taskspersecond: 500
maxtasktogenerate: 5000
isolationgroups: ['a', 'b', 'c', 'd']
pollers:
- isolationgroup: 'a'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
- isolationgroup: 'b'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
- isolationgroup: 'c'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
- isolationgroup: 'd'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
workerconfig:
enableasyncwfconsumer: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
enablearchival: false
clusterno: 1
messagingclientconfig:
usemock: true
historyconfig:
numhistoryshards: 4
numhistoryhosts: 1
matchingconfig:
nummatchinghosts: 4
simulationconfig:
tasklistwritepartitions: 4
tasklistreadpartitions: 4
forwardermaxoutstandingpolls: 1
forwardermaxoutstandingtasks: 1
forwardermaxratepersecond: 10
forwardermaxchildrenpernode: 20
localpollwaittime: 10ms
localtaskwaittime: 10ms
tasks:
- numtaskgenerators: 30
taskspersecond: 500
maxtasktogenerate: 5000
isolationgroups: ['a', 'b', 'c', 'd']
pollers:
- isolationgroup: 'a'
taskprocesstime: 1ms
numpollers: 16
polltimeout: 60s
- isolationgroup: 'b'
taskprocesstime: 1ms
numpollers: 16
polltimeout: 60s
- isolationgroup: 'c'
taskprocesstime: 1ms
numpollers: 16
polltimeout: 60s
- isolationgroup: 'd'
taskprocesstime: 1ms
numpollers: 16
polltimeout: 60s
workerconfig:
enableasyncwfconsumer: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
enablearchival: false
clusterno: 1
messagingclientconfig:
usemock: true
historyconfig:
numhistoryshards: 4
numhistoryhosts: 1
matchingconfig:
nummatchinghosts: 4
simulationconfig:
tasklistwritepartitions: 1
tasklistreadpartitions: 1
forwardermaxoutstandingpolls: 1
forwardermaxoutstandingtasks: 1
forwardermaxratepersecond: 10
forwardermaxchildrenpernode: 20
localpollwaittime: 10ms
localtaskwaittime: 10ms
tasks:
- numtaskgenerators: 30
taskspersecond: 500
maxtasktogenerate: 5000
isolationgroups: ['a', 'b', 'c', 'd']
pollers:
- isolationgroup: 'a'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
- isolationgroup: 'b'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
- isolationgroup: 'c'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
- isolationgroup: 'd'
taskprocesstime: 1ms
numpollers: 2
polltimeout: 60s
workerconfig:
enableasyncwfconsumer: false
46 changes: 46 additions & 0 deletions host/testdata/matching_simulation_zonal_isolation_skew.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
enablearchival: false
clusterno: 1
messagingclientconfig:
usemock: true
historyconfig:
numhistoryshards: 4
numhistoryhosts: 1
matchingconfig:
nummatchinghosts: 4
simulationconfig:
tasklistwritepartitions: 4
tasklistreadpartitions: 4
forwardermaxoutstandingpolls: 1
forwardermaxoutstandingtasks: 1
forwardermaxratepersecond: 10
forwardermaxchildrenpernode: 20
localpollwaittime: 10ms
localtaskwaittime: 10ms
tasks:
- numtaskgenerators: 10
taskspersecond: 180
maxtasktogenerate: 1800
isolationgroups: ['a', 'b', 'c']
- numtaskgenerators: 20
taskspersecond: 320
maxtasktogenerate: 3200
isolationgroups: [ 'd' ]
pollers:
- isolationgroup: 'a'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'b'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'c'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'd'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
workerconfig:
enableasyncwfconsumer: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
enablearchival: false
clusterno: 1
messagingclientconfig:
usemock: true
historyconfig:
numhistoryshards: 4
numhistoryhosts: 1
matchingconfig:
nummatchinghosts: 4
simulationconfig:
tasklistwritepartitions: 4
tasklistreadpartitions: 4
forwardermaxoutstandingpolls: 1
forwardermaxoutstandingtasks: 1
forwardermaxratepersecond: 10
forwardermaxchildrenpernode: 20
localpollwaittime: 10ms
localtaskwaittime: 10ms
tasks:
- numtaskgenerators: 3
taskspersecond: 50
maxtasktogenerate: 500
isolationgroups: ['a', 'b', 'c']
- numtaskgenerators: 27
taskspersecond: 450
maxtasktogenerate: 4500
isolationgroups: [ 'd' ]
pollers:
- isolationgroup: 'a'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'b'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'c'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'd'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
workerconfig:
enableasyncwfconsumer: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
enablearchival: false
clusterno: 1
messagingclientconfig:
usemock: true
historyconfig:
numhistoryshards: 4
numhistoryhosts: 1
matchingconfig:
nummatchinghosts: 4
simulationconfig:
tasklistwritepartitions: 4
tasklistreadpartitions: 4
forwardermaxoutstandingpolls: 1
forwardermaxoutstandingtasks: 1
forwardermaxratepersecond: 10000
forwardermaxchildrenpernode: 20
localpollwaittime: 10ms
localtaskwaittime: 10ms
tasks:
- numtaskgenerators: 10
taskspersecond: 180
maxtasktogenerate: 1800
isolationgroups: ['a', 'b', 'c']
- numtaskgenerators: 20
taskspersecond: 320
maxtasktogenerate: 3200
isolationgroups: [ 'd' ]
pollers:
- isolationgroup: 'a'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'b'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'c'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
- isolationgroup: 'd'
taskprocesstime: 25ms
numpollers: 8
polltimeout: 60s
workerconfig:
enableasyncwfconsumer: false
Loading

0 comments on commit 2099c28

Please sign in to comment.