Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement tracing and CPU profiling of long running block creations [NIT-2501] #2305

Merged
merged 10 commits into from
May 24, 2024
68 changes: 67 additions & 1 deletion execution/gethexec/sequencer.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,17 @@ import (
"fmt"
"math"
"math/big"
"os"
"runtime/debug"
"runtime/pprof"
"runtime/trace"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"

"github.com/google/uuid"
"github.com/offchainlabs/nitro/arbutil"
"github.com/offchainlabs/nitro/execution"
"github.com/offchainlabs/nitro/util/arbmath"
Expand Down Expand Up @@ -76,6 +80,7 @@ type SequencerConfig struct {
NonceFailureCacheExpiry time.Duration `koanf:"nonce-failure-cache-expiry" reload:"hot"`
ExpectedSurplusSoftThreshold string `koanf:"expected-surplus-soft-threshold" reload:"hot"`
ExpectedSurplusHardThreshold string `koanf:"expected-surplus-hard-threshold" reload:"hot"`
EnableProfiling bool `koanf:"enable-profiling"`
expectedSurplusSoftThreshold int
expectedSurplusHardThreshold int
}
Expand Down Expand Up @@ -125,6 +130,7 @@ var DefaultSequencerConfig = SequencerConfig{
NonceFailureCacheExpiry: time.Second,
ExpectedSurplusSoftThreshold: "default",
ExpectedSurplusHardThreshold: "default",
EnableProfiling: true,
}

var TestSequencerConfig = SequencerConfig{
Expand All @@ -142,6 +148,7 @@ var TestSequencerConfig = SequencerConfig{
NonceFailureCacheExpiry: time.Second,
ExpectedSurplusSoftThreshold: "default",
ExpectedSurplusHardThreshold: "default",
EnableProfiling: false,
}

func SequencerConfigAddOptions(prefix string, f *flag.FlagSet) {
Expand All @@ -159,6 +166,7 @@ func SequencerConfigAddOptions(prefix string, f *flag.FlagSet) {
f.Duration(prefix+".nonce-failure-cache-expiry", DefaultSequencerConfig.NonceFailureCacheExpiry, "maximum amount of time to wait for a predecessor before rejecting a tx with nonce too high")
f.String(prefix+".expected-surplus-soft-threshold", DefaultSequencerConfig.ExpectedSurplusSoftThreshold, "if expected surplus is lower than this value, warnings are posted")
f.String(prefix+".expected-surplus-hard-threshold", DefaultSequencerConfig.ExpectedSurplusHardThreshold, "if expected surplus is lower than this value, new incoming transactions will be denied")
f.Bool(prefix+".enable-profiling", DefaultSequencerConfig.EnableProfiling, "enable CPU profiling and tracing")
}

type txQueueItem struct {
Expand Down Expand Up @@ -327,6 +335,7 @@ type Sequencer struct {
expectedSurplusMutex sync.RWMutex
expectedSurplus int64
expectedSurplusUpdated bool
enableProfiling bool
}

func NewSequencer(execEngine *ExecutionEngine, l1Reader *headerreader.HeaderReader, configFetcher SequencerConfigFetcher) (*Sequencer, error) {
Expand All @@ -353,6 +362,7 @@ func NewSequencer(execEngine *ExecutionEngine, l1Reader *headerreader.HeaderRead
l1Timestamp: 0,
pauseChan: nil,
onForwarderSet: make(chan struct{}, 1),
enableProfiling: config.EnableProfiling,
}
s.nonceFailures = &nonceFailureCache{
containers.NewLruCacheWithOnEvict(config.NonceCacheSize, s.onNonceFailureEvict),
Expand Down Expand Up @@ -758,6 +768,57 @@ func (s *Sequencer) precheckNonces(queueItems []txQueueItem) []txQueueItem {
return outputQueueItems
}

func deleteFiles(files ...*os.File) {
for _, f := range files {
if err := os.Remove(f.Name()); err != nil {
log.Error("Error removing file", "name", f.Name())
magicxyyz marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

func closeFiles(files ...*os.File) {
for _, f := range files {
if err := os.Remove(f.Name()); err != nil {
log.Error("Error closing file", "name", f.Name())
magicxyyz marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

// createBlockWithProfiling runs create block with tracing and CPU profiling
// enabled. If the block creation takes longer than 5 seconds, it keeps both
// and prints out filenames in an error log line.
func (s *Sequencer) createBlockWithProfiling(ctx context.Context) bool {
id := uuid.NewString()
pprofFile, err := os.CreateTemp("", id+".pprof")
magicxyyz marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
log.Error("Creating temporary file for profiling CPU", "error", err)
}
traceFile, err := os.CreateTemp("", id+".trace")
if err != nil {
log.Error("Creating temporary file for tracing", "error", err)
}
if err := pprof.StartCPUProfile(pprofFile); err != nil {
log.Error("Starting CPU profiling", "error", err)
deleteFiles(pprofFile)
}
if err := trace.Start(traceFile); err != nil {
magicxyyz marked this conversation as resolved.
Show resolved Hide resolved
deleteFiles(traceFile)
log.Error("Starting tracing", "error", err)
}
start := time.Now()
res := s.createBlock(ctx)
elapsed := time.Since(start)
pprof.StopCPUProfile()
trace.Stop()
anodar marked this conversation as resolved.
Show resolved Hide resolved
closeFiles(pprofFile, traceFile)
if elapsed > 5*time.Second {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does it make sense to make the time threshold configurable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have similar argument here as above, this is just for debugging why do some queries take long time. (Just like in existing code:

if elapsed >= time.Second*5 {
)

My reasoning in this case for not having configurable is that I believe we already have too many flags, majority of which never actually varies. I'm hesitant to add another one like that, unless we have a good reason.

In any case, if you see more value in that, happy to make it configurable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, but I wander if 5 seconds for block creation isn't too high threshold.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussed with Lee, he suggested 2 seconds and I've changed to that.

log.Error("Block creation took longer than 5 seconds", "pprof", pprofFile.Name())
return res
}
deleteFiles(pprofFile, traceFile)
return res
}

func (s *Sequencer) createBlock(ctx context.Context) (returnValue bool) {
var queueItems []txQueueItem
var totalBatchSize int
Expand Down Expand Up @@ -1088,7 +1149,12 @@ func (s *Sequencer) Start(ctxIn context.Context) error {

s.CallIteratively(func(ctx context.Context) time.Duration {
nextBlock := time.Now().Add(s.config().MaxBlockSpeed)
madeBlock := s.createBlock(ctx)
var madeBlock bool
if s.enableProfiling {
madeBlock = s.createBlockWithProfiling(ctx)
} else {
madeBlock = s.createBlock(ctx)
}
if madeBlock {
// Note: this may return a negative duration, but timers are fine with that (they treat negative durations as 0).
return time.Until(nextBlock)
Expand Down
Loading