Skip to content

Commit

Permalink
chore: error handling (#280)
Browse files Browse the repository at this point in the history
* feat: better proposer error handling

* feat: fix proposer witnessgen timeout

* add
  • Loading branch information
ratankaliani authored Dec 14, 2024
1 parent f486cae commit ad870d8
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 112 deletions.
6 changes: 6 additions & 0 deletions proposer/op/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,10 @@ github.com/slack-go/slack v0.14.0 h1:6c0UTfbRnvRssZUsZ2qe0Iu07VAMPjRqOa6oX8ewF4k
github.com/slack-go/slack v0.14.0/go.mod h1:hlGi5oXA+Gt+yWTPP0plCdRKmjsDxecdHxYQdlMQKOw=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/spf13/cobra v1.7.0 h1:hyqWnYt1ZQShIddO5kBpj3vu05/++x6tJ6dg8EC572I=
github.com/spf13/cobra v1.7.0/go.mod h1:uLxZILRyS/50WlhOIKD7W6V5bgeIt+4sICxh6uRMrb0=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/status-im/keycard-go v0.2.0 h1:QDLFswOQu1r5jsycloeQh3bVU8n/NatHHaZobtDnDzA=
github.com/status-im/keycard-go v0.2.0/go.mod h1:wlp8ZLbsmrF6g6WjugPAx+IzoLrkdf9+mHxBEeo3Hbg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down Expand Up @@ -443,6 +447,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
Expand Down
47 changes: 3 additions & 44 deletions proposer/op/proposer/db/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func InitDB(dbPath string, useCachedDb bool) (*ProofDB, error) {
}

// Use the TL;DR SQLite settings from https://kerkour.com/sqlite-for-servers.
connectionUrl := fmt.Sprintf("file:%s?_fk=1&journal_mode=WAL&synchronous=normal&cache_size=100000000&busy_timeout=15000&_txlock=immediate", dbPath)
connectionUrl := fmt.Sprintf("file:%s?_fk=1&journal_mode=WAL&synchronous=normal&cache_size=100000000&busy_timeout=30000&_txlock=immediate", dbPath)

writeDrv, err := sql.Open("sqlite3", connectionUrl)
if err != nil {
Expand All @@ -48,15 +48,15 @@ func InitDB(dbPath string, useCachedDb bool) (*ProofDB, error) {

// The write lock only allows one connection to the DB at a time.
writeDb.SetMaxOpenConns(1)
writeDb.SetConnMaxLifetime(time.Hour)
writeDb.SetConnMaxLifetime(10 * time.Minute)

readDrv, err := sql.Open("sqlite3", connectionUrl)
if err != nil {
return nil, fmt.Errorf("failed opening connection to sqlite: %v", err)
}
readDb := readDrv.DB()
readDb.SetMaxOpenConns(max(4, runtime.NumCPU()/4))
readDb.SetConnMaxLifetime(time.Hour)
readDb.SetConnMaxLifetime(10 * time.Minute)

readClient := ent.NewClient(ent.Driver(readDrv))
writeClient := ent.NewClient(ent.Driver(writeDrv))
Expand Down Expand Up @@ -256,47 +256,6 @@ func (db *ProofDB) GetLatestEndBlock() (uint64, error) {
return uint64(maxEnd.EndBlock), nil
}

// When restarting the L2OutputSubmitter, some proofs may have been left in a "requested" state without a prover request ID on the server. Until we
// implement a mechanism for querying the status of the witness generation, we need to time out these proofs after a period of time so they can be requested.
func (db *ProofDB) GetWitnessGenerationTimeoutProofsOnServer() ([]*ent.ProofRequest, error) {
currentTime := time.Now().Unix()
twentyMinutesAgo := currentTime - 20*60

proofs, err := db.readClient.ProofRequest.Query().
Where(
proofrequest.StatusEQ(proofrequest.StatusWITNESSGEN),
proofrequest.ProverRequestIDIsNil(),
proofrequest.LastUpdatedTimeLT(uint64(twentyMinutesAgo)),
).
All(context.Background())

if err != nil {
return nil, fmt.Errorf("failed to query witness generation timeout proofs: %w", err)
}

return proofs, nil
}

// If a proof failed to be sent to the prover network, it's status will be set to FAILED, but the prover request ID will be empty.
// This function returns all such proofs.
func (db *ProofDB) GetProofsFailedOnServer() ([]*ent.ProofRequest, error) {
proofs, err := db.readClient.ProofRequest.Query().
Where(
proofrequest.StatusEQ(proofrequest.StatusFAILED),
proofrequest.ProverRequestIDEQ(""),
).
All(context.Background())

if err != nil {
if ent.IsNotFound(err) {
return nil, nil
}
return nil, fmt.Errorf("failed to query failed proof: %w", err)
}

return proofs, nil
}

// GetAllProofsWithStatus returns all proofs with the given status.
func (db *ProofDB) GetAllProofsWithStatus(status proofrequest.Status) ([]*ent.ProofRequest, error) {
proofs, err := db.readClient.ProofRequest.Query().
Expand Down
5 changes: 5 additions & 0 deletions proposer/op/proposer/db/ent/migrate/schema.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 13 additions & 4 deletions proposer/op/proposer/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -639,13 +639,22 @@ func (l *L2OutputSubmitter) loopL2OO(ctx context.Context) {
continue
}

// 2) Check the statuses of all requested proofs.
// 2) Check the statuses of PROVING requests.
// If it's successfully returned, we validate that we have it on disk and set status = "COMPLETE".
// If it fails or times out, we set status = "FAILED" (and, if it's a span proof, split the request in half to try again).
l.Log.Info("Stage 2: Processing Pending Proofs...")
err = l.ProcessPendingProofs()
l.Log.Info("Stage 2: Processing PROVING requests...")
err = l.ProcessProvingRequests()
if err != nil {
l.Log.Error("failed to update requested proofs", "err", err)
l.Log.Error("failed to update PROVING requests", "err", err)
continue
}

// 3) Check the statuses of WITNESSGEN requests.
// If the witness generation request has been in the WITNESSGEN state for longer than the timeout, set status to FAILED and retry.
l.Log.Info("Stage 3: Processing WITNESSGEN requests...")
err = l.ProcessWitnessgenRequests()
if err != nil {
l.Log.Error("failed to update WITNESSGEN requests", "err", err)
continue
}

Expand Down
29 changes: 24 additions & 5 deletions proposer/op/proposer/prove.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ import (
)

const PROOF_STATUS_TIMEOUT = 30 * time.Second
const WITNESS_GEN_TIMEOUT = 20 * time.Minute
const WITNESSGEN_TIMEOUT = 20 * time.Minute

// This limit is set to prevent overloading the witness generation server. Until Kona improves their native I/O API (https://github.com/anton-rs/kona/issues/553)
// the maximum number of concurrent witness generation requests is roughly num_cpu / 2. Set it to 5 for now to be safe.
const MAX_CONCURRENT_WITNESS_GEN = 5

// Process all of the pending proofs.
func (l *L2OutputSubmitter) ProcessPendingProofs() error {
// Process all of requests in PROVING state.
func (l *L2OutputSubmitter) ProcessProvingRequests() error {
// Get all proof requests that are currently in the PROVING state.
reqs, err := l.db.GetAllProofsWithStatus(proofrequest.StatusPROVING)
if err != nil {
Expand Down Expand Up @@ -65,6 +65,25 @@ func (l *L2OutputSubmitter) ProcessPendingProofs() error {
return nil
}

// Process all of requests in WITNESSGEN state.
func (l *L2OutputSubmitter) ProcessWitnessgenRequests() error {
// Get all proof requests that are currently in the WITNESSGEN state.
reqs, err := l.db.GetAllProofsWithStatus(proofrequest.StatusWITNESSGEN)
if err != nil {
return err
}
for _, req := range reqs {
// If the request has been in the WITNESSGEN state for longer than the timeout, set status to FAILED.
// This is a catch-all in case the witness generation state update failed.
if req.LastUpdatedTime+uint64(WITNESSGEN_TIMEOUT.Seconds()) < uint64(time.Now().Unix()) {
// Retry the request if it timed out.
l.RetryRequest(req, ProofStatusResponse{})
}
}

return nil
}

// Retry a proof request. Sets the status of a proof to FAILED and retries the proof based on the optional proof status response.
// If an error response is received:
// - Range Proof: Split in two if the block range is > 1. Retry the same request if range is 1 block.
Expand Down Expand Up @@ -297,13 +316,13 @@ func (l *L2OutputSubmitter) makeProofRequest(proofType proofrequest.Type, jsonBo
}
req.Header.Set("Content-Type", "application/json")

client := &http.Client{Timeout: WITNESS_GEN_TIMEOUT}
client := &http.Client{Timeout: WITNESSGEN_TIMEOUT}
resp, err := client.Do(req)
if err != nil {
if netErr, ok := err.(net.Error); ok && netErr.Timeout() {
l.Log.Error("Witness generation request timed out", "err", err)
l.Metr.RecordWitnessGenFailure("Timeout")
return nil, fmt.Errorf("request timed out after %s: %w", WITNESS_GEN_TIMEOUT, err)
return nil, fmt.Errorf("request timed out after %s: %w", WITNESSGEN_TIMEOUT, err)
}
return nil, fmt.Errorf("failed to send request: %w", err)
}
Expand Down
15 changes: 0 additions & 15 deletions proposer/op/proposer/range.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,21 +187,6 @@ func (l *L2OutputSubmitter) GetRangeProofBoundaries(ctx context.Context) error {

spans := l.SplitRangeBasic(newL2StartBlock, newL2EndBlock)

// // Check if the safeDB is activated on the L2 node. If it is, we use the safeHead based range
// // splitting algorithm. Otherwise, we use the simple range splitting algorithm.
// safeDBActivated, err := l.isSafeDBActivated(ctx, rollupClient)
// if err != nil {
// l.Log.Warn("safeDB is not activated. Using simple range splitting algorithm.", "err", err)
// }
// if safeDBActivated {
// safeHeadSpans, err := l.SplitRangeBasedOnSafeHeads(ctx, newL2StartBlock, newL2EndBlock)
// if err == nil {
// spans = safeHeadSpans
// } else {
// l.Log.Warn("failed to split range based on safe heads, using basic range splitting", "err", err)
// }
// }

// Add each span to the DB. If there are no spans, we will not create any proofs.
for _, span := range spans {
err := l.db.NewEntry(proofrequest.TypeSPAN, span.Start, span.End)
Expand Down
Loading

0 comments on commit ad870d8

Please sign in to comment.