From 0551f7a3201d9eb6999ae04e1d290e839d07fc4c Mon Sep 17 00:00:00 2001
From: lklimek <842586+lklimek@users.noreply.github.com>
Date: Thu, 14 Sep 2023 12:53:28 +0200
Subject: [PATCH 1/3] fix(evidence): send evidence only once (#683)

Right now, evidence is sent to all peers, which is not necessary and can
generate a huge load during a chain halt.

Reproduction scenario:

1. Stop the majority of chain nodes (all except one)
2. Remove the WAL from the stopped nodes
3. Break the code so that the chain halts
4. Start the stopped nodes

As a result, these nodes will generate a huge number of evidence messages
and send them to all peers repeatedly. On a big enough network, this
causes significant load.
---
 internal/evidence/reactor.go | 119 ++++++++++++++++-------------------
 internal/p2p/router.go       |   2 +-
 2 files changed, 54 insertions(+), 67 deletions(-)

diff --git a/internal/evidence/reactor.go b/internal/evidence/reactor.go
index 79478761f8..ff8f6b6309 100644
--- a/internal/evidence/reactor.go
+++ b/internal/evidence/reactor.go
@@ -4,11 +4,9 @@ import (
 	"context"
 	"fmt"
 	"runtime/debug"
-	"time"
 
 	sync "github.com/sasha-s/go-deadlock"
 
-	clist "github.com/dashpay/tenderdash/internal/libs/clist"
 	"github.com/dashpay/tenderdash/internal/p2p"
 	"github.com/dashpay/tenderdash/libs/log"
 	"github.com/dashpay/tenderdash/libs/service"
@@ -22,12 +20,6 @@ const (
 	EvidenceChannel = p2p.ChannelID(0x38)
 
 	maxMsgSize = 1048576 // 1MB TODO make it configurable
-
-	// broadcast all uncommitted evidence this often. This sets when the reactor
-	// goes back to the start of the list and begins sending the evidence again.
-	// Most evidence should be committed in the very next block that is why we wait
-	// just over the block production rate before sending evidence again.
-	broadcastEvidenceIntervalS = 10
 )
 
 // GetChannelDescriptor produces an instance of a descriptor for this
@@ -49,6 +41,8 @@ type Reactor struct {
 	evpool    *Pool
 	chCreator p2p.ChannelCreator
 
+	evidenceCh p2p.Channel
+
 	peerEvents p2p.PeerEventSubscriber
 
 	mtx sync.Mutex
@@ -82,14 +76,14 @@ func NewReactor(
 // envelopes on each. In addition, it also listens for peer updates and handles
 // messages on that p2p channel accordingly. The caller must be sure to execute
 // OnStop to ensure the outbound p2p Channels are closed. No error is returned.
-func (r *Reactor) OnStart(ctx context.Context) error {
-	ch, err := r.chCreator(ctx, GetChannelDescriptor())
+func (r *Reactor) OnStart(ctx context.Context) (err error) {
+	r.evidenceCh, err = r.chCreator(ctx, GetChannelDescriptor())
 	if err != nil {
 		return err
 	}
 
-	go r.processEvidenceCh(ctx, ch)
-	go r.processPeerUpdates(ctx, r.peerEvents(ctx, "evidence"), ch)
+	go r.processEvidenceCh(ctx)
+	go r.processPeerUpdates(ctx, r.peerEvents(ctx, "evidence"))
 
 	return nil
 }
@@ -111,23 +105,30 @@ func (r *Reactor) handleEvidenceMessage(ctx context.Context, envelope *p2p.Envel
 		// Evidence is sent and received one by one
 		ev, err := types.EvidenceFromProto(msg)
 		if err != nil {
-			logger.Error("failed to convert evidence", "err", err)
+			logger.Error("failed to convert evidence", "error", err)
 			return err
 		}
-		if err := r.evpool.AddEvidence(ctx, ev); err != nil {
-			// If we're given invalid evidence by the peer, notify the router that
-			// we should remove this peer by returning an error.
-			if _, ok := err.(*types.ErrInvalidEvidence); ok {
-				return err
+
+		// If the evidence is already pending or committed, we don't need to
+		// broadcast it again.
+ if !r.evpool.isPending(ev) && !r.evpool.isCommitted(ev) { + if err := r.evpool.AddEvidence(ctx, ev); err != nil { + // If we're given invalid evidence by the peer, notify the router that + // we should remove this peer by returning an error. + if _, ok := err.(*types.ErrInvalidEvidence); ok { + return err + } + logger.Error("failed to add evidence", "error", err) } + return r.broadcastEvidence(ctx, *msg, r.evidenceCh) } + logger.Debug("evidence already pending", "evidence", ev) + return nil default: return fmt.Errorf("received unknown message: %T", msg) } - - return nil } // handleMessage handles an Envelope sent from a peer on a specific p2p Channel. @@ -159,13 +160,13 @@ func (r *Reactor) handleMessage(ctx context.Context, envelope *p2p.Envelope) (er // processEvidenceCh implements a blocking event loop where we listen for p2p // Envelope messages from the evidenceCh. -func (r *Reactor) processEvidenceCh(ctx context.Context, evidenceCh p2p.Channel) { - iter := evidenceCh.Receive(ctx) +func (r *Reactor) processEvidenceCh(ctx context.Context) { + iter := r.evidenceCh.Receive(ctx) for iter.Next(ctx) { envelope := iter.Envelope() if err := r.handleMessage(ctx, envelope); err != nil { r.logger.Error("failed to process message", "ch_id", envelope.ChannelID, "envelope", envelope, "err", err) - if serr := evidenceCh.SendError(ctx, p2p.PeerError{ + if serr := r.evidenceCh.SendError(ctx, p2p.PeerError{ NodeID: envelope.From, Err: err, }); serr != nil { @@ -186,7 +187,7 @@ func (r *Reactor) processEvidenceCh(ctx context.Context, evidenceCh p2p.Channel) // connects/disconnects frequently from the broadcasting peer(s). // // REF: https://github.com/tendermint/tendermint/issues/4727 -func (r *Reactor) processPeerUpdate(ctx context.Context, peerUpdate p2p.PeerUpdate, evidenceCh p2p.Channel) { +func (r *Reactor) processPeerUpdate(ctx context.Context, peerUpdate p2p.PeerUpdate) { r.logger.Debug("received peer update", "peer", peerUpdate.NodeID, "status", peerUpdate.Status) r.mtx.Lock() @@ -209,7 +210,7 @@ func (r *Reactor) processPeerUpdate(ctx context.Context, peerUpdate p2p.PeerUpda if !ok { pctx, pcancel := context.WithCancel(ctx) r.peerRoutines[peerUpdate.NodeID] = pcancel - go r.broadcastEvidenceLoop(pctx, peerUpdate.NodeID, evidenceCh) + go r.syncEvidence(pctx, peerUpdate.NodeID) } case p2p.PeerStatusDown: @@ -227,31 +228,23 @@ func (r *Reactor) processPeerUpdate(ctx context.Context, peerUpdate p2p.PeerUpda // processPeerUpdates initiates a blocking process where we listen for and handle // PeerUpdate messages. When the reactor is stopped, we will catch the signal and // close the p2p PeerUpdatesCh gracefully. -func (r *Reactor) processPeerUpdates(ctx context.Context, peerUpdates *p2p.PeerUpdates, evidenceCh p2p.Channel) { +func (r *Reactor) processPeerUpdates(ctx context.Context, peerUpdates *p2p.PeerUpdates) { for { select { case peerUpdate := <-peerUpdates.Updates(): - r.processPeerUpdate(ctx, peerUpdate, evidenceCh) + r.processPeerUpdate(ctx, peerUpdate) case <-ctx.Done(): return } } } -// broadcastEvidenceLoop starts a blocking process that continuously reads pieces -// of evidence off of a linked-list and sends the evidence in a p2p Envelope to -// the given peer by ID. This should be invoked in a goroutine per unique peer +// syncEvidence starts a blocking process that sends all evidence to a newly +// connected peer. This should be invoked in a goroutine per unique peer // ID via an appropriate PeerUpdate. 
The goroutine exits either when
// the reactor signals it to stop (the context is canceled) or once all pending
// evidence has been sent to the peer.
-//
-// TODO: This should be refactored so that we do not blindly gossip evidence
-// that the peer has already received or may not be ready for.
-//
-// REF: https://github.com/tendermint/tendermint/issues/4727
-func (r *Reactor) broadcastEvidenceLoop(ctx context.Context, peerID types.NodeID, evidenceCh p2p.Channel) {
-	var next *clist.CElement
-
+func (r *Reactor) syncEvidence(ctx context.Context, peerID types.NodeID) {
 	defer func() {
 		r.mtx.Lock()
 		delete(r.peerRoutines, peerID)
@@ -266,25 +259,9 @@ func (r *Reactor) broadcastEvidenceLoop(ctx context.Context, peerID types.NodeID
 		}
 	}()
 
-	timer := time.NewTimer(0)
-	defer timer.Stop()
-
-	for {
-		// This happens because the CElement we were looking at got garbage
-		// collected (removed). That is, .NextWaitChan() returned nil. So we can go
-		// ahead and start from the beginning.
-		if next == nil {
-			select {
-			case <-r.evpool.EvidenceWaitChan(): // wait until next evidence is available
-				if next = r.evpool.EvidenceFront(); next == nil {
-					continue
-				}
-
-			case <-ctx.Done():
-				return
-			}
-		}
+	next := r.evpool.EvidenceFront()
 
+	for next != nil {
 		ev := next.Value.(types.Evidence)
 		evProto, err := types.EvidenceToProto(ev)
 		if err != nil {
@@ -296,25 +273,35 @@ func (r *Reactor) broadcastEvidenceLoop(ctx context.Context, peerID types.NodeID
 		// peer may receive this piece of evidence multiple times if it added and
 		// removed frequently from the broadcasting peer.
 
-		if err := evidenceCh.Send(ctx, p2p.Envelope{
+		if err := r.evidenceCh.Send(ctx, p2p.Envelope{
 			To:      peerID,
 			Message: evProto,
 		}); err != nil {
 			return
 		}
 
-		r.logger.Debug("gossiped evidence to peer", "evidence", ev, "peer", peerID)
+		r.logger.Debug("evidence sync: sent evidence to peer", "evidence", ev, "peer", peerID)
 
 		select {
-		case <-timer.C:
-			// start from the beginning after broadcastEvidenceIntervalS seconds
-			timer.Reset(time.Second * broadcastEvidenceIntervalS)
-			next = nil
-
-		case <-next.NextWaitChan():
-			next = next.Next()
-
 		case <-ctx.Done():
 			return
+		default:
 		}
+
+		next = next.Next()
 	}
+	r.logger.Debug("evidence sync finished", "peer", peerID)
+}
+
+// broadcastEvidence sends new evidence to all connected peers.
+func (r *Reactor) broadcastEvidence(ctx context.Context, evidence tmproto.Evidence, evidenceCh p2p.Channel) error { + + if err := evidenceCh.Send(ctx, p2p.Envelope{ + Broadcast: true, + Message: &evidence, + }); err != nil { + return fmt.Errorf("failed to broadcast evidence: %w", err) + } + r.logger.Debug("evidence broadcasted", "evidence", evidence) + + return nil } diff --git a/internal/p2p/router.go b/internal/p2p/router.go index 7adc3d0dfa..ed6e2c0d52 100644 --- a/internal/p2p/router.go +++ b/internal/p2p/router.go @@ -359,7 +359,7 @@ func (r *Router) routeChannel( r.metrics.RouterPeerQueueSend.Observe(time.Since(start).Seconds()) case <-q.closed(): - r.logger.Debug("dropping message for unconnected peer", "peer", envelope.To, "channel", chID) + r.logger.Debug("dropping message on closed channel", "peer", envelope.To, "channel", chID) case <-ctx.Done(): return From 8d75f16db250dfda9e940cddc4ea9731b264ba40 Mon Sep 17 00:00:00 2001 From: lklimek <842586+lklimek@users.noreply.github.com> Date: Thu, 14 Sep 2023 14:59:34 +0200 Subject: [PATCH 2/3] fix: panic verifying evidence due to missing pubkey (#684) --- internal/evidence/verify.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/evidence/verify.go b/internal/evidence/verify.go index 8e4d7998dc..bb3b4bd4dd 100644 --- a/internal/evidence/verify.go +++ b/internal/evidence/verify.go @@ -98,6 +98,9 @@ func VerifyDuplicateVote(e *types.DuplicateVoteEvidence, chainID string, valSet } proTxHash := val.ProTxHash pubKey := val.PubKey + if pubKey == nil { + return fmt.Errorf("we don't have a public key of validator %X at height %d", proTxHash, e.Height()) + } // H/R/S must be the same if e.VoteA.Height != e.VoteB.Height || From eb81c156a70ee9411dfee72280d3cfa2815ee05d Mon Sep 17 00:00:00 2001 From: Lukasz Klimek <842586+lklimek@users.noreply.github.com> Date: Thu, 14 Sep 2023 15:31:15 +0200 Subject: [PATCH 3/3] chore(release): update changelog and version to 0.13.1 --- CHANGELOG.md | 193 +++++++++++++++++++++------------------------ version/version.go | 2 +- 2 files changed, 91 insertions(+), 104 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 596afdac06..c2ebdd8ff3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [0.13.1] - 2023-09-14 + +### Bug Fixes + +- Send evidence only once (#683) +- Panic verifying evidence due to missing pubkey (#684) + ## [0.13.0] - 2023-09-13 ### Bug Fixes @@ -15,6 +22,7 @@ - Update changelog and version to 0.13.0-dev.2 (#664) - Improve logs (#679) - Update mocks and .proto files after merge 12 to 13 +- Update changelog and version to 0.13.0 ### Refactor @@ -37,6 +45,7 @@ ### Miscellaneous Tasks +- Update changelog and version to 0.12.0 - Update changelog and version to 0.13.0-dev.1 (#651) - Update changelog and version to 0.11.3 @@ -46,15 +55,21 @@ - Add ability to write logs in a file (#632) -### Miscellaneous Tasks - -- Catch up the changes from master into v0.11 dev (#629) - ### Backport - Catch up with the latest commits from v0.11 to v0.12 (#631) - Catch up the changes from v0.11 to v0.12 (#636) +## [0.11.2] - 2023-05-03 + +### Bug Fixes + +- Invalid threshold for `LLMQType_25_67` (#628) + +### Miscellaneous Tasks + +- Catch up the changes from master into v0.11 dev (#629) + ## [0.11.1] - 2023-05-02 ### Bug Fixes @@ -8129,6 +8144,11 @@ - Fix up doc to mention length of digits +### Documentation + +- Add note about putting GOPATH/bin on PATH +- Correction, closes #910 + ### Proposal - New Makefile standard template (#168) @@ -8139,22 +8159,50 @@ ### Testing +- Sunset 
tmlibs/process.Process +- Wait for node heights before checking app hash +- Fix ensureABCIIsUp +- Fix test/app/counter_test.sh - Longer timeout - Add some timeouts +### Abci-cli + +- Print OK if code is 0 +- Prefix flag variables with flag + +### Adr + +- Update 007 trust metric usage + ### All - Fix vet issues with build tags, formatting +### Appveyor + +- Use make + ### Batch - Progress ### Blockchain +- Add tests and more docs for BlockStore +- Update store comments +- Updated store docs/comments from review +- Deduplicate store header value tests +- Less fragile and involved tests for blockstore +- Block creator helper for compressing tests as per @ebuchman +- Note about store tests needing simplification ... - Test fixes - Update for new state +### Client + +- Use vars for retry intervals + ### Cmd/abci-cli - Use a single connection per session @@ -8172,6 +8220,7 @@ ### Common +- Comments for Service - No more relying on math/rand.DefaultSource - Use names prng and mrand - Use genius simplification of tests from @ebuchman @@ -8185,6 +8234,7 @@ ### Consensus +- Fix typo on ticker.go documentation - Fix makeBlockchainFromWAL - Remove log stmt. closes #987 - Note about duplicate evidence @@ -8202,6 +8252,10 @@ - Fix c and go iterators - Simplify exists check, fix IsKeyInDomain signature, Iterator Close +### Dummy + +- Include app.key tag + ### Evidence - More funcs in store.go @@ -8210,12 +8264,29 @@ - Reactor test - Reactor test +### Glide + +- Update grpc version + +### Linter + +- Enable in CI & make deterministic + ### Mempool +- Implement Mempool.CloseWAL +- Return error on cached txs +- Assert -> require in test - Remove Peer interface. use p2p.Peer +### P2p + +- Exponential backoff on reconnect. closes #939 + ### P2p/trust +- Split into multiple files and improve function order +- Lock on Copy() - Remove extra channels ### Protoc @@ -8224,9 +8295,14 @@ ### Rpc +- Make time human readable. closes #926 - GetHeight helper function - Fix getHeight +### Shame + +- Forgot to add new code pkg + ### Spec - Fixes from review @@ -8241,6 +8317,16 @@ ### Types +- Use data.Bytes directly in type.proto via gogo/protobuf. wow +- Consolidate some file +- Add note about ReadMessage having no cap +- RequestBeginBlock includes absent and byzantine validators +- Drop uint64 from protobuf.go +- IsOK() +- Int32 with gogo int +- Fix for broken customtype int in gogo +- Add MarshalJSON funcs for Response types with a Code +- Add UnmarshalJSON funcs for Response types - Compile type assertions to avoid sneaky runtime surprises - Check ResponseCheckTx too - Update String() test to assert Prevote type @@ -8261,105 +8347,6 @@ - Tendermint specification -## [0.14.0] - 2017-12-12 - -### Adr - -- Update 007 trust metric usage - -### Appveyor - -- Use make - -### Blockchain - -- Add tests and more docs for BlockStore -- Update store comments -- Updated store docs/comments from review -- Deduplicate store header value tests -- Less fragile and involved tests for blockstore -- Block creator helper for compressing tests as per @ebuchman -- Note about store tests needing simplification ... - -### Consensus - -- Fix typo on ticker.go documentation - -### Linter - -- Enable in CI & make deterministic - -### P2p - -- Exponential backoff on reconnect. 
closes #939 - -## [0.13.0] - 2017-12-06 - -### Documentation - -- Add note about putting GOPATH/bin on PATH -- Correction, closes #910 - -### Testing - -- Sunset tmlibs/process.Process -- Wait for node heights before checking app hash -- Fix ensureABCIIsUp -- Fix test/app/counter_test.sh - -### Abci-cli - -- Print OK if code is 0 -- Prefix flag variables with flag - -### Client - -- Use vars for retry intervals - -### Common - -- Comments for Service - -### Dummy - -- Include app.key tag - -### Glide - -- Update grpc version - -### Mempool - -- Implement Mempool.CloseWAL -- Return error on cached txs -- Assert -> require in test - -### P2p/trust - -- Split into multiple files and improve function order -- Lock on Copy() - -### Rpc - -- Make time human readable. closes #926 - -### Shame - -- Forgot to add new code pkg - -### Types - -- Use data.Bytes directly in type.proto via gogo/protobuf. wow -- Consolidate some file -- Add note about ReadMessage having no cap -- RequestBeginBlock includes absent and byzantine validators -- Drop uint64 from protobuf.go -- IsOK() -- Int32 with gogo int -- Fix for broken customtype int in gogo -- Add MarshalJSON funcs for Response types with a Code -- Add UnmarshalJSON funcs for Response types - ## [0.12.1] - 2017-11-28 ### Documentation diff --git a/version/version.go b/version/version.go index 1cc234dd30..70dec25555 100644 --- a/version/version.go +++ b/version/version.go @@ -9,7 +9,7 @@ var ( const ( // TMVersionDefault is the used as the fallback version for Tenderdash // when not using git describe. It is formatted with semantic versioning. - TMVersionDefault = "0.13.0" + TMVersionDefault = "0.13.1" // ABCISemVer is the semantic version of the ABCI library ABCISemVer = "0.23.0"