Skip to content

Commit

Permalink
fix twice call of leave
Browse files Browse the repository at this point in the history
Removed From cluster will be called twice when committing config_change
log and journal_type_t::HS_CTRL_DESTROY in the moved out member, so the
destroy future will be setvalue twice , which will lead to an error.
this PR fixes this bug
  • Loading branch information
JacksonYao287 authored and Hooper9973 committed Dec 18, 2024
1 parent 6756b81 commit c4a0697
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 25 deletions.
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "6.5.26"
version = "6.5.27"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down
31 changes: 15 additions & 16 deletions src/lib/replication/repl_dev/raft_repl_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1183,7 +1183,7 @@ nuraft::ptr< nuraft::log_store > RaftReplDev::load_log_store() { return m_data_j

int32_t RaftReplDev::server_id() { return m_raft_server_id; }

bool RaftReplDev::is_destroy_pending() const { return (m_rd_sb->destroy_pending == 0x1); }
bool RaftReplDev::is_destroy_pending() const { return (*m_stage.access().get() == repl_dev_stage_t::DESTROYED); }
bool RaftReplDev::is_destroyed() const { return (*m_stage.access().get() == repl_dev_stage_t::PERMANENT_DESTROYED); }

/////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides ////////////////////////////////////
Expand All @@ -1207,6 +1207,19 @@ void RaftReplDev::permanent_destroy() {
}

void RaftReplDev::leave() {
// this will be called in 3 cases :
// 1. commit log entry of journal_type_t::HS_CTRL_DESTROY
// 2. it is removed from the cluster and the new config(excluding this node) is being committed on this node
// 3. it is removed from the cluster , but the node is down and new config log(excluding this node) is not
// replicated to this removed node. when the node restart, leader will not send any append entry to this node,
// since it is not a member of the raft group. it will become a condidate and send request-vote request to other
// members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node
// is no longer a member of the raft group.

// leave() will never be called concurrently, since config change and journal_type_t::HS_CTRL_DESTROY are all log
// entry, which will be committed sequentially.
if (is_destroy_pending()) return;

// We update that this repl_dev in destroyed state, actual clean up of resources happen in reaper thread later
m_stage.update([](auto* stage) { *stage = repl_dev_stage_t::DESTROYED; });
m_destroyed_time = Clock::now();
Expand Down Expand Up @@ -1288,21 +1301,7 @@ std::pair< bool, nuraft::cb_func::ReturnCode > RaftReplDev::handle_raft_event(nu
return {true, ret};
}

case nuraft::cb_func::Type::RemovedFromCluster: {
// a node will reach here when :
// 1. it is removed from the cluster and the new config(excluding this node) is being committed on this node
// 2. it is removed from the cluster , but the node is down and new config log(excluding this node) is not
// replicated to this removed node. when the node restart, leader will not send any append entry to this node,
// since it is not a member of the raft group. it will become a condidate and send request-vote request to other
// members of this raft group. a member will send RemovedFromCluster to the node if this member finds the node
// is no longer a member of the raft group.

// this will lazily cleanup the group
// TODO:cleanup this repl dev ASAP if necessary.
leave();

return {true, ret};
}
// RemovedFromCluster will be handled in nuraft_mesg::generic_raft_event_handler where leave() is called

// TODO: Add more type handler if necessary
default:
Expand Down
4 changes: 3 additions & 1 deletion src/lib/replication/repl_dev/raft_repl_dev.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,9 @@ class RaftReplDev : public ReplDev,
*
* @param num_reserved_entries The number of reserved entries of the replication log.
*/
void truncate(uint32_t num_reserved_entries) { m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load()); }
void truncate(uint32_t num_reserved_entries) {
m_data_journal->truncate(num_reserved_entries, m_compact_lsn.load());
}

void wait_for_logstore_ready() { m_data_journal->wait_for_log_store_ready(); }

Expand Down
10 changes: 3 additions & 7 deletions src/tests/test_common/raft_repl_test_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,16 +182,12 @@ class TestReplicatedDB : public homestore::ReplDevListener {
return make_async_success<>();
}

static int64_t get_next_lsn(uint64_t& obj_id) {
return obj_id & ((1ULL << 63) - 1);
}
static void set_resync_msg_type_bit(uint64_t& obj_id) {
obj_id |= 1ULL << 63;
}
static int64_t get_next_lsn(uint64_t& obj_id) { return obj_id & ((1ULL << 63) - 1); }
static void set_resync_msg_type_bit(uint64_t& obj_id) { obj_id |= 1ULL << 63; }

int read_snapshot_obj(shared< snapshot_context > context, shared< snapshot_obj > snp_data) override {
auto s = std::dynamic_pointer_cast< nuraft_snapshot_context >(context)->nuraft_snapshot();
if(RaftStateMachine::is_hs_snp_obj(snp_data->offset)) {
if (RaftStateMachine::is_hs_snp_obj(snp_data->offset)) {
LOGERRORMOD(replication, "invalid snapshot offset={}", snp_data->offset);
return -1;
}
Expand Down

0 comments on commit c4a0697

Please sign in to comment.