From c388f69d4eddd058dd4a207e48612dc4baaf2e1b Mon Sep 17 00:00:00 2001 From: Xiaoxi Chen Date: Thu, 19 Dec 2024 15:51:28 +0800 Subject: [PATCH] Only call cp_flush for those consumer paticipated in this cp. If a consumer registered after a cp goes to flushing state, the on_switchover_cp cb will not be called for this consumer. In this CP, the ctx for this consumer is nullptr as the consumer never participant in the cp. Previous code calling cp_flush for every consumer, leaving the duty of properly handle the nullptr returned by cp->context(svc_id) to consumer. However, none of the existing consumer handled the case. As a result, we hit an occurance that Index generate a CP sololy, but before the cp fully flushed, other consumer registered and be called into cp_flush(), the replication service, doesnt properly handled the nullptr like below, `get_repl_dev_ctx` was called with this_ptr is null, it is dangerous as invalid memory get accessed. This change is a breaking change for consumer like HO so bump up the version. HomeObject participant the CP as CLIENT, current implementation of HO always returns nullptr for `on_switchover_cp` which will result the CLIENT be excluded from cp_flush after this commit merged. callstack: ``` homestore::ReplSvcCPContext::get_repl_dev_ctx (this=0x0, dev=0x56010ab52b00) at /home/ubuntu/HomeStore/src/lib/replication/service/raft_repl_service.cpp:521 0x0000560106d58f1e in homestore::RaftReplServiceCPHandler::cp_flush (this=, cp=0x56010a467940) at /home/ubuntu/HomeStore/src/lib/replication/service/raft_repl_service.cpp:549 ``` code: ``` auto cp_ctx = s_cast< ReplSvcCPContext* >(cp->context(cp_consumer_t::REPLICATION_SVC)); ... auto dev_ctx = cp_ctx->get_repl_dev_ctx(repl_dev.get()); ``` Signed-off-by: Xiaoxi Chen --- conanfile.py | 2 +- src/lib/checkpoint/cp_mgr.cpp | 9 ++++++--- src/lib/replication/service/generic_repl_svc.cpp | 4 +++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/conanfile.py b/conanfile.py index f4d5fc38b..087d8ca98 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.6.0" + version = "6.6.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 7072d7c91..7fd6f7460 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ b/src/lib/checkpoint/cp_mgr.cpp @@ -187,7 +187,8 @@ folly::Future< bool > CPManager::do_trigger_cp_flush(bool force, bool flush_on_s // sealer should be the first one to switch over auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER]; if (sealer_cp) { - new_cp->m_contexts[(size_t)cp_consumer_t::SEALER] = std::move(sealer_cp->on_switchover_cp(cur_cp.get(), new_cp)); + new_cp->m_contexts[(size_t)cp_consumer_t::SEALER] = + std::move(sealer_cp->on_switchover_cp(cur_cp.get(), new_cp)); } // switch over other consumers for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { @@ -227,7 +228,8 @@ void CPManager::cp_start_flush(CP* cp) { for (size_t svcid = 0; svcid < (size_t)cp_consumer_t::SENTINEL; svcid++) { if (svcid == (size_t)cp_consumer_t::SEALER) { continue; } auto& consumer = m_cp_cb_table[svcid]; - if (consumer) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } + bool participated = (cp->m_contexts[svcid] != nullptr); + if (consumer && participated) { futs.emplace_back(std::move(consumer->cp_flush(cp))); } } folly::collectAllUnsafe(futs).thenValue([this, cp](auto) { @@ -235,7 +237,8 @@ void CPManager::cp_start_flush(CP* cp) { // at last as the cp_lsn updated here. Other component should // at least flushed to cp_lsn. auto& sealer_cp = m_cp_cb_table[(size_t)cp_consumer_t::SEALER]; - if (sealer_cp) { sealer_cp->cp_flush(cp).wait(); } + bool participated = (cp->m_contexts[(size_t)cp_consumer_t::SEALER] != nullptr); + if (sealer_cp && participated) { sealer_cp->cp_flush(cp).wait(); } // All consumers have flushed for the cp on_cp_flush_done(cp); }); diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 9aa2c044d..f5671cb16 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -152,7 +152,9 @@ AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, const rep return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); } -std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } +std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { + return std::make_unique< CPContext >(new_cp); +} folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) {