Skip to content

Commit

Permalink
Fix shutdown sequence with cp mgr.
Browse files Browse the repository at this point in the history
  • Loading branch information
sanebay committed Jan 9, 2024
1 parent 7801ac6 commit a513a08
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 49 deletions.
1 change: 1 addition & 0 deletions src/include/homestore/checkpoint/cp_mgr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ class CPManager {
superblk< cp_mgr_super_block > m_sb;
std::vector< iomgr::io_fiber_t > m_cp_io_fibers;
iomgr::timer_handle_t m_cp_timer_hdl;
std::atomic< bool > m_shutdown_initiated{false};

public:
CPManager();
Expand Down
4 changes: 1 addition & 3 deletions src/include/homestore/index/index_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

void destroy() override {
auto cpg = hs()->cp_mgr().cp_guard();
auto op_context = (void*)cpg.context(cp_consumer_t::INDEX_SVC);
Btree< K, V >::destroy_btree(op_context);
Btree< K, V >::destroy_btree(nullptr);
}

btree_status_t init() {
Expand Down
3 changes: 2 additions & 1 deletion src/lib/blkalloc/bitmap_blk_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ bool BitmapBlkAllocator::is_blk_alloced_on_disk(const BlkId& b, bool use_lock) c

BlkAllocStatus BitmapBlkAllocator::alloc_on_disk(BlkId const& bid) {
if (!is_persistent()) {
//for non-persistent bitmap nothing is needed to do. So always return success
// for non-persistent bitmap nothing is needed to do. So always return success
return BlkAllocStatus::SUCCESS;
}

Expand Down Expand Up @@ -149,6 +149,7 @@ void BitmapBlkAllocator::free_on_disk(BlkId const& bid) {
"Expected disk bits to set blk num {} num blks {}", b.blk_num(), b.blk_count());
}
}

m_disk_bm->reset_bits(b.blk_num(), b.blk_count());
}
};
Expand Down
21 changes: 19 additions & 2 deletions src/lib/checkpoint/cp_mgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,17 @@ void CPManager::shutdown() {
LOGINFO("Stopping cp timer");
iomanager.cancel_timer(m_cp_timer_hdl, true);
m_cp_timer_hdl = iomgr::null_timer_handle;
m_shutdown_initiated = true;

auto cp = get_cur_cp();
delete (cp);
LOGINFO("Trigger cp flush");
auto fut = trigger_cp_flush(true /* force */);
auto success = std::move(fut).get();
HS_REL_ASSERT_EQ(success, true, "CP Flush failed");
LOGINFO("Trigger cp done");

delete (m_cur_cp);
rcu_xchg_pointer(&m_cur_cp, nullptr);

m_metrics.reset();
if (m_wd_cp) {
m_wd_cp->stop();
Expand Down Expand Up @@ -220,12 +227,22 @@ void CPManager::on_cp_flush_done(CP* cp) {
m_sb.write();

cleanup_cp(cp);

// Setting promise will cause the CP manager destructor to cleanup
// before getting a chance to do the checking if shutdown has been
// initiated or not.
auto shutdown_initiated = m_shutdown_initiated.load();
cp->m_comp_promise.setValue(true);

m_in_flush_phase = false;
m_wd_cp->reset_cp();
delete cp;

if (shutdown_initiated) {
// If shutdown initiated, dont trigger another CP.
return;
}

// Trigger CP in case there is one back to back CP
{
auto cur_cp = cp_guard();
Expand Down
5 changes: 3 additions & 2 deletions src/lib/homestore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,9 @@ void HomeStore::shutdown() {

LOGINFO("Homestore shutdown is started");

m_cp_mgr->shutdown();
m_cp_mgr.reset();

if (has_repl_data_service()) {
s_cast< GenericReplService* >(m_repl_service.get())->stop();
m_repl_service.reset();
Expand All @@ -254,8 +257,6 @@ void HomeStore::shutdown() {

m_dev_mgr->close_devices();
m_dev_mgr.reset();
m_cp_mgr->shutdown();
m_cp_mgr.reset();

HomeStore::reset_instance();
LOGINFO("Homestore is completed its shutdown");
Expand Down
5 changes: 0 additions & 5 deletions src/lib/index/index_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,6 @@ void IndexService::start() {

void IndexService::stop() {
std::unique_lock lg(m_index_map_mtx);
auto fut = homestore::hs()->cp_mgr().trigger_cp_flush(true /* force */);
auto success = std::move(fut).get();
HS_REL_ASSERT_EQ(success, true, "CP Flush failed");
LOGINFO("CP Flush completed");

for (auto [id, tbl] : m_index_map) {
tbl->destroy();
}
Expand Down
46 changes: 10 additions & 36 deletions src/tests/test_meta_blk_mgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,12 @@ class VMetaBlkMgrTest : public ::testing::Test {
}

void restart_homestore() {
auto before_services_starting_cb = [this]() {
register_client();
register_client_inlcuding_dependencies();
};
test_common::HSTestHelper::start_homestore("test_meta_blk_mgr", {{HS_SERVICE::META, {.size_pct = 85.0}}},
nullptr /* before_svc_start_cb */, true /* restart */);
std::move(before_services_starting_cb), true /* restart */);
}

uint64_t io_cnt() const { return m_update_cnt + m_wrt_cnt + m_rm_cnt; }
Expand Down Expand Up @@ -457,33 +461,17 @@ class VMetaBlkMgrTest : public ::testing::Test {
return (aligned_rand(re) == s_cast< uint8_t >(1));
}

void recover() {
// TODO: This scan_blks and recover should be replaced with actual TestHelper::start_homestore with restart
// on. That way, we don't need to simulate all these calls here
// do recover and callbacks will be triggered;
m_cb_blks.clear();
hs()->cp_mgr().shutdown();
hs()->cp_mgr().start(false /* first_time_boot */);
m_mbm->recover(false);
}

void recover_with_on_complete() {
// TODO: This scan_blks and recover should be replaced with actual TestHelper::start_homestore with restart
// on. That way, we don't need to simulate all these calls here
// do recover and callbacks will be triggered;
// restart will cause recovery and callbacks will be triggered
m_cb_blks.clear();
hs()->cp_mgr().shutdown();
hs()->cp_mgr().start(false /* first_time_boot */);
m_mbm->recover(true);
restart_homestore();
}

void validate() {
// verify received blks via callbaks are all good;
verify_cb_blks();
}

void scan_blks() { m_mbm->scan_meta_blks(); }

meta_op_type get_op() {
static thread_local bool keep_remove{false};
// if we hit some high watermark, remove the sbs until hit some low watermark;
Expand Down Expand Up @@ -742,9 +730,6 @@ TEST_F(VMetaBlkMgrTest, random_dependency_test) {

iomanager.iobuf_free(buf);

// simulate reboot case that MetaBlkMgr will scan the disk for all the metablks that were written;
this->scan_blks();

this->recover_with_on_complete();

std::unordered_map< meta_sub_type, int > actual_first_cb_order_map;
Expand Down Expand Up @@ -816,10 +801,7 @@ TEST_F(VMetaBlkMgrTest, random_load_test) {

this->do_rand_load();

// simulate reboot case that MetaBlkMgr will scan the disk for all the metablks that were written;
this->scan_blks();

this->recover();
this->recover_with_on_complete();

this->validate();

Expand Down Expand Up @@ -861,11 +843,7 @@ TEST_F(VMetaBlkMgrTest, RecoveryFromBadData) {
// Then do a recovery, the data read from disk should be uncompressed and match the size we saved in its metablk
// header. If size mismatch, it will hit assert failure;
//

// simulate reboot case that MetaBlkMgr will scan the disk for all the metablks that were written;
this->scan_blks();

this->recover();
this->recover_with_on_complete();

this->validate();

Expand All @@ -892,11 +870,7 @@ TEST_F(VMetaBlkMgrTest, CompressionBackoff) {
// Then do a recovery, the data read from disk should be uncompressed and match the size we saved in its metablk
// header. If size mismatch, it will hit assert failure;
//

// simulate reboot case that MetaBlkMgr will scan the disk for all the metablks that were written;
this->scan_blks();

this->recover();
this->recover_with_on_complete();

this->validate();

Expand Down

0 comments on commit a513a08

Please sign in to comment.