From 60eea4afa6f059148e5f16ba1e63b894a9553de9 Mon Sep 17 00:00:00 2001 From: koujl <108138320+koujl@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:10:39 +0800 Subject: [PATCH] Add index CR UT for basic merge (#556) Signed-off-by: Jilong Kou --- conanfile.py | 2 +- .../homestore/btree/detail/btree_internal.hpp | 4 +- .../homestore/btree/detail/btree_node.hpp | 21 +- src/include/homestore/index/index_table.hpp | 120 ++++-- src/lib/device/virtual_dev.cpp | 9 + src/lib/homestore.cpp | 2 - src/lib/index/index_cp.cpp | 15 +- src/lib/index/wb_cache.cpp | 166 +++++--- .../test_common/homestore_test_common.hpp | 7 +- src/tests/test_index_crash_recovery.cpp | 375 +++++++++++++----- src/tests/test_scripts/index_test.py | 1 + 11 files changed, 519 insertions(+), 203 deletions(-) diff --git a/conanfile.py b/conanfile.py index 7c0d5aa76..f61f1ef54 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.5.3" + version = "6.5.4" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 67b33b089..8989a2d5d 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -245,7 +245,9 @@ struct BtreeConfig { uint8_t m_split_pct{50}; uint32_t m_max_merge_nodes{3}; #ifdef _PRERELEASE - uint64_t m_max_keys_in_node{0}; + // These are for testing purpose only + uint64_t m_max_keys_in_node{0}; + uint64_t m_min_keys_in_node{0}; #endif bool m_rebalance_turned_on{false}; bool m_merge_turned_on{true}; diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index a3285ef35..988b683cf 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -37,6 +37,7 @@ struct transient_hdr_t { /* these variables are accessed without taking lock and are not expected to change after init */ uint8_t leaf_node{0}; uint64_t max_keys_in_node{0}; + uint64_t min_keys_in_node{0}; // to specify the threshold for triggering merge bool is_leaf() const { return (leaf_node != 0); } }; @@ -116,6 +117,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { m_trans_hdr.leaf_node = is_leaf; #ifdef _PRERELEASE m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node; + m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node; #endif } @@ -299,6 +301,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { template < typename K > K get_first_key() const { + if (total_entries() == 0) { return K{}; } return get_nth_key< K >(0, true); } @@ -333,6 +336,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { // uint32_t total_entries() const { return (has_valid_edge() ? total_entries() + 1 : total_entries()); } uint64_t max_keys_in_node() const { return m_trans_hdr.max_keys_in_node; } + uint64_t min_keys_in_node() const { return m_trans_hdr.min_keys_in_node; } void lock(locktype_t l) const { if (l == locktype_t::READ) { @@ -392,6 +396,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } + + // Should not happen + if (this->is_node_deleted()) { + fmt::format_to(std::back_inserter(str), " **DELETED** "); + } + return str; } @@ -527,15 +537,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); } bool is_merge_needed(const BtreeConfig& cfg) const { -#if 0 #ifdef _PRERELEASE - if (iomgr_flip::instance()->test_flip("btree_merge_node") && occupied_size() < node_data_size) { - return true; - } - - auto ret = iomgr_flip::instance()->get_test_flip< uint64_t >("btree_merge_node_pct"); - if (ret && occupied_size() < (ret.get() * node_data_size() / 100)) { return true; } -#endif + if (min_keys_in_node()) { + return total_entries() < min_keys_in_node(); + } #endif return (occupied_size() < cfg.suggested_min_size()); } diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 86f3a8c86..94b8685a3 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -78,7 +78,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } void destroy() override { - Btree< K, V >::destroy_btree(nullptr); + auto cpg = cp_mgr().cp_guard(); + Btree::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); } @@ -130,13 +131,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { idx_buf->m_dirtied_cp_id = cpg->id(); BtreeNodePtr bn = BtreeNodePtr{n}; - LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); - repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + // Only for interior nodes we need to repair its links + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); + repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); + } if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the // meta_buf with new root as well - on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + on_root_changed(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); } } @@ -223,7 +227,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } - btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { + btree_status_t + on_root_changed(BtreeNodePtr const &new_root, void *context) override { // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ // return btree_status_t::success;} m_sb->root_node = new_root->node_id(); @@ -235,12 +240,12 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf; - wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context)); + wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast(context)); return btree_status_t::success; } btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string()); + BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string()); // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs // to be handled. Get the last key in the node auto const last_parent_key = parent_node->get_last_key< K >(); @@ -250,7 +255,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { parent_node->node_id()); return btree_status_t::not_found; } - BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(), + + // Get all original child ids as a support to check if we are beyond the last child node + std::set orig_child_ids; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_ids.insert(link_info.bnode_id()); + } + BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), last_parent_key.to_string()); // Get the first child node and its link info @@ -275,22 +288,45 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cur_parent = parent_node; BtreeNodeList new_parent_nodes; do { - if (child_node->has_valid_edge() || - (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) { - BT_DBG_ASSERT(is_parent_edge_node, - "Child node={} is an edge node but parent_node={} is not an edge node", - child_node->node_id(), cur_parent->node_id()); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + if (child_node->is_node_deleted()) { + // Edge node is merged, we need to set the current last entry as edge + if (cur_parent->total_entries() > 0) { + auto prev_val = V{}; + cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); + cur_parent->remove(cur_parent->total_entries() - 1); + cur_parent->set_edge_value(prev_val); + BT_LOG(INFO, "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + } else { + BT_LOG(INFO, "Found an empty interior node {} with maybe all childs deleted", + cur_parent->node_id()); + } + } else { + // Update edge and finish + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(), + child_node->to_string()); + child_node->set_next_bnode(empty_bnodeid); + write_node_impl(child_node, cp_ctx); + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } break; } auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), + BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), child_node->to_string(), child_last_key.to_string()); - if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) { - // We have reached the last key, and the parent node doesn't have edge, so we can stop now - break; + // Check if we are beyond the last child node. + // + // There can be cases where the child level merge is successfully persisted but the parent level is not. + // In this case, you may have your rightmost child node with last key greater than the last_parent_key. + // That's why here we have to check if the child node is one of the original child nodes first. + if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) { + if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + break; + } } if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), @@ -312,20 +348,37 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Insert the last key of the child node into parent node - cur_parent->insert(cur_parent->total_entries(), child_last_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (!child_node->is_node_deleted()) { + cur_parent->insert(cur_parent->total_entries(), + child_node->total_entries() > 0 ? child_last_key : last_parent_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->total_entries() == 0) { + // There should be at most one empty child node per parent - if we find one, we should stop here + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(), + child_node->to_string()); + break; + } + } else { + // Node deleted indicates it's freed & no longer used during recovery + BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + } - BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string()); + BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string()); // Move to the next child node - this->unlock_node(child_node, locktype_t::READ); auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); if (next_node_id == empty_bnodeid) { - BT_LOG_ASSERT(false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " - "repair is partial", - child_node->node_id(), parent_node->node_id()); - ret = btree_status_t::not_found; + // This can be a deleted edge node - only check if it is still valid + if (!child_node->is_node_deleted()) { + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " + "repair is partial", + child_node->node_id(), parent_node->node_id()); + ret = btree_status_t::not_found; + } + child_node = nullptr; break; } @@ -333,10 +386,21 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (ret != btree_status_t::success) { BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + child_node = nullptr; break; } } while (true); - this->unlock_node(child_node, locktype_t::READ); + + if (child_node) { + this->unlock_node(child_node, locktype_t::READ); + } + + if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { + // We shouldn't have an empty interior node in the tree, let's delete it. + // The buf will be released by the caller + BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id()); + parent_node->set_node_deleted(); + } if (ret == btree_status_t::success) { ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 3665f13b9..ac49f95dd 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -424,6 +424,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + dev_offset); if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -436,6 +438,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; } #endif + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + chunk->start_offset() + offset_in_chunk); + if (sisl_unlikely(!is_chunk_available(chunk))) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -457,6 +462,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); @@ -479,6 +486,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 0b9e0ed76..feec506c5 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -329,8 +329,6 @@ void HomeStore::shutdown() { #ifdef _PRERELEASE flip::Flip::instance().stop_rpc_server(); #endif - - HomeStore::reset_instance(); LOGINFO("Homestore is completed its shutdown"); } diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 99ba7dbba..d58e33c22 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -261,7 +261,6 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, } if (up_buf) { - DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer"); auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf; #ifndef NDEBUG @@ -279,6 +278,20 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, #endif if (buf->m_up_buffer != real_up_buf) { + if (buf->m_up_buffer) { + buf->m_up_buffer->m_wait_for_down_buffers.decrement(1); +#ifndef NDEBUG + bool found{false}; + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + if (it->lock() == buf) { + buf->m_up_buffer->m_down_buffers.erase(it); + found = true; + break; + } + } + HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif + } real_up_buf->m_wait_for_down_buffers.increment(1); buf->m_up_buffer = real_up_buf; } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index ed5dd7e6d..b17d67dc8 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -289,13 +289,8 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p } icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); } else { - icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal - child_buf->m_up_buffer, // real up buffer - new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer - : new_node_bufs[0]->m_up_buffer, // real in place child - new_node_bufs, // new node bufs - freed_node_bufs // free_node_bufs - ); + icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer /* real up buffer */, child_buf, + new_node_bufs, freed_node_bufs); } #ifdef _PRERELEASE // log new nodes and freed nodes and parent and child @@ -415,6 +410,22 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& } // Now we link the down_buffer to the real up_buffer + if (down_buf->m_up_buffer) { + // release existing up_buffer's wait count + down_buf->m_up_buffer->m_wait_for_down_buffers.decrement(); +#ifndef NDEBUG + bool found{false}; + for (auto it = down_buf->m_up_buffer->m_down_buffers.begin(); it != down_buf->m_up_buffer->m_down_buffers.end(); + ++it) { + if (it->lock() == down_buf) { + down_buf->m_up_buffer->m_down_buffers.erase(it); + found = true; + break; + } + } + HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif + } real_up_buf->m_wait_for_down_buffers.increment(1); down_buf->m_up_buffer = real_up_buf; #ifndef NDEBUG @@ -428,13 +439,13 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { bool done = m_cache.remove(buf->m_blkid, node); HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?"); } - + buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); - m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); + m_vdev->free_blk(buf->m_blkid, s_cast(cp_ctx)); } //////////////////// Recovery Related section ///////////////////////////////// -void IndexWBCache::load_buf(IndexBufferPtr const& buf) { +void IndexWBCache::load_buf(IndexBufferPtr const &buf) { if (buf->m_bytes == nullptr) { buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); @@ -462,17 +473,17 @@ void IndexWBCache::recover(sisl::byte_view sb) { #ifdef _PRERELEASE auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { + std::vector const &pending_bufs) { std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const& [_, buf] : bufs) { + for (auto const &[_, buf]: bufs) { load_buf(buf); fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } // list of new_bufs - if (!l0_bufs.empty()) { - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { + if (!pending_bufs.empty()) { + fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size()); + for (auto const &buf: pending_bufs) { fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } } @@ -493,57 +504,79 @@ void IndexWBCache::recover(sisl::byte_view sb) { // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate // the same blkid which could clash with the blkid next in the buf list. // - // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed. - std::vector< IndexBufferPtr > l0_bufs; - for (auto const& [_, buf] : bufs) { - if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) { + // On the second pass, we only take part of the parents/siblings and then repair them, if needed. + std::vector pending_bufs; + std::vector deleted_bufs; + for (auto const &[_, buf]: bufs) { + if (buf->m_node_freed) { + // Freed node + load_buf(buf); if (was_node_committed(buf)) { - if (was_node_committed(buf->m_up_buffer)) { - if (buf->m_node_freed) { - // Up buffer was written, so this buffer can be freed and thus can free the blk. - m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); - } else { - m_vdev->commit_blk(buf->m_blkid); - } - l0_bufs.push_back(buf); - } else { - buf->m_up_buffer->m_wait_for_down_buffers.decrement(); + // Mark this buffer as deleted, so that we can avoid using it anymore when repairing its parent's link + r_cast(buf->m_bytes)->node_deleted = true; + write_buf(nullptr, buf, icp_ctx); + deleted_bufs.push_back(buf); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // (Up) buffer is not committed, node need to be kept and (potentially) repaired later + buf->m_node_freed = false; + if (buf->m_created_cp_id == icp_ctx->id()) { + // New nodes need to be commited first + m_vdev->commit_blk(buf->m_blkid); + } + pending_bufs.push_back(buf); + buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency + } + } else if (buf->m_created_cp_id == icp_ctx->id()) { + // New node + if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) { + // Both current and up buffer is commited, we can safely commit the current block + m_vdev->commit_blk(buf->m_blkid); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // Just ignore it + buf->m_up_buffer->m_wait_for_down_buffers.decrement(); #ifndef NDEBUG - bool found{false}; - for (auto it = buf->m_up_buffer->m_down_buffers.begin(); - it != buf->m_up_buffer->m_down_buffers.end(); ++it) { - auto sp = it->lock(); - if (sp && sp == buf) { - found = true; - buf->m_up_buffer->m_down_buffers.erase(it); - break; - } + bool found{false}; + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); + it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + auto sp = it->lock(); + if (sp && sp == buf) { + found = true; + buf->m_up_buffer->m_down_buffers.erase(it); + break; } - HS_DBG_ASSERT(found, - "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); -#endif } + HS_DBG_ASSERT(found, + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif } } } #ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", - l0_bufs.size(), bufs.size(), icp_ctx->id()); - LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); + pending_bufs.size(), bufs.size(), icp_ctx->id()); + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); #endif - // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be - // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in - // do_repair flag. - for (auto const& buf : l0_bufs) { - recover_buf(buf->m_up_buffer); + for (auto const &buf: pending_bufs) { + recover_buf(buf); + if (buf->m_bytes != nullptr && r_cast(buf->m_bytes)->node_deleted) { + // This buffer was marked as deleted during repair, so we also need to free it + deleted_bufs.push_back(buf); + } } + + for (auto const &buf: deleted_bufs) { + m_vdev->free_blk(buf->m_blkid, s_cast(icp_ctx)); + } + m_in_recovery = false; m_vdev->recovery_completed(); } -void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { +void IndexWBCache::recover_buf(IndexBufferPtr const &buf) { if (!buf->m_wait_for_down_buffers.decrement_testz()) { // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer return; @@ -638,6 +671,10 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE static std::once_flag flag; + if (hs()->crash_simulator().is_crashed()) { + std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); + return; + } if (buf->m_crash_flag_on) { std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; LOGINFO("Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); @@ -645,33 +682,34 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const hs()->crash_simulator().crash(); cp_ctx->complete(true); return; - } else if (hs()->crash_simulator().is_crashed()) { - std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); - return; } #endif buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { - LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), + LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); - auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + auto const &sb = r_cast(buf.get())->m_sb; + if (!sb.is_empty()) { + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); - m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) - .thenValue([buf, cp_ctx](auto) { - try { - auto& pthis = s_cast< IndexWBCache& >(wb_cache()); - pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } - }); + LOGTRACEMOD(wbcache, "Flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); + m_vdev->async_write(r_cast(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) + .thenValue([buf, cp_ctx](auto) { + try { + auto &pthis = s_cast(wb_cache()); + pthis.process_write_completion(cp_ctx, buf); + } catch (const std::runtime_error &e) { + LOGERROR("Failed to access write-back cache: {}", e.what()); + } + }); if (!part_of_batch) { m_vdev->submit_batch(); } } @@ -764,7 +802,7 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - if ((*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 97ca410f7..af1b38f0e 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -198,8 +198,8 @@ class HSTestHelper { } homestore::HomeStore::instance()->shutdown(); + iomanager.stop(); // Stop iomanager first in case any fiber is still referencing homestore resources homestore::HomeStore::reset_instance(); - iomanager.stop(); if (cleanup) { remove_files(m_generated_devs); @@ -251,6 +251,11 @@ class HSTestHelper { m_fc.inject_delay_flip(flip_name, {null_cond}, freq, delay_usec); LOGDEBUG("Flip {} set", flip_name); } + + void remove_flip(const std::string flip_name) { + m_fc.remove_flip(flip_name); + LOGDEBUG("Flip {} removed", flip_name); + } #endif static void fill_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 83b1928cd..560bf0f83 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -37,27 +37,29 @@ SISL_LOGGING_DECL(test_index_crash_recovery) SISL_OPTION_GROUP( test_index_crash_recovery, (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + ::cxxopts::value()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + ::cxxopts::value()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value()->default_value("360000"), "seconds"), (num_rounds, "", "num_rounds", "number of rounds to test with", - ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + ::cxxopts::value()->default_value("100"), "number"), (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", - ::cxxopts::value< uint32_t >()->default_value("40"), "number"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), - ""), + ::cxxopts::value()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", + ::cxxopts::value()->default_value("20"), ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", + ::cxxopts::value()->default_value("6"), ""), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + ::cxxopts::value()->default_value("1000"), "number"), (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), + ::cxxopts::value< bool >()->default_value("1"), ""), (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -99,10 +101,16 @@ class SequenceGenerator { keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); } + void fillRange(uint64_t start, uint64_t end) { + for (uint64_t i = start; i <= end; ++i) { + keyStates[i] = true; + } + } + OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - for (size_t i = 0; i < numOperations; ++i) { + while (operations.size() < numOperations) { uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; @@ -120,6 +128,7 @@ class SequenceGenerator { return operations; } + __attribute__((noinline)) std::string showKeyState(uint64_t key) const { auto it = keyStates.find(key); if (it != keyStates.end()) { return it->second ? "Put" : "Remove"; } @@ -134,6 +143,7 @@ class SequenceGenerator { } return occurrences; } + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; auto count = 1; @@ -143,6 +153,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; @@ -156,6 +167,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); @@ -237,6 +249,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ -262,9 +275,11 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); - LOGINFO("Node size {} ", hs()->index_service().node_size()); this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as(); + LOGINFO("Node size {}, max_keys_in_node {}, min_keys_in_node {}", this->m_cfg.node_size(), + this->m_cfg.m_max_keys_in_node, this->m_cfg.m_min_keys_in_node); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -301,7 +316,10 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void reset_btree() { + hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); + this->trigger_cp(true); + auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); @@ -334,14 +352,21 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); - for (const auto& [k, addition] : diff) { + for (const auto &[k, addition]: diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); - if (addition) { this->force_upsert(k.key()); } + if (addition) { + LOGDEBUG("Reapply: Inserting key {}", k.key()); + this->force_upsert(k.key()); + } else { + LOGDEBUG("Reapply: Removing key {}", k.key()); + this->remove_one(k.key(), false); + } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); this->m_shadow_map.save(m_shadow_filename); } + void reapply_after_crash(OperationList& operations) { for (const auto& [key, opType] : operations) { switch (opType) { @@ -355,7 +380,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT break; } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); } void TearDown() override { @@ -377,13 +402,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void crash_and_recover(uint32_t s_key, uint32_t e_key) { this->print_keys("Btree prior to CP and susbsequent simulated crash: "); - test_common::HSTestHelper::trigger_cp(false); + trigger_cp(false); this->wait_for_crash_recovery(); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); + this->print_keys("Post reapply, btree structure: "); + this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; @@ -420,7 +447,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->visualize_keys(b_filename); } - test_common::HSTestHelper::trigger_cp(false); + trigger_cp(false); LOGINFO("waiting for crash to recover"); this->wait_for_crash_recovery(); @@ -428,8 +455,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::string rec_filename = filename + "_after_recovery.dot"; LOGINFO("Visualize the tree file after recovery : {}", rec_filename); this->visualize_keys(rec_filename); - this->print_keys("Post crash and recovery, btree structure: "); } + this->print_keys("Post crash and recovery, btree structure: "); sanity_check(operations); // Added to the index service right after recovery. Not needed here // test_common::HSTestHelper::trigger_cp(true); @@ -440,8 +467,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT std::string re_filename = filename + "_after_reapply.dot"; LOGINFO("Visualize the tree after reapply {}", re_filename); this->visualize_keys(re_filename); -// this->print_keys("Post crash and recovery, btree structure: "); } + this->print_keys("Post reapply, btree structure: "); this->get_all(); LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), @@ -529,82 +556,6 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { this->query_all_paginate(80); } -/* -TYPED_TEST(IndexCrashTest, ManualMergeCrash){ - // Define the lambda function - const uint32_t num_entries = 30; - - auto initTree = [this, num_entries]() { - for (uint64_t k = 0u; k < num_entries; ++k) { - this->force_upsert(k); - } - test_common::HSTestHelper::trigger_cp(true); - this->m_shadow_map.save(this->m_shadow_filename); - }; - - std::vector< OperationList > removing_scenarios = { - {{29, OperationType::Remove}, - {28, OperationType::Remove}, - {27, OperationType::Remove}, - {26, OperationType::Remove}, - {25, OperationType::Remove}, - {24, OperationType::Remove}} - }; - - auto scenario = removing_scenarios[0]; - - LOGINFO("Step 1-1: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init.dot"); - LOGINFO("Step 2-1: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_parent"); - - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash.dot"); - - LOGINFO("Step 3-1: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_1.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-2: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_02.dot"); - LOGINFO("Step 2-2: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_left_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash_2.dot"); - - LOGINFO("Step 3-2: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_2.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-3: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_03.dot"); - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_freed_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->visualize_keys("tree_before_crash_3.dot"); - - LOGINFO("Step 3-3: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_3.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); -} -*/ - TYPED_TEST(IndexCrashTest, SplitCrash1) { // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -769,6 +720,236 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { if (renew_btree_after_crash) { this->reset_btree(); }; } } + +// Basic reverse and forward order remove with different flip points +TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { + vector flip_points = { + "crash_flush_on_merge_at_parent", + "crash_flush_on_merge_at_left_child", + // "crash_flush_on_freed_child", + }; + + for (size_t i = 0; i < flip_points.size(); ++i) { + this->reset_btree(); + + auto &flip_point = flip_points[i]; + LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); + + // Populate some keys [1,num_entries) and trigger cp to persist + LOGINFO("Step {}-1: Populate some keys and flush", i+1); + auto const num_entries = SISL_OPTIONS["num_entries"].as(); + for (auto k = 0u; k < num_entries; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(this->m_shadow_filename); + + this->visualize_keys("tree_merge_full.dot"); + + // Split keys into batches and remove the last one in reverse order + LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + int batch_num = 4; { + int n = batch_num; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = r; k >= l; --k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_first_crash.dot"); + + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in forward order + LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { + int n = batch_num - 1; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = l; k <= r; ++k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_second_crash.dot"); + + LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in random order + LOGINFO("Step {}-4: Remove another batch in random order", i + 1) { + int n = batch_num - 2; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + SequenceGenerator generator(0, 100, l, r); + generator.fillRange(l, r); + OperationList ops = generator.generateOperations(r - l + 1, false); + + LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_third_crash.dot"); + + LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + LOGINFO("Step {}-5: Cleanup the tree", i + 1); + for (auto k = 0u; k < num_entries; ++k) { + this->remove_one(k, false); + } + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } +} + +// +// TYPED_TEST(IndexCrashTest, MergeCrash1) { +// auto const num_entries = SISL_OPTIONS["num_entries"].as(); +// vector flips = { +// "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", +// }; +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); +// OperationList operations; +// for (size_t i = 0; i < flips.size(); ++i) { +// this->reset_btree(); +// LOGINFO("Step {}-1: Init btree", i + 1); +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->print_keys("Inited tree"); +// +// LOGINFO("Step {}-2: Set flag {}", i + 1, flips[i]); +// this->set_basic_flip(flips[i], 1, 10); +// generator.reset(); +// generator.fillRange(0, num_entries - 1); +// +// // Randomly remove some keys +// std::random_device rd; +// std::mt19937 gen(rd()); +// std::uniform_int_distribution<> dis(num_entries / 4, num_entries / 2); +// auto num_keys_to_remove = dis(gen); +// LOGINFO("Removing {} keys before crash", num_keys_to_remove); +// operations = generator.generateOperations(num_keys_to_remove, false /* reset */); +// for (auto [k, _]: operations) { +// LOGINFO("Removing key {}", k); +// this->remove_one(k, true); +// } +// +// LOGINFO("Step {}-3: Simulate crash and recover", i + 1); +// this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); +// } +// } +// +// TYPED_TEST(IndexCrashTest, MergeManualCrash) { +// std::vector flip_points = { +// "crash_flush_on_merge_at_parent", +// "crash_flush_on_merge_at_left_child", +// }; +// +// constexpr uint32_t num_entries = 28; // with max=5 & min=3 +// +// auto initTree = [this, num_entries]() { +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->m_shadow_map.save(this->m_shadow_filename); +// }; +// +// std::vector removing_scenarios = { +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// }, // Merge 2 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {21, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 2 actions +// { +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {11, OperationType::Remove}, +// {10, OperationType::Remove}, +// {13, OperationType::Remove}, +// }, // Merge from level=0 then level=1 +// // { +// // {16, OperationType::Remove}, +// // }, // Merge from level=1 then level=0 - need to set min=4 +// }; +// +// for (int i = 0; i < static_cast(removing_scenarios.size()); i++) { +// auto scenario = removing_scenarios[i]; +// auto s_idx = i + 1; +// LOGINFO("\n\tTesting scenario {}", s_idx); +// for (int j = 0; j < static_cast(flip_points.size()); j++) { +// const auto &flip_point = flip_points[j]; +// auto f_idx = j + 1; +// LOGINFO("\n\t\t\t\tTesting flip point: {}", flip_point); +// +// LOGINFO("Step {}-{}-1: Populate keys and flush", s_idx, f_idx); +// initTree(); +// this->visualize_keys(fmt::format("tree_init.{}_{}.dot", s_idx, f_idx)); +// +// LOGINFO("Step {}-{}-2: Set crash flag, remove keys in reverse order", s_idx, f_idx); +// this->set_basic_flip(flip_point); +// for (auto k: scenario) { +// LOGINFO("Removing entry {}", k.first); +// this->remove_one(k.first); +// } +// this->visualize_keys(fmt::format("tree_before_first_crash.{}_{}.dot", s_idx, f_idx)); +// this->remove_flip(flip_point); +// +// LOGINFO("Step {}-{}-3: Trigger cp to crash", s_idx, f_idx); +// this->crash_and_recover(scenario); +// test_common::HSTestHelper::trigger_cp(true); +// this->get_all(); +// +// this->reset_btree(); +// test_common::HSTestHelper::trigger_cp(true); +// } +// } +// } #endif int main(int argc, char* argv[]) { diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 02c3e4c2c..dd2f8f010 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -52,6 +52,7 @@ def parse_arguments(): parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--min_keys_in_node', help='Minimum num of keys in btree nodes', type=int, default=2) parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60)