diff --git a/conanfile.py b/conanfile.py index f34a966c7..51d8e4923 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.4.63" + version = "6.4.64" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/docs/imgs/Child_Node_Merge_1.png b/docs/imgs/Child_Node_Merge_1.png index d9329bd08..cbc5c5d09 100644 Binary files a/docs/imgs/Child_Node_Merge_1.png and b/docs/imgs/Child_Node_Merge_1.png differ diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index 2ef1e1d44..7b90519b6 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -154,7 +154,7 @@ class Btree { virtual btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes, const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, void* context) = 0; - virtual btree_status_t on_root_changed(BtreeNodePtr const& root, void* context) = 0; + virtual btree_status_t on_root_changed(BtreeNodePtr const &root, BtreeNodePtr const &freed_root, void *context) = 0; virtual std::string btree_store_type() const = 0; /////////////////////////// Methods the application use case is expected to handle /////////////////////////// diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 67b33b089..8989a2d5d 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -245,7 +245,9 @@ struct BtreeConfig { uint8_t m_split_pct{50}; uint32_t m_max_merge_nodes{3}; #ifdef _PRERELEASE - uint64_t m_max_keys_in_node{0}; + // These are for testing purpose only + uint64_t m_max_keys_in_node{0}; + uint64_t m_min_keys_in_node{0}; #endif bool m_rebalance_turned_on{false}; bool m_merge_turned_on{true}; diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 209b35558..c08422068 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -227,7 +227,7 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) { root = std::move(new_root); // We need to notify about the root change, before splitting the node, so that correct dependencies are set - ret = on_root_changed(root, req.m_op_context); + ret = on_root_changed(root, nullptr, req.m_op_context); if (ret != btree_status_t::success) { free_node(root, locktype_t::WRITE, req.m_op_context); unlock_node(child_node, locktype_t::WRITE); @@ -236,9 +236,9 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) { ret = split_node(root, child_node, root->total_entries(), &split_key, req.m_op_context); if (ret != btree_status_t::success) { + on_root_changed(child_node, root, req.m_op_context); // Revert it back free_node(root, locktype_t::WRITE, req.m_op_context); root = std::move(child_node); - on_root_changed(root, req.m_op_context); // Revert it back unlock_node(root, locktype_t::WRITE); } else { if (req.route_tracing) { append_route_trace(req, child_node, btree_event_t::SPLIT); } diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index a3285ef35..b516988d7 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -37,6 +37,7 @@ struct transient_hdr_t { /* these 
variables are accessed without taking lock and are not expected to change after init */ uint8_t leaf_node{0}; uint64_t max_keys_in_node{0}; + uint64_t min_keys_in_node{0}; // to specify the threshold for triggering merge bool is_leaf() const { return (leaf_node != 0); } }; @@ -116,6 +117,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { m_trans_hdr.leaf_node = is_leaf; #ifdef _PRERELEASE m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node; + m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node; #endif } @@ -299,6 +301,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { template < typename K > K get_first_key() const { + if (total_entries() == 0) { return K{}; } return get_nth_key< K >(0, true); } @@ -333,6 +336,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { // uint32_t total_entries() const { return (has_valid_edge() ? total_entries() + 1 : total_entries()); } uint64_t max_keys_in_node() const { return m_trans_hdr.max_keys_in_node; } + uint64_t min_keys_in_node() const { return m_trans_hdr.min_keys_in_node; } void lock(locktype_t l) const { if (l == locktype_t::READ) { @@ -392,6 +396,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } fmt::format_to(std::back_inserter(str), "]"); } + + // Should not happen + if (this->is_node_deleted()) { + fmt::format_to(std::back_inserter(str), " **DELETED** "); + } + return str; } @@ -527,6 +537,9 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); } bool is_merge_needed(const BtreeConfig& cfg) const { + if (min_keys_in_node()) { + return total_entries() < min_keys_in_node(); + } #if 0 #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("btree_merge_node") && occupied_size() < node_data_size) { diff --git a/src/include/homestore/btree/detail/btree_node_mgr.ipp b/src/include/homestore/btree/detail/btree_node_mgr.ipp index a5b0317de..3b2383dd2 100644 --- a/src/include/homestore/btree/detail/btree_node_mgr.ipp +++ b/src/include/homestore/btree/detail/btree_node_mgr.ipp @@ -42,7 +42,7 @@ btree_status_t Btree< K, V >::create_root_node(void* op_context) { } m_root_node_info = BtreeLinkInfo{root->node_id(), root->link_version()}; - ret = on_root_changed(root, op_context); + ret = on_root_changed(root, nullptr, op_context); if (ret != btree_status_t::success) { free_node(root, locktype_t::NONE, op_context); m_root_node_info = BtreeLinkInfo{}; diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 82213dcc6..6b0d78a5f 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -199,7 +199,7 @@ btree_status_t Btree< K, V >::check_collapse_root(ReqT& req) { goto done; } - ret = on_root_changed(child, req.m_op_context); + ret = on_root_changed(child, root, req.m_op_context); if (ret != btree_status_t::success) { unlock_node(child, locktype_t::WRITE); unlock_node(root, locktype_t::WRITE); @@ -476,7 +476,6 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const ++idx; } #endif - ret = transact_nodes(new_nodes, old_nodes, leftmost_node, parent_node, context); } diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index 1f4c30e32..fecc04e2e 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ 
b/src/include/homestore/btree/detail/simple_node.hpp @@ -229,8 +229,9 @@ class SimpleNode : public VariantNode< K, V > { } return str; } + std::string to_dot_keys() const override { - return to_dot_keys_impl(std::is_same< K, uint64_t >{}); + return to_dot_keys_impl(std::is_same< decltype(std::declval< K >().key()), uint64_t >{}); } std::string to_dot_keys_impl(std::false_type) const { diff --git a/src/include/homestore/btree/mem_btree.hpp b/src/include/homestore/btree/mem_btree.hpp index ce606fc5a..4b3ea6f56 100644 --- a/src/include/homestore/btree/mem_btree.hpp +++ b/src/include/homestore/btree/mem_btree.hpp @@ -81,6 +81,8 @@ class MemBtree : public Btree< K, V > { return btree_status_t::success; } - btree_status_t on_root_changed(BtreeNodePtr const&, void*) override { return btree_status_t::success; } + btree_status_t on_root_changed(BtreeNodePtr const &, BtreeNodePtr const &, void *) override { + return btree_status_t::success; + } }; } // namespace homestore diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 2bec275e3..7a56d952b 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -78,7 +78,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } void destroy() override { - Btree< K, V >::destroy_btree(nullptr); + auto cpg = cp_mgr().cp_guard(); + Btree< K, V >::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC)); m_sb.destroy(); } @@ -130,13 +131,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { idx_buf->m_dirtied_cp_id = cpg->id(); BtreeNodePtr bn = BtreeNodePtr{n}; - LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); - repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + // Only interior nodes need their links repaired + if (!bn->is_leaf()) { + LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); + repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); + } if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the // meta_buf with new root as well - on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + on_root_changed(bn, nullptr, (void *) cpg.context(cp_consumer_t::INDEX_SVC)); } } @@ -223,7 +227,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context)); } - btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { + btree_status_t + on_root_changed(BtreeNodePtr const &new_root, BtreeNodePtr const &freed_root, void *context) override { m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); @@ -232,12 +237,18 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf; - wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context)); + IndexBufferPtrList freed_bufs; + if (freed_root) { + freed_bufs.push_back(static_cast< IndexBtreeNode* >(freed_root.get())->m_idx_buf); + } + // The meta buffer is similar to a leftmost child here - it should always be the up buffer for both roots + wb_cache().transact_bufs(ordinal(), nullptr, m_sb_buffer, {root_buf}, freed_bufs, + r_cast< CPContext* >(context)); return btree_status_t::success; } btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node {}",
parent_node->to_string()); + BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string()); // Get the last key in the node auto const last_parent_key = parent_node->get_last_key< K >(); @@ -247,7 +258,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { parent_node->node_id()); return btree_status_t::not_found; } - BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(), + + // Collect all the original child ids, to help detect when we have gone beyond the last child node + std::set< bnodeid_t > orig_child_ids; + for (uint32_t i = 0; i < parent_node->total_entries(); ++i) { + BtreeLinkInfo link_info; + parent_node->get_nth_value(i, &link_info, true); + orig_child_ids.insert(link_info.bnode_id()); + } + BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(), last_parent_key.to_string()); // Get the first child node and its link info @@ -272,24 +291,48 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cur_parent = parent_node; BtreeNodeList new_parent_nodes; do { - if (child_node->has_valid_edge() || - (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) { - BT_DBG_ASSERT(is_parent_edge_node, - "Child node={} is an edge node but parent_node={} is not an edge node", - child_node->node_id(), cur_parent->node_id()); - cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) { + if (child_node->is_node_deleted()) { + // The edge node was merged away, so set the current last entry as the edge + if (cur_parent->total_entries() > 0) { + auto prev_val = V{}; + cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true); + cur_parent->remove(cur_parent->total_entries() - 1); + cur_parent->set_edge_value(prev_val); + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is deleted, set previous as edge_value={}", + cur_parent->node_id(), child_node->to_string(), prev_val.to_string()); + } else { + BT_LOG(INFO, "Found an empty interior node {} with possibly all children deleted", + cur_parent->node_id()); + } + } else { + // Update edge and finish + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(), + child_node->to_string()); + child_node->set_next_bnode(empty_bnodeid); + write_node_impl(child_node, cp_ctx); + cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + } break; } auto const child_last_key = child_node->get_last_key< K >(); - BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), + BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(), child_node->to_string(), child_last_key.to_string()); - if (child_last_key.compare(last_parent_key) > 0) { - // We have reached the last key, we can stop now - break; + // Check whether we are beyond the last child node. + // + // There can be cases where the child-level merge is successfully persisted but the parent level is not. + // In that case, the rightmost child node may have a last key greater than last_parent_key. + // That is why we first have to check whether the child node is one of the original child nodes.
+ if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) { + if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) { + // We have reached a child beyond this parent, we can stop now + break; + } } + if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size())) { // No room in the parent_node, let us split the parent_node and continue @@ -309,20 +352,34 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } // Insert the last key of the child node into parent node - cur_parent->insert(cur_parent->total_entries(), child_last_key, - BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (!child_node->is_node_deleted()) { + cur_parent->insert(cur_parent->total_entries(), + child_node->total_entries() > 0 ? child_last_key : last_parent_key, + BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + if (child_node->total_entries() == 0) { + // There should be at most one empty child node per parent - if we find one, we should stop here + BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(), + child_node->to_string()); + break; + } + } else { + // Node deleted indicates it's freed & no longer used during recovery + BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert", + cur_parent->node_id(), child_node->to_string()); + } - BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string()); + BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string()); // Move to the next child node - this->unlock_node(child_node, locktype_t::READ); auto const next_node_id = child_node->next_bnode(); + this->unlock_node(child_node, locktype_t::READ); if (next_node_id == empty_bnodeid) { BT_LOG_ASSERT(false, "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " "repair is partial", child_node->node_id(), parent_node->node_id()); ret = btree_status_t::not_found; + child_node = nullptr; break; } @@ -330,10 +387,21 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (ret != btree_status_t::success) { BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}", parent_node->node_id(), enum_name(ret)); + child_node = nullptr; break; } } while (true); - this->unlock_node(child_node, locktype_t::READ); + + if (child_node) { + this->unlock_node(child_node, locktype_t::READ); + } + + if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) { + // We shouldn't have an empty interior node in the tree, let's delete it. 
+ // The buf will be released by the caller + BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id()); + parent_node->set_node_deleted(); + } if (ret == btree_status_t::success) { ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp index 98c22fe17..cfe8ec327 100644 --- a/src/lib/common/crash_simulator.hpp +++ b/src/lib/common/crash_simulator.hpp @@ -13,9 +13,8 @@ class CrashSimulator { ~CrashSimulator() = default; void crash() { + m_crashed.update([](auto *s) { *s = true; }); if (m_restart_cb) { - m_crashed.update([](auto* s) { *s = true; }); - // We can restart on a new thread to allow other operations to continue std::thread t([cb = std::move(m_restart_cb)]() { // Restart could destroy this pointer, so we are storing in local variable and then calling. diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 3665f13b9..ac49f95dd 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -424,6 +424,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + dev_offset); if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -436,6 +438,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; } #endif + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(), + chunk->start_offset() + offset_in_chunk); + if (sisl_unlikely(!is_chunk_available(chunk))) { return std::make_error_code(std::errc::resource_unavailable_try_again); } @@ -457,6 +462,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); @@ -479,6 +486,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); + HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset); + COUNTER_INCREMENT(m_metrics, vdev_write_count, 1); if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index af2d521c5..b0bd2ff54 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -321,8 +321,6 @@ void HomeStore::shutdown() { #ifdef _PRERELEASE flip::Flip::instance().stop_rpc_server(); #endif - - HomeStore::reset_instance(); LOGINFO("Homestore is completed its shutdown"); } diff --git a/src/lib/index/README.md b/src/lib/index/README.md index db24c9fbb..cb1805e79 100644 --- a/src/lib/index/README.md +++ b/src/lib/index/README.md @@ -91,7 +91,14 @@ Merge node happens somewhat similar to split node with respect to the atomicity Homestore Index service creates 2 more new nodes to replace 
C30, C40 lets say C30' and C40' respectively and now C20 is linked to C30' and C40' and C30, C40 nodes are deleted. -With this in perspective, dependency graph will link similar to split nodes with new nodes linked to left child, which is linked to parent node. +Note that we have to choose one of the two node sets, `(C20, C30, C40)` or `(C20, C30', C40')`, as the current +state. **Whether we use the new nodes or the old ones depends entirely on the persistence of the leftmost child C20.** + +With this in mind, the dependency graph is linked similarly to split nodes, with the new/freed nodes linked to the left +child, which in turn is linked to the parent node. + +(NOTE that logically the node release should depend on the leftmost child - but in practice we keep it simpler, +since it is very unlikely that freed nodes are overwritten in the same cp) ![Child_Node_Merge](../../../docs/imgs/Child_Node_Merge_1.png) @@ -100,4 +107,4 @@ The journal entries will need to record the existing node deletion information a ``` { <parent_node>, <left_child_node>, <new_nodes>, <freed_nodes> } {P10, C20, [C30', C40'], [C30, C40]} -``` \ No newline at end of file +``` diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 955bd523f..a2e6ed72d 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -249,7 +249,6 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, } if (up_buf) { - DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer"); auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf; #ifndef NDEBUG @@ -267,6 +266,20 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, #endif if (buf->m_up_buffer != real_up_buf) { + if (buf->m_up_buffer) { + buf->m_up_buffer->m_wait_for_down_buffers.decrement(1); +#ifndef NDEBUG + bool found{false}; + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + if (it->lock() == buf) { + buf->m_up_buffer->m_down_buffers.erase(it); + found = true; + break; + } + } + HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif + } real_up_buf->m_wait_for_down_buffers.increment(1); buf->m_up_buffer = real_up_buf; } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 1b7523363..560a9a120 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -213,12 +213,12 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) { // TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a // bunch of times.
- if (parent_buf && parent_buf->is_meta_buf()) { + if (parent_buf == nullptr && child_buf->is_meta_buf()) { // Split or merge happening on root if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) { - parent_buf->set_crash_flag(); - } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) { child_buf->set_crash_flag(); + } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) { + new_node_bufs[0]->set_crash_flag(); } } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) { // Its a split node situation @@ -237,6 +237,8 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con child_buf->set_crash_flag(); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_right_child")) { if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + } else if (iomgr_flip::instance()->test_flip("crash_flush_on_freed_child")) { + freed_node_bufs[0]->set_crash_flag(); } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) { // Its a rebalance node situation @@ -246,6 +248,8 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con child_buf->set_crash_flag(); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_right_child")) { if (!new_node_bufs.empty()) { new_node_bufs[0]->set_crash_flag(); } + } else if (iomgr_flip::instance()->test_flip("crash_flush_on_freed_child")) { + freed_node_bufs[0]->set_crash_flag(); } } } @@ -265,15 +269,8 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p link_buf(child_buf, buf, true /* is_sibling_link */, cp_ctx); } - for (auto const& buf : freed_node_bufs) { - if (!buf->m_wait_for_down_buffers.testz()) { - // This buffer has some down bufs depending on it. It can happen for an upper level interior node, where - // lower level node (say leaf) has split causing it to write entries in this node, but this node is now - // merging with other node, causing it to free. In these rare instances, we link this node to the new - // node resulting in waiting for all the down bufs to be flushed before up buf can flush (this buf is - // not written anyways) - link_buf(child_buf, buf, true /* is_sibling_link */, cp_ctx); - } + for (auto const &buf: freed_node_bufs) { + link_buf(child_buf, buf, true /* is_sibling_link */, cp_ctx); } if (new_node_bufs.empty() && freed_node_bufs.empty()) { @@ -286,10 +283,11 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p } else { icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal child_buf->m_up_buffer, // real up buffer - new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer - : new_node_bufs[0]->m_up_buffer, // real in place child - new_node_bufs, // new node bufs - freed_node_bufs // free_node_bufs + new_node_bufs.empty() + ? 
freed_node_bufs[0]->m_up_buffer + : new_node_bufs[0]->m_up_buffer, // real in place child + new_node_bufs, // new node bufs + freed_node_bufs // free_node_bufs ); } #if 0 @@ -371,6 +369,21 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& } // Now we link the down_buffer to the real up_buffer + if (down_buf->m_up_buffer) { + // release existing up_buffer's wait count + down_buf->m_up_buffer->m_wait_for_down_buffers.decrement(); +#ifndef NDEBUG + bool found{false}; + for (auto it = down_buf->m_up_buffer->m_down_buffers.begin(); it != down_buf->m_up_buffer->m_down_buffers.end(); ++it) { + if (it->lock() == down_buf) { + down_buf->m_up_buffer->m_down_buffers.erase(it); + found = true; + break; + } + } + HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif + } real_up_buf->m_wait_for_down_buffers.increment(1); down_buf->m_up_buffer = real_up_buf; #ifndef NDEBUG @@ -384,9 +397,13 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { bool done = m_cache.remove(buf->m_blkid, node); HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?"); } - + buf->m_node_freed = true; resource_mgr().inc_free_blk(m_node_size); - m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); + if (buf->is_clean()) { + buf->set_state(index_buf_state_t::DIRTY); + r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); + resource_mgr().inc_dirty_buf_size(m_node_size); + } } //////////////////// Recovery Related section ///////////////////////////////// @@ -418,31 +435,52 @@ void IndexWBCache::recover(sisl::byte_view sb) { // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate // the same blkid which could clash with the blkid next in the buf list. // - // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed. - std::vector< IndexBufferPtr > l0_bufs; + // On the second pass, we only take the affected parents/siblings and then repair them, if needed. + std::vector< IndexBufferPtr > pending_bufs; + std::vector< IndexBufferPtr > deleted_bufs; for (auto const& [_, buf] : bufs) { - if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) { + if (buf->m_node_freed) { + // Freed node + if (buf->m_bytes == nullptr) { + buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); + m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); + buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); + } + if (was_node_committed(buf)) { - if (was_node_committed(buf->m_up_buffer)) { - if (buf->m_node_freed) { - // Up buffer was written, so this buffer can be freed and thus can free the blk.
- m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); - } else { - m_vdev->commit_blk(buf->m_blkid); - } - l0_bufs.push_back(buf); - } else { - buf->m_up_buffer->m_wait_for_down_buffers.decrement(); + // Mark this buffer as deleted, so that we avoid using it when repairing its parent's links + r_cast(buf->m_bytes)->node_deleted = true; + write_buf(nullptr, buf, icp_ctx); + deleted_bufs.push_back(buf); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // (Up) buffer is not committed, the node needs to be kept and (potentially) repaired later + buf->m_node_freed = false; + if (buf->m_created_cp_id == icp_ctx->id()) { + // New nodes need to be committed first + m_vdev->commit_blk(buf->m_blkid); } + pending_bufs.push_back(buf); + buf->m_wait_for_down_buffers.increment(1); // Purely for recover_buf() counter consistency + } + } else if (buf->m_created_cp_id == icp_ctx->id()) { + // New node + if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) { + // Both the current and the up buffer are committed, so we can safely commit the current block + m_vdev->commit_blk(buf->m_blkid); + pending_bufs.push_back(buf->m_up_buffer); + } else { + // Just ignore it + buf->m_up_buffer->m_wait_for_down_buffers.decrement(); } } } LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", - l0_bufs.size(), bufs.size(), icp_ctx->id()); + pending_bufs.size(), bufs.size(), icp_ctx->id()); - auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const &bufs, + std::vector< IndexBufferPtr > const &pending_bufs) { // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); for (auto const& [_, buf] : bufs) { @@ -450,20 +488,26 @@ void IndexWBCache::recover(sisl::byte_view sb) { } // list of new_bufs - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { + fmt::format_to(std::back_inserter(log), "\n\tpending_bufs (#of bufs = {})\n", pending_bufs.size()); + for (auto const &buf: pending_bufs) { fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); } return log; }; - LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, pending_bufs)); + + for (auto const &buf: pending_bufs) { + recover_buf(buf); + if (buf->m_bytes != nullptr && r_cast(buf->m_bytes)->node_deleted) { + // This buffer was marked as deleted during repair, so we also need to free it + deleted_bufs.push_back(buf); + } + } - // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be - // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in - // do_repair flag.
- for (auto const& buf : l0_bufs) { - recover_buf(buf->m_up_buffer); + for (auto const &buf: deleted_bufs) { + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); } + m_in_recovery = false; m_vdev->recovery_completed(); } @@ -556,17 +600,18 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE + if (hs()->crash_simulator().is_crashed()) { + LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); + return; + } if (buf->m_crash_flag_on) { -// std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; -// LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); -// cp_ctx->to_string_dot(filename); + // std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; + // LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); + // cp_ctx->to_string_dot(filename); LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); hs()->crash_simulator().crash(); cp_ctx->complete(true); return; - } else if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); - return; } #endif @@ -574,17 +619,19 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { - LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), + LOGTRACEMOD(wbcache, "Flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; - meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + if (!sb.is_empty()) { + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + } process_write_completion(cp_ctx, buf); } else if (buf->m_node_freed) { - LOGTRACEMOD(wbcache, "Not flushing buf {} as it was freed, its here for merely dependency", cp_ctx->id(), - buf->to_string()); + LOGTRACEMOD(wbcache, "Not flushing cp {} buf {} as it was freed", cp_ctx->id(), buf->to_string()); + m_vdev->free_blk(buf->m_blkid, cp_ctx); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), + LOGTRACEMOD(wbcache, "Flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), BtreeNode::to_string_buf(buf->raw_buffer())); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { @@ -685,7 +732,7 @@ void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_c std::optional< IndexBufferPtr > buf = cp_ctx->next_dirty(); if (!buf) { break; } // End of list - if ((*buf)->m_wait_for_down_buffers.testz()) { + if ((*buf)->state() == index_buf_state_t::DIRTY && (*buf)->m_wait_for_down_buffers.testz()) { bufs.emplace_back(std::move(*buf)); ++count; } else { diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 174039495..b01aff3f2 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -198,8 +198,8 @@ class HSTestHelper { } homestore::HomeStore::instance()->shutdown(); + iomanager.stop(); // Stop iomanager first
in case any fiber is still referencing homestore resources homestore::HomeStore::reset_instance(); - iomanager.stop(); if (cleanup) { remove_files(m_generated_devs); @@ -251,6 +251,11 @@ class HSTestHelper { m_fc.inject_delay_flip(flip_name, {null_cond}, freq, delay_usec); LOGDEBUG("Flip {} set", flip_name); } + + void remove_flip(const std::string flip_name) { + m_fc.remove_flip(flip_name); + LOGDEBUG("Flip {} removed", flip_name); + } #endif static void fill_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 11235be6a..fbf33c51e 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -36,23 +36,25 @@ SISL_LOGGING_DECL(test_index_crash_recovery) SISL_OPTION_GROUP(test_index_crash_recovery, (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), - "seconds"), + "seconds"), (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), + ::cxxopts::value< uint32_t >()->default_value("20"), ""), + (min_keys_in_node, "", "min_keys_in_node", "min_keys_in_node", + ::cxxopts::value< uint32_t >()->default_value("6"), ""), (operation_list, "", "operation_list", - "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), + ::cxxopts::value< bool >()->default_value("1"), ""), (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -96,10 +98,16 @@ class SequenceGenerator { keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); } + void fillRange(uint64_t start, uint64_t end) { + for (uint64_t i = start; i <= end; ++i) { + keyStates[i] = true; + } + } + OperationList generateOperations(size_t numOperations, bool reset = false) { std::vector< Operation > operations; if (reset) { this->reset(); } - for (size_t i = 0; i < numOperations; ++i) { + while (operations.size() < numOperations) { uint32_t key = keyDist_(gen_); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; @@ -117,6 +125,7 @@ class SequenceGenerator { return operations; } + __attribute__((noinline)) std::string showKeyState(uint64_t key) const { auto it = keyStates.find(key); if (it != 
keyStates.end()) { return it->second ? "Put" : "Remove"; } return "Not found"; } @@ -131,6 +140,7 @@ class SequenceGenerator { } return occurrences; } + __attribute__((noinline)) std::string printOperations(const OperationList& operations) const { std::ostringstream oss; for (const auto& [key, opType] : operations) { @@ -139,6 +149,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) std::string printKeysOccurrences(const OperationList& operations) const { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; @@ -152,6 +163,7 @@ class SequenceGenerator { } return oss.str(); } + __attribute__((noinline)) std::string printKeyOccurrences(const OperationList& operations, uint64_t key ) const { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); @@ -162,6 +174,7 @@ class SequenceGenerator { } return oss.str(); } + void reset() { keyStates.clear(); } private: @@ -204,6 +217,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; m_test->m_cfg.m_int_node_type = T::interior_node_type; m_test->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + m_test->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); return m_test->m_bt; } @@ -232,6 +246,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Node size {} ", hs()->index_service().node_size()); this->m_cfg = BtreeConfig(hs()->index_service().node_size()); this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); + this->m_cfg.m_min_keys_in_node = SISL_OPTIONS["min_keys_in_node"].as< uint32_t >(); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -257,7 +272,10 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void reset_btree() { + hs()->index_service().remove_index_table(this->m_bt); this->m_bt->destroy(); + this->trigger_cp(true); + auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); @@ -274,7 +292,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); + // LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -286,16 +304,23 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT for (const auto& [k, addition] : diff) { dif_str += fmt::format(" {} \t{}\n", k.key(), addition); } - LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); + // LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); - for (const auto& [k, addition] : diff) { + for (const auto &[k, addition]: diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); // this->visualize_keys(recovered_tree_filename); - if (addition) { this->force_upsert(k.key()); } + if (addition) { + LOGDEBUG("Reapply: Inserting key {}", k.key()); + this->force_upsert(k.key()); + } else { + LOGDEBUG("Reapply: Removing key {}", k.key()); +
this->remove_one(k.key(), false); + } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); this->m_shadow_map.save(m_shadow_filename); } + void reapply_after_crash(OperationList& operations) { for (const auto& [key, opType] : operations) { switch (opType) { @@ -309,7 +334,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT break; } } - test_common::HSTestHelper::trigger_cp(true); + trigger_cp(true); } void TearDown() override { @@ -331,14 +356,15 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void crash_and_recover(uint32_t s_key, uint32_t e_key) { - this->print_keys("Btree prior to CP and susbsequent simulated crash: "); - test_common::HSTestHelper::trigger_cp(false); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + trigger_cp(false); this->wait_for_crash_recovery(); // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); - this->print_keys("Post crash and recovery, btree structure: "); + // this->print_keys("Post crash and recovery, btree structure: "); this->reapply_after_crash(); + // Verification this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) @@ -346,10 +372,10 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT } void crash_and_recover(OperationList& operations, std::string filename = "") { - // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); - test_common::HSTestHelper::trigger_cp(false); + // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + trigger_cp(false); this->wait_for_crash_recovery(); - // this->print_keys("Post crash and recovery, btree structure:"); + // this->print_keys("Post crash and recovery, btree structure:"); if (!filename.empty()) { LOGINFO("Visualize the tree file {}", filename); @@ -364,6 +390,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->visualize_keys("after_reapply__" + filename); } + // Verification this->get_all(); } @@ -442,82 +469,6 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { this->query_all_paginate(80); } -/* -TYPED_TEST(IndexCrashTest, ManualMergeCrash){ - // Define the lambda function - const uint32_t num_entries = 30; - - auto initTree = [this, num_entries]() { - for (uint64_t k = 0u; k < num_entries; ++k) { - this->force_upsert(k); - } - test_common::HSTestHelper::trigger_cp(true); - this->m_shadow_map.save(this->m_shadow_filename); - }; - - std::vector< OperationList > removing_scenarios = { - {{29, OperationType::Remove}, - {28, OperationType::Remove}, - {27, OperationType::Remove}, - {26, OperationType::Remove}, - {25, OperationType::Remove}, - {24, OperationType::Remove}} - }; - - auto scenario = removing_scenarios[0]; - - LOGINFO("Step 1-1: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init.dot"); - LOGINFO("Step 2-1: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_parent"); - - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash.dot"); - - LOGINFO("Step 3-1: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_1.dot"); - test_common::HSTestHelper::trigger_cp(true); - 
this->get_all(); - - LOGINFO("Step 1-2: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_02.dot"); - LOGINFO("Step 2-2: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_merge_at_left_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - this->visualize_keys("tree_before_crash_2.dot"); - - LOGINFO("Step 3-2: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_2.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); - - LOGINFO("Step 1-3: Populate some keys and flush"); - initTree(); - this->visualize_keys("tree_init_03.dot"); - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->set_basic_flip("crash_flush_on_freed_child"); - for (auto [k, _] : scenario) { - LOGINFO("\n\n\t\t\t\t\t\t\t\t\t\t\t\t\tRemoving entry {}", k); - this->remove_one(k); - } - LOGINFO("Step 2-3: Set crash flag, remove some keys in reverse order"); - this->visualize_keys("tree_before_crash_3.dot"); - - LOGINFO("Step 3-3: Trigger cp to crash"); - this->crash_and_recover(scenario, "recover_tree_crash_3.dot"); - test_common::HSTestHelper::trigger_cp(true); - this->get_all(); -} -*/ - TYPED_TEST(IndexCrashTest, SplitCrash1) { // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -583,6 +534,237 @@ TYPED_TEST(IndexCrashTest, long_running_put_crash) { } } + +// Basic reverse and forward order remove with different flip points +TYPED_TEST(IndexCrashTest, MergeRemoveBasic) { + vector flip_points = { + "crash_flush_on_merge_at_parent", + "crash_flush_on_merge_at_left_child", + "crash_flush_on_freed_child", + }; + + for (size_t i = 0; i < flip_points.size(); ++i) { + this->reset_btree(); + + auto &flip_point = flip_points[i]; + LOGINFO("=== Testing flip point: {} - {} ===", i + 1, flip_point); + + // Populate some keys [1,num_entries) and trigger cp to persist + LOGINFO("Step {}-1: Populate some keys and flush", i+1); + auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + for (auto k = 0u; k < num_entries; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(this->m_shadow_filename); + + this->visualize_keys("tree_merge_full.dot"); + + // Split keys into batches and remove the last one in reverse order + LOGINFO("Step {}-2: Set crash flag, remove some keys in reverse order", i + 1); + int batch_num = 4; + { + int n = batch_num; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = r; k >= l; --k) { + ops.emplace_back(k, OperationType::Remove); + } + LOGINFO("Step {}-2-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, r, l); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_first_crash.dot"); + + LOGINFO("Step {}-2-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in forward order + LOGINFO("Step {}-3: Remove another batch in ascending order", i + 1) { + int n = batch_num - 1; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + OperationList ops; + for (auto k = l; k <= r; ++k) { + ops.emplace_back(k, OperationType::Remove); + } + 
LOGINFO("Step {}-3-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_second_crash.dot"); + + LOGINFO("Step {}-3-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + // Remove the next batch of keys in random order + LOGINFO("Step {}-4: Remove another batch in random order", i + 1) { + int n = batch_num - 2; + auto r = num_entries * n / batch_num - 1; + auto l = num_entries * (n - 1) / batch_num; + SequenceGenerator generator(0, 100, l, r); + generator.fillRange(l, r); + OperationList ops = generator.generateOperations(r - l + 1, false); + + LOGINFO("Step {}-4-1: Remove keys in batch {}/{} ({} to {})", i + 1, n, batch_num, l, r); + + this->set_basic_flip(flip_point); + for (auto [k, _]: ops) { + LOGINFO("Removing key {}", k); + this->remove_one(k, true); + } + this->visualize_keys("tree_merge_before_third_crash.dot"); + + LOGINFO("Step {}-4-2: Trigger cp to crash", i + 1); + this->crash_and_recover(ops); + } + + LOGINFO("Step {}-5: Cleanup the tree", i + 1); + for (auto k = 0u; k < num_entries; ++k) { + this->remove_one(k, false); + } + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } +} +// +// TYPED_TEST(IndexCrashTest, MergeCrash1) { +// auto const num_entries = SISL_OPTIONS["num_entries"].as(); +// vector flips = { +// "crash_flush_on_merge_at_parent", "crash_flush_on_merge_at_left_child", "crash_flush_on_freed_child" +// }; +// SequenceGenerator generator(0 /*putFreq*/, 100 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); +// OperationList operations; +// for (size_t i = 0; i < flips.size(); ++i) { +// this->reset_btree(); +// LOGINFO("Step {}-1: Init btree", i + 1); +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->print_keys("Inited tree"); +// +// LOGINFO("Step {}-2: Set flag {}", i + 1, flips[i]); +// this->set_basic_flip(flips[i], 1, 10); +// generator.reset(); +// generator.fillRange(0, num_entries - 1); +// +// // Randomly remove some keys +// std::random_device rd; +// std::mt19937 gen(rd()); +// std::uniform_int_distribution<> dis(num_entries / 4, num_entries / 2); +// auto num_keys_to_remove = dis(gen); +// LOGINFO("Removing {} keys before crash", num_keys_to_remove); +// operations = generator.generateOperations(num_keys_to_remove, false /* reset */); +// for (auto [k, _]: operations) { +// LOGINFO("Removing key {}", k); +// this->remove_one(k, true); +// } +// +// LOGINFO("Step {}-3: Simulate crash and recover", i + 1); +// this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); +// } +// } +// +// TYPED_TEST(IndexCrashTest, MergeManualCrash) { +// std::vector flip_points = { +// "crash_flush_on_merge_at_parent", +// "crash_flush_on_merge_at_left_child", +// "crash_flush_on_freed_child", +// }; +// +// constexpr uint32_t num_entries = 28; // with max=5 & min=3 +// +// auto initTree = [this, num_entries]() { +// for (auto k = 0u; k < num_entries; ++k) { +// this->put(k, btree_put_type::INSERT, true /* expect_success */); +// } +// test_common::HSTestHelper::trigger_cp(true); +// this->m_shadow_map.save(this->m_shadow_filename); +// }; +// +// std::vector removing_scenarios = { +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, 
OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// }, // Merge 2 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 1 action +// { +// {27, OperationType::Remove}, +// {26, OperationType::Remove}, +// {25, OperationType::Remove}, +// {24, OperationType::Remove}, +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {21, OperationType::Remove}, +// {20, OperationType::Remove}, +// {19, OperationType::Remove}, +// }, // Merge 3 rightmost leaf nodes in 2 actions +// { +// {23, OperationType::Remove}, +// {22, OperationType::Remove}, +// {11, OperationType::Remove}, +// {10, OperationType::Remove}, +// {13, OperationType::Remove}, +// }, // Merge from level=0 then level=1 +// // { +// // {16, OperationType::Remove}, +// // }, // Merge from level=1 then level=0 - need to set min=4 +// }; +// +// for (int i = 0; i < static_cast(removing_scenarios.size()); i++) { +// auto scenario = removing_scenarios[i]; +// auto s_idx = i + 1; +// LOGINFO("\n\tTesting scenario {}", s_idx); +// for (int j = 0; j < static_cast(flip_points.size()); j++) { +// const auto &flip_point = flip_points[j]; +// auto f_idx = j + 1; +// LOGINFO("\n\t\t\t\tTesting flip point: {}", flip_point); +// +// LOGINFO("Step {}-{}-1: Populate keys and flush", s_idx, f_idx); +// initTree(); +// this->visualize_keys(fmt::format("tree_init.{}_{}.dot", s_idx, f_idx)); +// +// LOGINFO("Step {}-{}-2: Set crash flag, remove keys in reverse order", s_idx, f_idx); +// this->set_basic_flip(flip_point); +// for (auto k: scenario) { +// LOGINFO("Removing entry {}", k.first); +// this->remove_one(k.first); +// } +// this->visualize_keys(fmt::format("tree_before_first_crash.{}_{}.dot", s_idx, f_idx)); +// this->remove_flip(flip_point); +// +// LOGINFO("Step {}-{}-3: Trigger cp to crash", s_idx, f_idx); +// this->crash_and_recover(scenario); +// test_common::HSTestHelper::trigger_cp(true); +// this->get_all(); +// +// this->reset_btree(); +// test_common::HSTestHelper::trigger_cp(true); +// } +// } +// } #endif int main(int argc, char* argv[]) {