From 5ee291c23699026a7a7772e44a87fd7de31d75b2 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:24:57 -0700 Subject: [PATCH] Fix split, merge, varlen remove, (parent size estimation), simple remove (#181) --- conanfile.py | 2 +- src/include/homestore/btree/btree.ipp | 2 +- .../btree/detail/btree_remove_impl.ipp | 57 ++++++++++--------- .../homestore/btree/detail/simple_node.hpp | 3 +- .../homestore/btree/detail/varlen_node.hpp | 13 +++-- src/tests/btree_test_kvs.hpp | 8 +-- src/tests/test_mem_btree.cpp | 2 - 7 files changed, 43 insertions(+), 44 deletions(-) diff --git a/conanfile.py b/conanfile.py index 70abb5980..b35bbb3e8 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.5.6" + version = "4.5.7" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp index a3bd1ee02..5ebe3e235 100644 --- a/src/include/homestore/btree/btree.ipp +++ b/src/include/homestore/btree/btree.ipp @@ -207,7 +207,7 @@ retry: m_btree_lock.unlock_shared(); ret = check_collapse_root(req); - if (ret != btree_status_t::success) { + if (ret != btree_status_t::success && ret != btree_status_t::merge_not_required) { LOGERROR("check collapse read failed btree name {}", m_bt_cfg.name()); goto out; } diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 9b62e2e90..0ecddc7d6 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -288,6 +288,7 @@ bool Btree< K, V >::remove_extents_in_leaf(const BtreeNodePtr& node, BtreeRangeR template < typename K, typename V > template < typename ReqT > btree_status_t Btree< K, V >::check_collapse_root(ReqT& req) { + if (!m_bt_cfg.m_merge_turned_on) { return btree_status_t::merge_not_required; } BtreeNodePtr child; BtreeNodePtr root; btree_status_t ret = btree_status_t::success; @@ -355,7 +356,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const for (auto indx = start_idx + 1; indx <= end_idx; ++indx) { if (indx == parent_node->total_entries()) { BT_NODE_LOG_ASSERT(parent_node->has_valid_edge(), parent_node, - "Assertion failure, expected valid edge for parent_node: {}"); + "Assertion failure, expected valid edge for parent_node"); } BtreeLinkInfo child_info; @@ -463,28 +464,25 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const if (!K::is_fixed_size()) { // Lets see if we have enough room in parent node to accommodate changes. This is needed only if the key is not - // fixed length. For fixed length node merge will always result in lesser or equal size - int64_t post_merge_size{0}; - auto& old_node = old_nodes[leftmost_src.ith_nodes.back()]; - if (old_node->total_entries()) { - post_merge_size += old_node->get_nth_obj_size( - std::min(leftmost_src.last_node_upto, old_node->total_entries() - 1)); // New leftmost entry - } - post_merge_size -= parent_node->get_nth_obj_size(start_idx); // Previous left entry - - for (auto& node : new_nodes) { - if (node->total_entries()) { post_merge_size += node->get_nth_obj_size(node->total_entries() - 1); } - } - - for (auto& node : old_nodes) { - if (node->total_entries()) { post_merge_size -= node->get_nth_obj_size(node->total_entries() - 1); } - } - - if (post_merge_size > parent_node->available_size(m_bt_cfg)) { + // fixed length. For fixed length node merge will always result in less or equal size + + // we first calculate the least amount of space being released after removing excess children. the key size + // cannot be taken account; so we know for sure that value (i.e., linkinfo) and also its record will be freed. + // If the end_idx is the parent's edge, the space is not released eventually. + auto excess_releasing_nodes = + old_nodes.size() - new_nodes.size() - parent_node->total_entries() == end_idx ? 1 : 0; + auto minimum_releasing_excess_size = + excess_releasing_nodes * (BtreeLinkInfo::get_fixed_size() + parent_node->get_record_size()); + + // aside from releasing size due to excess node, K::get_estimate_max_size is needed for each updating element + // at worst case (linkinfo and record remain the same for old and new nodes). The number of updating elements + // are the size of the new nodes (the last key of the last new node is not getting updated; hence excluded) plus + // the leftmost node. + if (parent_node->available_size(m_bt_cfg) + minimum_releasing_excess_size < + (1 + new_nodes.size() ? new_nodes.size() - 1 : 0) * K::get_estimate_max_size()) { BT_NODE_LOG(DEBUG, parent_node, - "Merge is needed, however after merge it will add {} bytes which is more than " - "available_size={}, so not proceeding with merge", - post_merge_size, parent_node->available_size(m_bt_cfg)); + "Merge is needed, however after merge, the parent MAY not have enough space to accommodate the " + "new keys, so not proceeding with merge"); ret = btree_status_t::merge_not_required; goto out; } @@ -613,15 +611,18 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const out: // Do free/unlock based on success/failure in reverse order if (ret == btree_status_t::success) { - for (auto& node : old_nodes) { - free_node(node, locktype_t::WRITE, context); + for (auto it = old_nodes.rbegin(); it != old_nodes.rend(); ++it) { + BT_NODE_LOG(DEBUG, (*it).get(), "Freeing this node as part of successful merge"); + free_node(*it, locktype_t::WRITE, context); } } else { - for (auto& node : old_nodes) { - unlock_node(node, locktype_t::WRITE); + for (auto it = old_nodes.rbegin(); it != old_nodes.rend(); ++it) { + BT_NODE_LOG(DEBUG, (*it).get(), "Unlocking this node as part of unsuccessful merge"); + unlock_node(*it, locktype_t::WRITE); } - for (auto& node : new_nodes) { - free_node(node, locktype_t::NONE, context); + for (auto it = new_nodes.rbegin(); it != new_nodes.rend(); ++it) { + BT_NODE_LOG(DEBUG, (*it).get(), "Freeing this new node as part of unsuccessful merge"); + free_node(*it, locktype_t::NONE, context); } } return ret; diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index 26e32cc68..f12302183 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -147,8 +147,7 @@ class SimpleNode : public BtreeNode { } uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const override { - uint32_t possible_entries = (size == 0) ? 0 : (size - 1) / get_nth_obj_size(0) + 1; - return std::min(possible_entries, this->total_entries() - start_idx); + return std::min(size / get_nth_obj_size(0), this->total_entries() - start_idx); } uint32_t copy_by_size(const BtreeConfig& cfg, const BtreeNode& o, uint32_t start_idx, uint32_t size) override { diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp index 959884ce2..29a955983 100644 --- a/src/include/homestore/btree/detail/varlen_node.hpp +++ b/src/include/homestore/btree/detail/varlen_node.hpp @@ -166,7 +166,7 @@ class VariableNode : public BtreeNode { get_nth_value(ind_s - 1, &last_1_val, false); this->set_edge_value(last_1_val); - for (uint32_t i = ind_s; i < total_entries; i++) { + for (uint32_t i = ind_s - 1; i < total_entries; i++) { get_var_node_header()->m_available_space += get_nth_key_len(i) + get_nth_value_len(i) + recSize; } this->sub_entries(total_entries - ind_s + 1); @@ -270,15 +270,15 @@ class VariableNode : public BtreeNode { vb.bytes = kb.bytes + kb.size; vb.size = get_nth_value_len(ind); - auto sz = other.insert(0, kb, vb); // Keep on inserting on the first index, thus moving everything to right - if (!sz) break; - - --ind; - ++nmoved; if ((kb.size + vb.size + this->get_record_size()) > size_to_move) { // We reached threshold of how much we could move break; } + + auto sz = other.insert(0, kb, vb); // Keep on inserting on the first index, thus moving everything to right + + --ind; + ++nmoved; size_to_move -= sz; } remove(ind + 1, this->total_entries() - 1); @@ -329,6 +329,7 @@ class VariableNode : public BtreeNode { if (sz == 0) { break; } ++n; ++idx; + copy_size -= sz; } this->set_gen(this_gen + 1); diff --git a/src/tests/btree_test_kvs.hpp b/src/tests/btree_test_kvs.hpp index 21c283bc6..3822f8ec4 100644 --- a/src/tests/btree_test_kvs.hpp +++ b/src/tests/btree_test_kvs.hpp @@ -21,8 +21,8 @@ #include #include -static constexpr uint32_t g_max_keysize{120}; -static constexpr uint32_t g_max_valsize{120}; +static constexpr uint32_t g_max_keysize{100}; // for node size = 512 : free space : 442 => 100+100+6(record size) = 46% +static constexpr uint32_t g_max_valsize{100}; static std::random_device g_rd{}; static std::default_random_engine g_re{g_rd()}; static std::normal_distribution<> g_randkeysize_generator{32, 24}; @@ -49,10 +49,10 @@ static std::string gen_random_string(size_t len, uint32_t preamble = std::numeri static thread_local std::random_device rd{}; static thread_local std::default_random_engine re{rd()}; std::uniform_int_distribution< size_t > rand_char{0, alphanum.size() - 1}; - for (size_t i{0}; i < len; ++i) { + if (len < str.size()) { len = str.size(); } + for (size_t i{0}; i < len - str.size(); ++i) { str += alphanum[rand_char(re)]; } - str += '\0'; return str; } diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 979cdc498..495cb8fd3 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -556,9 +556,7 @@ class BtreeConcurrentTest : public testing::Test { }); preload(SISL_OPTIONS["preload_size"].as< uint32_t >()); - print_keys(); runInParallel(op_list); - print_keys(); } private: