diff --git a/conanfile.py b/conanfile.py
index 4bedf8360..db9fc4a7a 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -5,7 +5,7 @@ class HomestoreConan(ConanFile):
     name = "homestore"
-    version = "4.7.1"
+    version = "4.8.1"
     homepage = "https://github.com/eBay/Homestore"
     description = "HomeStore Storage Engine"
diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp
index 0748b6295..428de8aa9 100644
--- a/src/include/homestore/btree/btree.hpp
+++ b/src/include/homestore/btree/btree.hpp
@@ -63,6 +63,8 @@ class Btree {
         return fiber_map[this_id].get();
     }
 
+    static bool is_repair_needed(const BtreeNodePtr& child_node, const BtreeLinkInfo& child_info);
+
 protected:
     BtreeConfig m_bt_cfg;
diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp
index 38eaf960e..bb4018e05 100644
--- a/src/include/homestore/btree/btree.ipp
+++ b/src/include/homestore/btree/btree.ipp
@@ -346,11 +346,17 @@ template < typename K, typename V >
 bnodeid_t Btree< K, V >::root_node_id() const {
     return m_root_node_info.bnode_id();
 }
+
 template < typename K, typename V >
 uint64_t Btree< K, V >::root_link_version() const {
     return m_root_node_info.link_version();
 }
 
+template < typename K, typename V >
+bool Btree< K, V >::is_repair_needed(const BtreeNodePtr& child_node, const BtreeLinkInfo& child_info) {
+    return child_info.link_version() != child_node->link_version();
+}
+
 // TODO: Commenting out flip till we figure out how to move flip dependency inside sisl package.
 #if 0
 #ifdef _PRERELEASE
diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
index 13c00c7b9..e5c6b832c 100644
--- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
@@ -18,10 +18,6 @@ namespace homestore {
 
-static bool is_repair_needed(const BtreeNodePtr& child_node, const BtreeLinkInfo& child_info) {
-    return child_info.link_version() != child_node->link_version();
-}
-
 /* This function does the heavy lifiting of co-ordinating inserts. It is a recursive function which walks
  * down the tree.
 *
diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp
index 61b9e4308..df0a22773 100644
--- a/src/include/homestore/btree/detail/btree_remove_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp
@@ -53,7 +53,6 @@ btree_status_t Btree< K, V >::do_remove(const BtreeNodePtr& my_node, locktype_t
         return modified ? btree_status_t::success : btree_status_t::not_found;
     }
 
-    // bool go_to_out = false;
 retry:
     locktype_t child_cur_lock = locktype_t::NONE;
     uint32_t curr_idx;
@@ -80,7 +79,6 @@
             }
             end_idx = start_idx = (end_idx - start_idx) / 2; // Pick the middle, TODO: Ideally we need to pick random
         }
-        // if (go_to_out) { goto out_return; }
     if (req.route_tracing) { append_route_trace(req, my_node, btree_event_t::READ, start_idx, end_idx); }
 
     curr_idx = start_idx;
@@ -366,27 +364,39 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
     if (!K::is_fixed_size()) {
         // Lets see if we have enough room in parent node to accommodate changes. This is needed only if the key is not
         // fixed length. For fixed length node merge will always result in less or equal size
+        auto excess_releasing_nodes =
+            old_nodes.size() - new_nodes.size() - ((parent_node->total_entries() == end_idx) ? 1 : 0);
+        if (!parent_node->has_room_for_put(btree_put_type::INSERT, excess_releasing_nodes * K::get_max_size(),
+                                           excess_releasing_nodes * BtreeLinkInfo::get_fixed_size())) {
+            BT_NODE_LOG(DEBUG, parent_node,
+                        "Merge is needed, however after merge, the parent MAY not have enough space to accommodate the "
+                        "new keys, so not proceeding with merge");
+            ret = btree_status_t::merge_not_required;
+            goto out;
+        }
+#if 0
         // we first calculate the least amount of space being released after removing excess children. the key size
         // cannot be taken account; so we know for sure that value (i.e., linkinfo) and also its record will be freed.
         // If the end_idx is the parent's edge, the space is not released eventually.
         auto excess_releasing_nodes =
-            old_nodes.size() - new_nodes.size() - parent_node->total_entries() == end_idx ? 1 : 0;
+            old_nodes.size() - new_nodes.size() - (parent_node->total_entries() == end_idx) ? 1 : 0;
         auto minimum_releasing_excess_size =
             excess_releasing_nodes * (BtreeLinkInfo::get_fixed_size() + parent_node->get_record_size());
 
-        // aside from releasing size due to excess node, K::get_estimate_max_size is needed for each updating element
+        // aside from releasing size due to excess node, K::get_max_size is needed for each updating element
         // at worst case (linkinfo and record remain the same for old and new nodes). The number of updating elements
         // are the size of the new nodes (the last key of the last new node is not getting updated; hence excluded) plus
         // the leftmost node.
-        if (parent_node->available_size(m_bt_cfg) + minimum_releasing_excess_size <
-            (1 + new_nodes.size() ? new_nodes.size() - 1 : 0) * K::get_estimate_max_size()) {
+        if (parent_node->available_size() + minimum_releasing_excess_size <
+            (1 + new_nodes.size() ? new_nodes.size() - 1 : 0) * K::get_max_size()) {
             BT_NODE_LOG(DEBUG, parent_node,
                         "Merge is needed, however after merge, the parent MAY not have enough space to accommodate the "
                         "new keys, so not proceeding with merge");
             ret = btree_status_t::merge_not_required;
             goto out;
         }
+#endif
     }
 
     // Now it is time to commit things and at this point no going back, since in-place write nodes are modified
diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp
index d7c8fcd01..de155f9c5 100644
--- a/src/include/homestore/btree/detail/varlen_node.hpp
+++ b/src/include/homestore/btree/detail/varlen_node.hpp
@@ -467,7 +467,7 @@ class VariableNode : public VariantNode< K, V > {
     bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override {
         auto needed_size = key_size + value_size;
-        if ((put_type == btree_put_type::UPSERT) + (put_type == btree_put_type::INSERT)) {
+        if ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT)) {
             needed_size += get_record_size();
         }
         return (available_size() >= needed_size);
diff --git a/src/lib/blkalloc/bitmap_blk_allocator.cpp b/src/lib/blkalloc/bitmap_blk_allocator.cpp
index bef768684..78f747c08 100644
--- a/src/lib/blkalloc/bitmap_blk_allocator.cpp
+++ b/src/lib/blkalloc/bitmap_blk_allocator.cpp
@@ -127,75 +127,73 @@ void BitmapBlkAllocator::free_on_disk(BlkId const& bid) {
     // this api should be called only on persistent blk allocator
     DEBUG_ASSERT_EQ(is_persistent(), true, "free_on_disk called for non-persistent blk allocator");
-    auto unset_on_disk_bm =
-        [this](auto& b) {
-            BlkAllocPortion& portion = blknum_to_portion(b.blk_num());
-            {
-                auto lock{portion.portion_auto_lock()};
-                if (!hs()->is_initializing()) {
-                    // During recovery we might try to free the entry which is already freed while replaying the
-                    // journal, This assert is valid only post recovery.
-                    if (!m_disk_bm->is_bits_set(b.blk_num(), b.blk_count())) {
-                        BLKALLOC_LOG(ERROR, "bit not set {} nblks {} chunk number {}", b.blk_num(), b.blk_count(),
-                                     m_chunk_id);
-                        for (blk_count_t i{0}; i < b.blk_count(); ++i) {
-                            if (!m_disk_bm->is_bits_set(b.blk_num() + i, 1)) {
-                                BLKALLOC_LOG(ERROR, "bit not set {}", b.blk_num() + i);
-                            }
+    auto unset_on_disk_bm = [this](auto& b) {
+        BlkAllocPortion& portion = blknum_to_portion(b.blk_num());
+        {
+            auto lock{portion.portion_auto_lock()};
+            if (!hs()->is_initializing()) {
+                // During recovery we might try to free the entry which is already freed while replaying the
+                // journal, This assert is valid only post recovery.
+                if (!m_disk_bm->is_bits_set(b.blk_num(), b.blk_count())) {
+                    BLKALLOC_LOG(ERROR, "bit not set {} nblks {} chunk number {}", b.blk_num(), b.blk_count(),
+                                 m_chunk_id);
+                    for (blk_count_t i{0}; i < b.blk_count(); ++i) {
+                        if (!m_disk_bm->is_bits_set(b.blk_num() + i, 1)) {
+                            BLKALLOC_LOG(ERROR, "bit not set {}", b.blk_num() + i);
                         }
-                        BLKALLOC_REL_ASSERT(m_disk_bm->is_bits_set(b.blk_num(), b.blk_count()),
-                                            "Expected disk bits to set blk num {} num blks {}", b.blk_num(),
-                                            b.blk_count());
                     }
-                    m_disk_bm->reset_bits(b.blk_num(), b.blk_count());
-                }
-            };
-
-    if (bid.is_multi()) {
-        MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid);
-        auto it = mbid.iterate();
-        while (auto const b = it.next()) {
-            unset_on_disk_bm(*b);
+                    BLKALLOC_REL_ASSERT(m_disk_bm->is_bits_set(b.blk_num(), b.blk_count()),
+                                        "Expected disk bits to set blk num {} num blks {}", b.blk_num(), b.blk_count());
                 }
-    } else {
-        unset_on_disk_bm(bid);
             }
+            m_disk_bm->reset_bits(b.blk_num(), b.blk_count());
         }
+    };
-    sisl::byte_array
-    BitmapBlkAllocator::acquire_underlying_buffer() {
-        // prepare and temporary alloc list, where blkalloc is accumulated till underlying buffer is released.
-        // RCU will wait for all I/Os that are still in critical section (allocating on disk bm) to complete and exit;
-        auto alloc_list_ptr = new sisl::ThreadVector< BlkId >();
-        auto old_alloc_list_ptr = rcu_xchg_pointer(&m_alloc_blkid_list, alloc_list_ptr);
-        synchronize_rcu();
-
-        BLKALLOC_REL_ASSERT(old_alloc_list_ptr == nullptr, "Multiple acquires concurrently?");
-        return (m_disk_bm->serialize(m_align_size));
-    }
-
-    void BitmapBlkAllocator::release_underlying_buffer() {
-        // set to nullptr, so that alloc will go to disk bm directly
-        // wait for all I/Os in critical section (still accumulating bids) to complete and exit;
-        auto old_alloc_list_ptr = rcu_xchg_pointer(&m_alloc_blkid_list, nullptr);
-        synchronize_rcu();
-
-        // at this point, no I/O will be pushing back to the list (old_alloc_list_ptr);
-        auto it = old_alloc_list_ptr->begin(true /* latest */);
-        const BlkId* bid{nullptr};
-        while ((bid = old_alloc_list_ptr->next(it)) != nullptr) {
-            alloc_on_disk(*bid);
+    if (bid.is_multi()) {
+        MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid);
+        auto it = mbid.iterate();
+        while (auto const b = it.next()) {
+            unset_on_disk_bm(*b);
         }
-        old_alloc_list_ptr->clear();
-        delete (old_alloc_list_ptr);
+    } else {
+        unset_on_disk_bm(bid);
     }
+}
 
-    /* Get status */
-    nlohmann::json BitmapBlkAllocator::get_status(int) const { return nlohmann::json{}; }
+sisl::byte_array BitmapBlkAllocator::acquire_underlying_buffer() {
+    // prepare and temporary alloc list, where blkalloc is accumulated till underlying buffer is released.
+    // RCU will wait for all I/Os that are still in critical section (allocating on disk bm) to complete and exit;
+    auto alloc_list_ptr = new sisl::ThreadVector< BlkId >();
+    auto old_alloc_list_ptr = rcu_xchg_pointer(&m_alloc_blkid_list, alloc_list_ptr);
+    synchronize_rcu();
+
+    BLKALLOC_REL_ASSERT(old_alloc_list_ptr == nullptr, "Multiple acquires concurrently?");
+    return (m_disk_bm->serialize(m_align_size));
+}
-    sisl::ThreadVector< BlkId >* BitmapBlkAllocator::get_alloc_blk_list() {
-        auto p = rcu_dereference(m_alloc_blkid_list);
-        return p;
+void BitmapBlkAllocator::release_underlying_buffer() {
+    // set to nullptr, so that alloc will go to disk bm directly
+    // wait for all I/Os in critical section (still accumulating bids) to complete and exit;
+    auto old_alloc_list_ptr = rcu_xchg_pointer(&m_alloc_blkid_list, nullptr);
+    synchronize_rcu();
+
+    // at this point, no I/O will be pushing back to the list (old_alloc_list_ptr);
+    auto it = old_alloc_list_ptr->begin(true /* latest */);
+    const BlkId* bid{nullptr};
+    while ((bid = old_alloc_list_ptr->next(it)) != nullptr) {
+        alloc_on_disk(*bid);
     }
+    old_alloc_list_ptr->clear();
+    delete (old_alloc_list_ptr);
+}
+
+/* Get status */
+nlohmann::json BitmapBlkAllocator::get_status(int) const { return nlohmann::json{}; }
+
+sisl::ThreadVector< BlkId >* BitmapBlkAllocator::get_alloc_blk_list() {
+    auto p = rcu_dereference(m_alloc_blkid_list);
+    return p;
+}
 } // namespace homestore
diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp
index 99797255b..68bf4003d 100644
--- a/src/tests/test_mem_btree.cpp
+++ b/src/tests/test_mem_btree.cpp
@@ -289,44 +289,12 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType > {
     BtreeConcurrentTest() { this->m_is_multi_threaded = true; }
 
     void SetUp() override {
-        m_cfg.m_leaf_node_type = T::leaf_node_type;
-        m_cfg.m_int_node_type = T::interior_node_type;
-        m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >();
-        if (SISL_OPTIONS.count("disable_merge")) m_cfg.m_merge_turned_on = false;
-        m_fibers.clear();
-        m_bt = std::make_unique< typename T::BtreeType >(m_cfg);
-        m_bt->init(nullptr);
-    }
-
-    void TearDown() override { iomanager.stop(); }
-
-    void print() const { m_bt->print_tree(); }
-    void print_keys() const { m_bt->print_tree_keys(); }
-
-    void execute(const std::vector< std::pair< std::string, int > >& op_list) {
-
-        m_cfg.m_leaf_node_type = T::leaf_node_type;
-        m_cfg.m_int_node_type = T::interior_node_type;
-        m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >();
-        if (SISL_OPTIONS.count("disable_merge")) m_cfg.m_merge_turned_on = false;
-        m_fibers.clear();
-        m_bt = std::make_unique< typename T::BtreeType >(m_cfg);
-        m_bt->init(nullptr);
-    }
-
-    void TearDown() override { iomanager.stop(); }
-
-    void print() const { m_bt->print_tree(); }
-    void print_keys() const { m_bt->print_tree_keys(); }
-
-    void execute(const std::vector< std::pair< std::string, int > >& op_list) {
         LOGINFO("Starting iomgr with {} threads", SISL_OPTIONS["n_threads"].as< uint32_t >());
         ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = SISL_OPTIONS["n_threads"].as< uint32_t >(),
-                                                     .is_spdk = false,
                                                      .is_spdk = false,
                                                      .num_fibers = 1 + SISL_OPTIONS["n_fibers"].as< uint32_t >(),
                                                      .app_mem_size_mb = 0,
-                                                     0});
+                                                     .hugepage_size_mb = 0});
         BtreeTestHelper< TestType >::SetUp();
         this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg);
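
A note on the excess_releasing_nodes computation in the merge_nodes() change above: in C++ the conditional operator binds more loosely than subtraction, so the parent-edge guard needs to be parenthesized as a whole for it to subtract at most one node from the count of released children. A minimal standalone sketch (hypothetical counts, not HomeStore code) contrasting the two groupings:

    #include <cstddef>
    #include <iostream>

    int main() {
        std::size_t old_nodes = 3, new_nodes = 1;
        bool end_idx_is_parent_edge = false;

        // Ungrouped: ?: applies to the whole subtraction, collapsing the count to 0 or 1.
        std::size_t ungrouped = old_nodes - new_nodes - end_idx_is_parent_edge ? 1 : 0;
        // Grouped: the edge case subtracts at most one released child from the count.
        std::size_t grouped = old_nodes - new_nodes - (end_idx_is_parent_edge ? 1 : 0);

        std::cout << ungrouped << " vs " << grouped << "\n"; // prints "1 vs 2"
    }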