From 52dcbd60733118c16fe61cbb39e35f8092600b24 Mon Sep 17 00:00:00 2001
From: Harihara Kadayam
Date: Sat, 21 Oct 2023 09:15:07 -0700
Subject: [PATCH] Changes to support PrefixNode and consolidation of tree

Following are the major changes in the btree library:

1. Added new BtreeKey and BtreeValue types, BtreeIntervalKey and BtreeIntervalValue, which allow a key to be part of a larger interval (say interval 1-3, which results in key1, key2, key3, with each key being a BtreeIntervalKey).

2. Implemented a new tree node type called PrefixNode, which stores interval keys and values compactly by splitting each entry into a shared prefix and a per-entry suffix.

3. Removed the unused Extent code and replaced it with the IntervalKey/Value mentioned above.

4. Added filter callbacks to the read/write/remove operations, so that the consumer can decide, per key/value, how an entry is to be modified. This effectively provides atomic compare_and_set and compare_and_remove operations. This change also removes the previous on_read_cb and similar callbacks, which were passed into every operation.

5. Removed the requirement for BtreeKey and BtreeValue to implement static methods such as get_key_size(), by creating a dummy key/value internally. Implementors of a btree key and value now only need to ensure the types are default constructible (which is already the case today).

6. Created new internal btree node APIs such as multi_put and multi_get, instead of each btree operation repeating similar read/remove code for leaf nodes. As part of this, a match_range() method is introduced in the btree node, which consolidates the searching required by all range operations. Most of the common operations moved into the VariantNode class, which now derives from BtreeNode; PrefixNode derives from VariantNode and implements its own versions of multi_put and the other range operations.

7. Removed obsolete btree return statuses.

8. Consolidated the mem_btree test, the mem_btree concurrent test and the index_btree test into one btree_test_helper library. Changed the way range_scheduler picks existing and working keys, from boost::icl to sisl::Bitset, which simplifies the code and also provides an essential feature: it always picks some keys to schedule instead of failing.
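
For illustration, below is a minimal consumer-side sketch against the new BtreeIntervalKey interface (not part of this patch). The SampleIntervalKey name, the 32-bit prefix / 32-bit suffix layout, and the sisl::blob {bytes, size} view are assumptions made for the example:

    #include <cstdint>
    #include <cstring>
    #include <string>

    #include <sisl/fds/buffer.hpp>          // sisl::blob (assumed header location)
    #include <homestore/btree/btree_kv.hpp> // BtreeIntervalKey

    // 64-bit key split as: upper 32 bits = interval prefix (stored once per
    // interval in a PrefixNode), lower 32 bits = per-entry suffix.
    class SampleIntervalKey : public homestore::BtreeIntervalKey {
    private:
        uint32_t m_prefix{0};
        uint32_t m_suffix{0};

    public:
        SampleIntervalKey() = default;
        SampleIntervalKey(uint32_t prefix, uint32_t suffix) : m_prefix{prefix}, m_suffix{suffix} {}

        // BtreeKey overrides
        int compare(homestore::BtreeKey const& o) const override {
            auto const& other = static_cast< SampleIntervalKey const& >(o);
            if (m_prefix != other.m_prefix) { return (m_prefix < other.m_prefix) ? -1 : 1; }
            return (m_suffix < other.m_suffix) ? -1 : (m_suffix > other.m_suffix) ? 1 : 0;
        }
        sisl::blob serialize() const override {
            // m_prefix and m_suffix are laid out contiguously, so expose both as one blob
            return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint32_t* >(&m_prefix)),
                              2 * sizeof(uint32_t)};
        }
        uint32_t serialized_size() const override { return 2 * sizeof(uint32_t); }
        void deserialize(sisl::blob const& b, bool /* copy */) override {
            std::memcpy(&m_prefix, b.bytes, sizeof(uint32_t));
            std::memcpy(&m_suffix, b.bytes + sizeof(uint32_t), sizeof(uint32_t));
        }
        std::string to_string() const override {
            return std::to_string(m_prefix) + ":" + std::to_string(m_suffix);
        }

        // Still referenced by the traversal state in this patch
        static bool is_fixed_size() { return true; }
        static uint32_t get_max_size() { return 2 * sizeof(uint32_t); }

        // BtreeIntervalKey overrides
        void shift(int n) override { m_suffix += n; } // n-th next (or previous) key of the interval
        int distance(homestore::BtreeKey const& from) const override {
            return static_cast< int >(m_suffix) -
                static_cast< int >(static_cast< SampleIntervalKey const& >(from).m_suffix);
        }
        sisl::blob serialize_prefix() const override {
            return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint32_t* >(&m_prefix)),
                              sizeof(uint32_t)};
        }
        sisl::blob serialize_suffix() const override {
            return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint32_t* >(&m_suffix)),
                              sizeof(uint32_t)};
        }
        uint32_t serialized_prefix_size() const override { return sizeof(uint32_t); }
        uint32_t serialized_suffix_size() const override { return sizeof(uint32_t); }
        void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool /* copy */) override {
            std::memcpy(&m_prefix, prefix.bytes, sizeof(uint32_t));
            std::memcpy(&m_suffix, suffix.bytes, sizeof(uint32_t));
        }
    };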
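
A sketch of how the new filter callbacks yield compare_and_set / compare_and_remove: the filter runs while the leaf is locked, which is what makes the check-and-mutate atomic. SampleValue, the value-argument order (existing value before new value) and the put()/remove() request entry points are assumptions for the example:

    #include <homestore/btree/btree.hpp>

    // Hypothetical value type for the example (equality is what compare-and-set needs).
    class SampleValue : public homestore::BtreeValue {
    public:
        uint64_t m_val{0};
        SampleValue() = default;
        explicit SampleValue(uint64_t v) : m_val{v} {}
        bool operator==(SampleValue const& o) const { return m_val == o.m_val; }
        sisl::blob serialize() const override {
            return sisl::blob{reinterpret_cast< uint8_t* >(const_cast< uint64_t* >(&m_val)), sizeof(m_val)};
        }
        uint32_t serialized_size() const override { return sizeof(m_val); }
        void deserialize(sisl::blob const& b, bool /* copy */) override {
            std::memcpy(&m_val, b.bytes, sizeof(m_val));
        }
    };

    void compare_and_set_and_remove(homestore::Btree< SampleIntervalKey, SampleValue >& bt) {
        SampleIntervalKey k{1, 5};
        SampleValue expected{42}, new_v{43};

        // Compare-and-set: replace only if the current value equals 'expected'.
        // Assumed argument order of the filter: (key, existing value, new value).
        homestore::BtreeSinglePutRequest preq{
            &k, &new_v, homestore::btree_put_type::UPDATE, nullptr /* existing_val */,
            [&expected](homestore::BtreeKey const&, homestore::BtreeValue const& existing,
                        homestore::BtreeValue const&) {
                return (static_cast< SampleValue const& >(existing) == expected)
                    ? homestore::put_filter_decision::replace
                    : homestore::put_filter_decision::keep;
            }};
        auto ret = bt.put(preq);

        // Compare-and-remove over a range: assumed that returning true removes the entry.
        homestore::BtreeRangeRemoveRequest< SampleIntervalKey > rreq{
            homestore::BtreeKeyRange< SampleIntervalKey >{SampleIntervalKey{1, 0}, SampleIntervalKey{1, 9}},
            nullptr /* app_context */, UINT32_MAX /* batch_size */,
            [&expected](homestore::BtreeKey const&, homestore::BtreeValue const& v) {
                return (static_cast< SampleValue const& >(v) == expected);
            }};
        ret = bt.remove(rreq);
        (void)ret;
    }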
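
Similarly, a paginated query with the new read filter. With this change a query advances the request's own working range (shift_working_range) instead of maintaining a separate cursor, so the caller just re-issues the same request until it stops returning has_more. The filter semantics (return true to include an entry) and the loop shape are assumptions for illustration:

    #include <utility>
    #include <vector>

    void filtered_query(homestore::Btree< SampleIntervalKey, SampleValue >& bt) {
        homestore::BtreeQueryRequest< SampleIntervalKey > qreq{
            homestore::BtreeKeyRange< SampleIntervalKey >{SampleIntervalKey{1, 0}, SampleIntervalKey{1, 99}},
            homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, 100 /* batch_size */,
            [](homestore::BtreeKey const&, homestore::BtreeValue const& v) {
                return (static_cast< SampleValue const& >(v).m_val != 0); // true => include entry
            }};

        std::vector< std::pair< SampleIntervalKey, SampleValue > > batch;
        while (true) {
            batch.clear();
            auto const ret = bt.query(qreq, batch);
            // ... consume batch ...
            if (ret != homestore::btree_status_t::has_more) { break; }
        }
    }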
--- CMakeLists.txt | 2 +- conanfile.py | 2 +- src/include/homestore/btree/btree.hpp | 20 +- src/include/homestore/btree/btree.ipp | 22 +- src/include/homestore/btree/btree_kv.hpp | 255 ++---- src/include/homestore/btree/btree_req.hpp | 69 +- .../homestore/btree/detail/btree_common.ipp | 33 +- .../homestore/btree/detail/btree_get_impl.ipp | 5 +- .../homestore/btree/detail/btree_internal.hpp | 7 +- .../btree/detail/btree_mutate_impl.ipp | 352 +------- .../homestore/btree/detail/btree_node.hpp | 289 ++---- .../homestore/btree/detail/btree_node_mgr.ipp | 12 +- .../btree/detail/btree_query_impl.ipp | 32 +- .../btree/detail/btree_remove_impl.ipp | 163 +--- .../homestore/btree/detail/prefix_node.hpp | 828 ++++++++++++++++++ .../homestore/btree/detail/simple_node.hpp | 119 +-- .../homestore/btree/detail/variant_node.hpp | 311 +++++++ .../homestore/btree/detail/varlen_node.hpp | 142 ++- src/include/homestore/index/index_table.hpp | 11 +- src/tests/btree_helpers/btree_test_helper.hpp | 383 ++++++++ .../{ => btree_helpers}/btree_test_kvs.hpp | 233 ++++- src/tests/btree_helpers/shadow_map.hpp | 96 ++ src/tests/test_btree_node.cpp | 103 ++- src/tests/test_common/range_scheduler.hpp | 253 +++--- src/tests/test_index_btree.cpp | 297 ++----- src/tests/test_mem_btree.cpp | 689 ++------------- 26 files changed, 2562 insertions(+), 2166 deletions(-) create mode 100644 src/include/homestore/btree/detail/prefix_node.hpp create mode 100644 src/include/homestore/btree/detail/variant_node.hpp create mode 100644 src/tests/btree_helpers/btree_test_helper.hpp rename src/tests/{ => btree_helpers}/btree_test_kvs.hpp (56%) create mode 100644 src/tests/btree_helpers/shadow_map.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index bd27696ae..a9af184e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ endif() if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") message(STATUS "Debug build") - add_flags("-DDEBUG_RCU") + add_flags("-DDEBUG_RCU -D_DEBUG") else() message(STATUS "Release build") if((${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") OR (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")) diff --git a/conanfile.py b/conanfile.py index 1e7364867..1e18df643 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.5.5" + version = "4.6.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index 63056c413..0748b6295 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -31,10 +31,6 @@ SISL_LOGGING_DECL(btree) namespace homestore { -typedef std::function< bool(const BtreeKey&, const BtreeValue&, const BtreeRequest&) > on_kv_read_t; -typedef std::function< bool(const BtreeKey&, const BtreeValue&, const BtreeRequest&) > on_kv_remove_t; -typedef std::function< bool(const BtreeKey&, const BtreeKey&, const BtreeValue&, const BtreeRequest&) > on_kv_update_t; - using BtreeNodePtr = boost::intrusive_ptr< BtreeNode >; struct BtreeThreadVariables { @@ -57,11 +53,6 @@ class Btree { std::atomic< uint64_t > m_req_id{0}; #endif - // Optional callback on various read or kv operations - on_kv_read_t m_on_read_cb{nullptr}; - on_kv_update_t m_on_update_cb{nullptr}; - on_kv_remove_t m_on_remove_cb{nullptr}; - // This workaround of BtreeThreadVariables is needed instead of directly declaring statics // to overcome the gcc bug, pointer here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66944 static 
BtreeThreadVariables* bt_thread_vars() { @@ -77,8 +68,7 @@ class Btree { public: /////////////////////////////////////// All External APIs ///////////////////////////// - Btree(const BtreeConfig& cfg, on_kv_read_t&& read_cb = nullptr, on_kv_update_t&& update_cb = nullptr, - on_kv_remove_t&& remove_cb = nullptr); + Btree(const BtreeConfig& cfg); virtual ~Btree(); virtual btree_status_t init(void* op_context); @@ -174,10 +164,6 @@ class Btree { void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void print_node(const bnodeid_t& bnodeid) const; - bool call_on_read_kv_cb(const BtreeNodePtr& node, uint32_t idx, const BtreeRequest& req) const; - bool call_on_remove_kv_cb(const BtreeNodePtr& node, uint32_t idx, const BtreeRequest& req) const; - bool call_on_update_kv_cb(const BtreeNodePtr& node, uint32_t idx, const BtreeKey& new_key, - const BtreeRequest& req) const; void append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event, uint32_t start_idx = 0, uint32_t end_idx = 0) const; @@ -194,10 +180,10 @@ class Btree { btree_status_t check_split_root(ReqT& req); template < typename ReqT > - bool is_split_needed(const BtreeNodePtr& node, const BtreeConfig& cfg, ReqT& req) const; + bool is_split_needed(const BtreeNodePtr& node, ReqT& req) const; btree_status_t split_node(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, uint32_t parent_ind, - BtreeKey* out_split_key, void* context); + K* out_split_key, void* context); btree_status_t mutate_extents_in_leaf(const BtreeNodePtr& my_node, BtreeRangePutRequest< K >& rpreq); btree_status_t repair_split(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node1, uint32_t parent_split_idx, void* context); diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp index a3bd1ee02..b9dcbf398 100644 --- a/src/include/homestore/btree/btree.ipp +++ b/src/include/homestore/btree/btree.ipp @@ -36,14 +36,8 @@ namespace homestore { template < typename K, typename V > -Btree< K, V >::Btree(const BtreeConfig& cfg, on_kv_read_t&& read_cb, on_kv_update_t&& update_cb, - on_kv_remove_t&& remove_cb) : - m_metrics{cfg.name().c_str()}, - m_node_size{cfg.node_size()}, - m_on_read_cb{std::move(read_cb)}, - m_on_update_cb{std::move(update_cb)}, - m_on_remove_cb{std::move(remove_cb)}, - m_bt_cfg{cfg} { +Btree< K, V >::Btree(const BtreeConfig& cfg) : + m_metrics{cfg.name().c_str()}, m_node_size{cfg.node_size()}, m_bt_cfg{cfg} { m_bt_cfg.set_node_data_size(cfg.node_size() - sizeof(persistent_hdr_t)); } @@ -105,7 +99,7 @@ retry: if (ret != btree_status_t::success) { goto out; } is_leaf = root->is_leaf(); - if (is_split_needed(root, m_bt_cfg, put_req)) { + if (is_split_needed(root, put_req)) { // Time to do the split of root. 
unlock_node(root, acq_lock); m_btree_lock.unlock_shared(); @@ -143,8 +137,7 @@ out: #ifndef NDEBUG check_lock_debug(); #endif - if (ret != btree_status_t::success && ret != btree_status_t::fast_path_not_possible && - ret != btree_status_t::cp_mismatch) { + if (ret != btree_status_t::success && ret != btree_status_t::cp_mismatch) { BT_LOG(ERROR, "btree put failed {}", ret); COUNTER_INCREMENT(m_metrics, write_err_cnt, 1); } @@ -267,9 +260,9 @@ btree_status_t Btree< K, V >::query(BtreeQueryRequest< K >& qreq, std::vector< s if ((qreq.query_type() == BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY || qreq.query_type() == BtreeQueryType::TREE_TRAVERSAL_QUERY)) { if (out_values.size()) { - K& out_last_key = out_values.back().first; - qreq.set_cursor_key(out_last_key); + K out_last_key = out_values.back().first; if (out_last_key.compare(qreq.input_range().end_key()) >= 0) { ret = btree_status_t::success; } + qreq.shift_working_range(std::move(out_last_key), false /* non inclusive*/); } else { DEBUG_ASSERT_NE(ret, btree_status_t::has_more, "Query returned has_more, but no values added") } @@ -280,8 +273,7 @@ out: #ifndef NDEBUG check_lock_debug(); #endif - if (ret != btree_status_t::success && ret != btree_status_t::has_more && - ret != btree_status_t::fast_path_not_possible) { + if ((ret != btree_status_t::success) && (ret != btree_status_t::has_more)) { BT_LOG(ERROR, "btree query failed {}", ret); COUNTER_INCREMENT(m_metrics, query_err_cnt, 1); } diff --git a/src/include/homestore/btree/btree_kv.hpp b/src/include/homestore/btree/btree_kv.hpp index d54db46fd..18dd832a8 100644 --- a/src/include/homestore/btree/btree_kv.hpp +++ b/src/include/homestore/btree/btree_kv.hpp @@ -31,11 +31,10 @@ ENUM(MultiMatchOption, uint16_t, ) ENUM(btree_put_type, uint16_t, - INSERT_ONLY_IF_NOT_EXISTS, // Insert - REPLACE_ONLY_IF_EXISTS, // Update - REPLACE_IF_EXISTS_ELSE_INSERT, // Upsert - APPEND_ONLY_IF_EXISTS, // Update - APPEND_IF_EXISTS_ELSE_INSERT) + INSERT, // Insert only if it doesn't exist + UPDATE, // Update only if it exists + UPSERT // Update if exists, insert otherwise +) // The base class, btree library expects its key to be derived from class BtreeKey { @@ -45,23 +44,33 @@ class BtreeKey { // Deleting copy constructor forces the derived class to define its own copy constructor // BtreeKey(const BtreeKey& other) = delete; // BtreeKey(const sisl::blob& b) = delete; - BtreeKey(const BtreeKey& other) = default; + BtreeKey(BtreeKey const& other) = default; virtual ~BtreeKey() = default; - virtual BtreeKey& operator=(const BtreeKey& other) { - clone(other); - return *this; - }; - - virtual void clone(const BtreeKey& other) = 0; - virtual int compare(const BtreeKey& other) const = 0; + virtual int compare(BtreeKey const& other) const = 0; virtual sisl::blob serialize() const = 0; virtual uint32_t serialized_size() const = 0; - virtual void deserialize(const sisl::blob& b, bool copy) = 0; + virtual void deserialize(sisl::blob const& b, bool copy) = 0; virtual std::string to_string() const = 0; - virtual bool is_extent_key() const { return false; } + virtual bool is_interval_key() const { return false; } +}; + +// An extension of BtreeKey where each key is part of an interval range. 
Keys are not neccessarily only needs to be +// integers, but it needs to be able to get next or prev key from a given key in the key range +class BtreeIntervalKey : public BtreeKey { +public: + virtual void shift(int n) = 0; + virtual int distance(BtreeKey const& from) const = 0; + bool is_interval_key() const override { return true; } + + virtual sisl::blob serialize_prefix() const = 0; + virtual sisl::blob serialize_suffix() const = 0; + + virtual uint32_t serialized_prefix_size() const = 0; + virtual uint32_t serialized_suffix_size() const = 0; + virtual void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool copy) = 0; }; template < typename K > @@ -69,13 +78,9 @@ class BtreeTraversalState; template < typename K > class BtreeKeyRange { -private: - K m_actual_start_key; - K m_actual_end_key; - public: - K* m_input_start_key{&m_actual_start_key}; - K* m_input_end_key{&m_actual_end_key}; + K m_start_key; + K m_end_key; bool m_start_incl{true}; bool m_end_incl{true}; MultiMatchOption m_multi_selector{MultiMatchOption::DO_NOT_CARE}; @@ -85,68 +90,35 @@ class BtreeKeyRange { public: BtreeKeyRange() = default; - BtreeKeyRange(const K& start_key, bool start_incl = true) : - m_actual_start_key{start_key}, - m_input_start_key{&m_actual_start_key}, - m_input_end_key{&m_actual_start_key}, - m_start_incl{start_incl}, - m_end_incl{true}, - m_multi_selector{MultiMatchOption::DO_NOT_CARE} {} - BtreeKeyRange(const K& start_key, bool start_incl, const K& end_key, bool end_incl = true, MultiMatchOption option = MultiMatchOption::DO_NOT_CARE) : - m_actual_start_key{start_key}, - m_actual_end_key{end_key}, - m_input_start_key{&m_actual_start_key}, - m_input_end_key{&m_actual_end_key}, + m_start_key{start_key}, + m_end_key{end_key}, m_start_incl{start_incl}, m_end_incl{end_incl}, m_multi_selector{option} {} BtreeKeyRange(const K& start_key, const K& end_key) : BtreeKeyRange(start_key, true, end_key, true) {} - BtreeKeyRange(const BtreeKeyRange& other) { copy(other); } - BtreeKeyRange(BtreeKeyRange&& other) { do_move(std::move(other)); } - BtreeKeyRange& operator=(const BtreeKeyRange< K >& other) { - this->copy(other); - return *this; - } - BtreeKeyRange& operator=(BtreeKeyRange< K >&& other) { - this->do_move(std::move(other)); - return *this; - } - - void copy(const BtreeKeyRange< K >& other) { - m_actual_start_key = other.m_actual_start_key; - m_actual_end_key = other.m_actual_end_key; - m_input_start_key = &m_actual_start_key; - m_input_end_key = - (other.m_input_end_key == &other.m_actual_start_key) ? &m_actual_start_key : &m_actual_end_key; - m_start_incl = other.m_start_incl; - m_end_incl = other.m_end_incl; - m_multi_selector = other.m_multi_selector; - } - - void do_move(BtreeKeyRange< K >&& other) { - m_input_start_key = &m_actual_start_key; - m_input_end_key = - (other.m_input_end_key == &other.m_actual_start_key) ? 
&m_actual_start_key : &m_actual_end_key; - m_actual_start_key = std::move(other.m_actual_start_key); - m_actual_end_key = std::move(other.m_actual_end_key); - m_start_incl = std::move(other.m_start_incl); - m_end_incl = std::move(other.m_end_incl); - m_multi_selector = std::move(other.m_multi_selector); - } + BtreeKeyRange(const BtreeKeyRange& other) = default; + BtreeKeyRange(BtreeKeyRange&& other) = default; + BtreeKeyRange& operator=(const BtreeKeyRange< K >& other) = default; + BtreeKeyRange& operator=(BtreeKeyRange< K >&& other) = default; void set_multi_option(MultiMatchOption o) { m_multi_selector = o; } - const K& start_key() const { return *m_input_start_key; } - const K& end_key() const { return *m_input_end_key; } + const K& start_key() const { return m_start_key; } + const K& end_key() const { return m_end_key; } bool is_start_inclusive() const { return m_start_incl; } bool is_end_inclusive() const { return m_end_incl; } MultiMatchOption multi_option() const { return m_multi_selector; } + void set_start_key(K&& key, bool incl) { + m_start_key = std::move(key); + m_start_incl = incl; + } + void set_end_key(K&& key, bool incl) { - m_actual_end_key = std::move(key); + m_end_key = std::move(key); m_end_incl = incl; } @@ -154,52 +126,6 @@ class BtreeKeyRange { return fmt::format("{}{}-{}{}", is_start_inclusive() ? '[' : '(', start_key().to_string(), end_key().to_string(), is_end_inclusive() ? ']' : ')'); } - -private: - const K& actual_start_key() const { return m_actual_start_key; } - const K& actual_end_key() const { return m_actual_end_key; } -}; - -/* - * This type is for keys which is range in itself i.e each key is having its own - * start() and end(). - */ -template < typename K > -class ExtentBtreeKey : public BtreeKey { -public: - ExtentBtreeKey() = default; - virtual ~ExtentBtreeKey() = default; - virtual bool is_extent_key() const { return true; } - - // Provide the length of the extent key, which is end - start + 1 - virtual uint32_t extent_length() const = 0; - - // Get the distance between the start of this key and start of other key. It returns equivalent of - // (other.start - this->start + 1) - virtual int64_t distance_start(const ExtentBtreeKey< K >& other) const = 0; - - // Get the distance between the end of this key and end of other key. It returns equivalent of - // (other.end - this->end + 1) - virtual int64_t distance_end(const ExtentBtreeKey< K >& other) const = 0; - - // Get the distance between the start of this key and end of other key. 
It returns equivalent of - // (other.end - this->start + 1) - virtual int64_t distance(const ExtentBtreeKey< K >& other) const = 0; - - // Extract a new extent key from the given offset upto this length from this key and optionally do a deep copy - virtual K extract(uint32_t offset, uint32_t length, bool copy) const = 0; - - // Merge this extent btree key with other extent btree key and return a new key - virtual K combine(const ExtentBtreeKey< K >& other) const = 0; - - // TODO: Evaluate if we need these 3 methods or we can manage with other methods - virtual int compare_start(const BtreeKey& other) const = 0; - virtual int compare_end(const BtreeKey& other) const = 0; - - /* we always compare the end key in case of extent */ - virtual int compare(const BtreeKey& other) const override { return (compare_end(other)); } - - K extract_end(bool copy) const { return extract(extent_length() - 1, 1, copy); } }; class BtreeValue { @@ -207,9 +133,6 @@ class BtreeValue { BtreeValue() = default; virtual ~BtreeValue() = default; - // Deleting copy constructor forces the derived class to define its own copy constructor - BtreeValue(const BtreeValue& other) = delete; - virtual sisl::blob serialize() const = 0; virtual uint32_t serialized_size() const = 0; virtual void deserialize(const sisl::blob& b, bool copy) = 0; @@ -217,30 +140,16 @@ class BtreeValue { virtual std::string to_string() const { return ""; } }; -template < typename V > -class ExtentBtreeValue : public BtreeValue { +class BtreeIntervalValue : public BtreeValue { public: - virtual ~ExtentBtreeValue() = default; - - // Extract a new extent value from the given offset upto this length from this value and optionally do a deep copy - virtual V extract(uint32_t offset, uint32_t length, bool copy) const = 0; + virtual void shift(int n) = 0; - // Returns the returns the serialized size if we were to extract other value from offset upto length - // This method is equivalent to: extract(offset, length, false).serialized_size() - // However, this method provides values to directly compute the extracted size without extracting - which is more - // efficient. - virtual uint32_t extracted_size(uint32_t offset, uint32_t length) const = 0; + virtual sisl::blob serialize_prefix() const = 0; + virtual sisl::blob serialize_suffix() const = 0; - // This method is similar to extract(0, length) along with moving the current values start to length. So for example - // if value has 0-100 and if shift(80) is called, this method returns a value from 0-79 and moves the start offset - // of current value to 80. - virtual V shift(uint32_t length, bool copy) = 0; - - // Given the length, report back how many extents from the current value can fit. - virtual uint32_t num_extents_fit(uint32_t length) const = 0; - - // Returns if every piece of extents are equally sized. 
- virtual bool is_equal_sized() const = 0; + virtual uint32_t serialized_prefix_size() const = 0; + virtual uint32_t serialized_suffix_size() const = 0; + virtual void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool copy) = 0; }; struct BtreeLockTracker; @@ -262,53 +171,61 @@ class BtreeTraversalState { protected: const BtreeKeyRange< K > m_input_range; BtreeKeyRange< K > m_working_range; - BtreeKeyRange< K > m_next_range; - std::unique_ptr< BtreeQueryCursor< K > > m_cursor; + bool m_trimmed{false}; // Keep track of trimmed, so that a shift doesn't do unwanted copy of input_range + bool m_exhausted{false}; // The entire working range is exhausted public: - BtreeTraversalState(BtreeKeyRange< K >&& inp_range, bool paginated_query = false) : - m_input_range{std::move(inp_range)}, m_working_range{m_input_range} { - if (paginated_query) { m_cursor = std::make_unique< BtreeQueryCursor< K > >(); } - } + BtreeTraversalState(BtreeKeyRange< K >&& inp_range) : + m_input_range{std::move(inp_range)}, m_working_range{m_input_range} {} BtreeTraversalState(const BtreeTraversalState& other) = default; BtreeTraversalState(BtreeTraversalState&& other) = default; - const BtreeQueryCursor< K >* const_cursor() const { return m_cursor.get(); } - BtreeQueryCursor< K >* cursor() { return m_cursor.get(); } - bool is_cursor_valid() const { return (m_cursor != nullptr); } - - void set_cursor_key(const K& end_key) { - // no need to set cursor as user doesn't want to keep track of it - if (!m_cursor) { return; } - m_cursor->m_last_key = std::make_unique< K >(end_key); - } - const BtreeKeyRange< K >& input_range() const { return m_input_range; } - const BtreeKeyRange< K >& working_range() const { return m_working_range; } + const BtreeKeyRange< K >& working_range() const { + DEBUG_ASSERT_EQ(m_exhausted, false, "requested for working range on an exhausted traversal state"); + return m_working_range; + } // Returns the mutable reference to the end key, which caller can update it to trim down the end key - void trim_working_range(K&& end_key, bool end_incl) { m_working_range.set_end_key(std::move(end_key), end_incl); } - - const K& next_key() const { - return (m_cursor && m_cursor->m_last_key) ? *m_cursor->m_last_key : m_input_range.start_key(); + void trim_working_range(K&& end_key, bool end_incl) { + m_working_range.set_end_key(std::move(end_key), end_incl); + m_trimmed = true; } - const BtreeKeyRange< K >& next_range() { - if (m_cursor && m_cursor->m_last_key) { - m_next_range = BtreeKeyRange< K >(*m_cursor->m_last_key, false, m_input_range.end_key(), is_end_inclusive(), - m_input_range.multi_option()); - return m_next_range; + // Shift the working range start to previous working range end_key + void shift_working_range() { + if (m_trimmed) { + m_working_range.set_start_key(std::move(m_working_range.m_end_key), false); + m_working_range.m_end_key = m_input_range.end_key(); + m_working_range.m_end_incl = m_input_range.is_end_inclusive(); + m_trimmed = false; } else { - return m_input_range; + m_exhausted = true; } } -private: - bool is_start_inclusive() const { - // cursor always have the last key not included - return (m_cursor && m_cursor->m_last_key) ? 
false : m_input_range.is_start_inclusive(); + // Shift the working range start to specific end key + void shift_working_range(K&& start_key, bool start_incl) { + m_working_range.set_start_key(std::move(start_key), start_incl); + if (m_trimmed) { + m_working_range.m_end_key = m_input_range.end_key(); + m_working_range.m_end_incl = m_input_range.is_end_inclusive(); + m_trimmed = false; + } } + const K& first_key() const { return m_working_range.start_key(); } + + uint32_t first_key_size() const { + if (is_start_inclusive() || K::is_fixed_size()) { + return m_working_range.start_key().serialized_size(); + } else { + return K::get_max_size(); + } + } + +private: + bool is_start_inclusive() const { return m_input_range.is_start_inclusive(); } bool is_end_inclusive() const { return m_input_range.is_end_inclusive(); } }; diff --git a/src/include/homestore/btree/btree_req.hpp b/src/include/homestore/btree/btree_req.hpp index 6684c0be3..4e28dec8e 100644 --- a/src/include/homestore/btree/btree_req.hpp +++ b/src/include/homestore/btree/btree_req.hpp @@ -55,31 +55,24 @@ struct BtreeRangeRequest : public BtreeRequest { uint32_t batch_size() const { return m_batch_size; } void set_batch_size(uint32_t count) { m_batch_size = count; } - bool is_empty_cursor() const { - return ((m_search_state.const_cursor()->m_last_key == nullptr) && - (m_search_state.const_cursor()->m_locked_nodes == nullptr)); - } - BtreeTraversalState< K >& search_state() { return m_search_state; } - BtreeQueryCursor< K >* cursor() { return m_search_state.cursor(); } - const BtreeQueryCursor< K >* const_cursor() const { return m_search_state.const_cursor(); } - const BtreeKeyRange< K >& input_range() const { return m_search_state.input_range(); } - const BtreeKeyRange< K >& next_range() { return m_search_state.next_range(); } + void shift_working_range(K&& start_key, bool start_incl) { + m_search_state.shift_working_range(std::move(start_key), start_incl); + } + void shift_working_range() { m_search_state.shift_working_range(); } const BtreeKeyRange< K >& working_range() const { return m_search_state.working_range(); } - const K& next_key() const { return m_search_state.next_key(); } + const K& first_key() const { return m_search_state.first_key(); } + uint32_t first_key_size() const { return m_search_state.first_key_size(); } + void trim_working_range(K&& end_key, bool end_incl) { m_search_state.trim_working_range(std::move(end_key), end_incl); } - void set_cursor_key(const K& end_key) { return m_search_state.set_cursor_key(end_key); } protected: - BtreeRangeRequest(BtreeKeyRange< K >&& input_range, bool external_pagination = false, void* app_context = nullptr, - uint32_t batch_size = UINT32_MAX) : - BtreeRequest{app_context, nullptr}, - m_search_state{std::move(input_range), external_pagination}, - m_batch_size{batch_size} {} + BtreeRangeRequest(BtreeKeyRange< K >&& input_range, void* app_context = nullptr, uint32_t batch_size = UINT32_MAX) : + BtreeRequest{app_context, nullptr}, m_search_state{std::move(input_range)}, m_batch_size{batch_size} {} private: BtreeTraversalState< K > m_search_state; @@ -87,11 +80,14 @@ struct BtreeRangeRequest : public BtreeRequest { }; /////////////////////////// 1: Put Operations ///////////////////////////////////// +ENUM(put_filter_decision, uint8_t, keep, replace, remove); +using put_filter_cb_t = std::function< put_filter_decision(BtreeKey const&, BtreeValue const&, BtreeValue const&) >; + struct BtreeSinglePutRequest : public BtreeRequest { public: BtreeSinglePutRequest(const BtreeKey* k, const 
BtreeValue* v, btree_put_type put_type, - BtreeValue* existing_val = nullptr) : - m_k{k}, m_v{v}, m_put_type{put_type}, m_existing_val{existing_val} {} + BtreeValue* existing_val = nullptr, put_filter_cb_t filter_cb = nullptr) : + m_k{k}, m_v{v}, m_put_type{put_type}, m_existing_val{existing_val}, m_filter_cb{std::move(filter_cb)} {} const BtreeKey& key() const { return *m_k; } const BtreeValue& value() const { return *m_v; } @@ -100,19 +96,23 @@ struct BtreeSinglePutRequest : public BtreeRequest { const BtreeValue* m_v; const btree_put_type m_put_type; BtreeValue* m_existing_val; + put_filter_cb_t m_filter_cb; }; template < typename K > struct BtreeRangePutRequest : public BtreeRangeRequest< K > { public: BtreeRangePutRequest(BtreeKeyRange< K >&& inp_range, btree_put_type put_type, const BtreeValue* value, - void* app_context = nullptr, uint32_t batch_size = std::numeric_limits< uint32_t >::max()) : - BtreeRangeRequest< K >(std::move(inp_range), false, app_context, batch_size), + void* app_context = nullptr, uint32_t batch_size = std::numeric_limits< uint32_t >::max(), + put_filter_cb_t filter_cb = nullptr) : + BtreeRangeRequest< K >(std::move(inp_range), app_context, batch_size), m_put_type{put_type}, - m_newval{value} {} + m_newval{value}, + m_filter_cb{std::move(filter_cb)} {} - const btree_put_type m_put_type{btree_put_type::REPLACE_ONLY_IF_EXISTS}; + const btree_put_type m_put_type{btree_put_type::UPDATE}; const BtreeValue* m_newval; + put_filter_cb_t m_filter_cb; }; /////////////////////////// 2: Remove Operations ///////////////////////////////////// @@ -138,12 +138,18 @@ struct BtreeRemoveAnyRequest : public BtreeRequest { BtreeValue* m_outval; }; +using remove_filter_cb_t = std::function< bool(BtreeKey const&, BtreeValue const&) >; + template < typename K > struct BtreeRangeRemoveRequest : public BtreeRangeRequest< K > { +public: + remove_filter_cb_t m_filter_cb; + public: BtreeRangeRemoveRequest(BtreeKeyRange< K >&& inp_range, void* app_context = nullptr, - uint32_t batch_size = std::numeric_limits< uint32_t >::max()) : - BtreeRangeRequest< K >(std::move(inp_range), false, app_context, batch_size) {} + uint32_t batch_size = std::numeric_limits< uint32_t >::max(), + remove_filter_cb_t filter_cb = nullptr) : + BtreeRangeRequest< K >(std::move(inp_range), app_context, batch_size), m_filter_cb{std::move(filter_cb)} {} }; /////////////////////////// 3: Get Operations ///////////////////////////////////// @@ -191,21 +197,28 @@ ENUM(BtreeQueryType, uint8_t, // essentially create a serializable level of isolation. 
SERIALIZABLE_QUERY) +using get_filter_cb_t = std::function< bool(BtreeKey const&, BtreeValue const&) >; + template < typename K > struct BtreeQueryRequest : public BtreeRangeRequest< K > { public: BtreeQueryRequest(BtreeKeyRange< K >&& inp_range, BtreeQueryType query_type = BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, - uint32_t batch_size = UINT32_MAX, void* app_context = nullptr) : - BtreeRangeRequest< K >{std::move(inp_range), true, app_context, batch_size}, m_query_type{query_type} {} + uint32_t batch_size = UINT32_MAX, get_filter_cb_t filter_cb = nullptr, + void* app_context = nullptr) : + BtreeRangeRequest< K >{std::move(inp_range), app_context, batch_size}, + m_query_type{query_type}, + m_filter_cb{std::move(filter_cb)} {} ~BtreeQueryRequest() = default; // virtual bool is_serializable() const = 0; BtreeQueryType query_type() const { return m_query_type; } + get_filter_cb_t const& filter() const { return m_filter_cb; } + protected: - const BtreeQueryType m_query_type; // Type of the query - const std::unique_ptr< BtreeQueryCursor< K > > m_paginated_query; // Is it a paginated query + const BtreeQueryType m_query_type; // Type of the query + get_filter_cb_t m_filter_cb; }; /* This class is a top level class to keep track of the locks that are held currently. It is diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp index bc03db7ba..44035d238 100644 --- a/src/include/homestore/btree/detail/btree_common.ipp +++ b/src/include/homestore/btree/detail/btree_common.ipp @@ -18,6 +18,8 @@ namespace homestore { +#define to_variant_node(n) boost::static_pointer_cast< VariantNode< K, V > >(n) + template < typename K, typename V > btree_status_t Btree< K, V >::post_order_traversal(locktype_t ltype, const auto& cb) { BtreeNodePtr root; @@ -258,37 +260,6 @@ done: BT_LOG(INFO, "Node: <{}>", buf); } -template < typename K, typename V > -bool Btree< K, V >::call_on_read_kv_cb(const BtreeNodePtr& node, uint32_t idx, const BtreeRequest& req) const { - if (m_on_read_cb) { - V v; - node->get_nth_value(idx, &v, false); - return m_on_read_cb(node->get_nth_key< K >(idx, false), v, req); - } - return true; -} - -template < typename K, typename V > -bool Btree< K, V >::call_on_remove_kv_cb(const BtreeNodePtr& node, uint32_t idx, const BtreeRequest& req) const { - if (m_on_remove_cb) { - V v; - node->get_nth_value(idx, &v, false); - return m_on_remove_cb(node->get_nth_key< K >(idx, false), v, req); - } - return true; -} - -template < typename K, typename V > -bool Btree< K, V >::call_on_update_kv_cb(const BtreeNodePtr& node, uint32_t idx, const BtreeKey& new_key, - const BtreeRequest& req) const { - if (m_on_update_cb) { - V v; - node->get_nth_value(idx, &v, false); - return m_on_update_cb(node->get_nth_key< K >(idx, false), new_key, v, req); - } - return true; -} - template < typename K, typename V > void Btree< K, V >::append_route_trace(BtreeRequest& req, const BtreeNodePtr& node, btree_event_t event, uint32_t start_idx, uint32_t end_idx) const { diff --git a/src/include/homestore/btree/detail/btree_get_impl.ipp b/src/include/homestore/btree/detail/btree_get_impl.ipp index f2081e22f..4f0c09732 100644 --- a/src/include/homestore/btree/detail/btree_get_impl.ipp +++ b/src/include/homestore/btree/detail/btree_get_impl.ipp @@ -26,11 +26,10 @@ btree_status_t Btree< K, V >::do_get(const BtreeNodePtr& my_node, ReqT& greq) co if (my_node->is_leaf()) { if constexpr (std::is_same_v< BtreeGetAnyRequest< K >, ReqT >) { - std::tie(found, idx) = 
my_node->get_any(greq.m_range, greq.m_outkey, greq.m_outval, true, true); - if (found) { call_on_read_kv_cb(my_node, idx, greq); } + std::tie(found, idx) = + to_variant_node(my_node)->get_any(greq.m_range, greq.m_outkey, greq.m_outval, true, true); } else if constexpr (std::is_same_v< BtreeSingleGetRequest, ReqT >) { std::tie(found, idx) = my_node->find(greq.key(), greq.m_outval, true); - if (found) { call_on_read_kv_cb(my_node, idx, greq); } } if (!found) { ret = btree_status_t::not_found; diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 53ebc34e5..cec11deaf 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -201,9 +201,8 @@ VENUM(btree_node_type, uint32_t, FIXED = 0, VAR_VALUE = 1, VAR_KEY = 2, VAR_OBJE VENUM(btree_store_type, uint8_t, MEM = 0, SSD = 1) #endif -ENUM(btree_status_t, uint32_t, success, not_found, retry, has_more, read_failed, write_failed, stale_buf, - refresh_failed, put_failed, space_not_avail, split_failed, insert_failed, cp_mismatch, merge_not_required, - merge_failed, replay_not_needed, fast_path_not_possible, resource_full, crc_mismatch, not_supported, node_freed) +ENUM(btree_status_t, uint32_t, success, not_found, retry, has_more, node_read_failed, put_failed, space_not_avail, + cp_mismatch, merge_not_required, merge_failed, crc_mismatch, not_supported, node_freed) /*ENUM(btree_node_write_type, uint8_t, new_node, // Node write whenever a new node is created. @@ -300,7 +299,6 @@ class BtreeMetrics : public sisl::MetricsGroup { REGISTER_COUNTER(btree_int_node_count, "Btree Interior node count", "btree_node_count", {"node_type", "interior"}, _publish_as::publish_as_gauge); REGISTER_COUNTER(btree_split_count, "Total number of btree node splits"); - REGISTER_COUNTER(insert_failed_count, "Total number of inserts failed"); REGISTER_COUNTER(btree_merge_count, "Total number of btree node merges"); REGISTER_COUNTER(btree_depth, "Depth of btree", _publish_as::publish_as_gauge); @@ -316,7 +314,6 @@ class BtreeMetrics : public sisl::MetricsGroup { {"node_type", "leaf"}, HistogramBucketsType(LinearUpto128Buckets)); REGISTER_COUNTER(btree_retry_count, "number of retries"); REGISTER_COUNTER(write_err_cnt, "number of errors in write"); - REGISTER_COUNTER(split_failed, "split failed"); REGISTER_COUNTER(query_err_cnt, "number of errors in query"); REGISTER_COUNTER(read_node_count_in_write_ops, "number of nodes read in write_op"); REGISTER_COUNTER(read_node_count_in_query_ops, "number of nodes read in query_op"); diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 3eee5b75e..28db36591 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -52,10 +52,10 @@ retry: uint32_t curr_idx; if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) { - const auto count = my_node->template get_all< K, V >(req.next_range(), UINT32_MAX, start_idx, end_idx); - if (count == 0) { - BT_NODE_LOG_ASSERT(false, my_node, "get_all returns 0 entries for interior node is not valid pattern"); - ret = btree_status_t::retry; + const auto matched = my_node->match_range(req.working_range(), start_idx, end_idx); + if (!matched) { + BT_NODE_LOG_ASSERT(false, my_node, "match_range returns 0 entries for interior node is not valid pattern"); + ret = btree_status_t::put_failed; goto out; } } else 
if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { @@ -102,17 +102,7 @@ retry: // If the child and child_info link in the parent mismatch, we need to do btree repair, it might have // encountered a crash in-between the split or merge and only partial commit happened. - if (is_split_needed(child_node, m_bt_cfg, req) || is_repair_needed(child_node, child_info)) { - - // TODO remove the split_node retry logic and use key max size. - if (!my_node->can_accomodate(m_bt_cfg, K::get_estimate_max_size(), BtreeLinkInfo::get_fixed_size())) { - // Mark the parent_node itself to be split upon next retry. - bt_thread_vars()->force_split_node = my_node; - unlock_node(child_node, child_cur_lock); - ret = btree_status_t::retry; - goto out; - } - + if (is_split_needed(child_node, req) || is_repair_needed(child_node, child_info)) { ret = upgrade_node_locks(my_node, child_node, curlock, child_cur_lock, req.m_op_context); if (ret != btree_status_t::success) { BT_NODE_LOG(DEBUG, my_node, "Upgrade of node lock failed, retrying from root"); @@ -125,7 +115,6 @@ retry: if (is_repair_needed(child_node, child_info)) { BT_NODE_LOG(TRACE, child_node, "Node repair needed"); ret = repair_split(my_node, child_node, curr_idx, req.m_op_context); - } else { K split_key; BT_NODE_LOG(TRACE, my_node, "Split node needed"); @@ -148,11 +137,12 @@ retry: if (child_node->is_leaf()) { // We get the trimmed range only for leaf because this is where we will be inserting keys. In // interior nodes, keys are always propogated from the lower nodes. - bool is_inp_key_lesser = false; - K end_key = - my_node->min_of(s_cast< const K& >(req.input_range().end_key()), curr_idx, is_inp_key_lesser); - bool end_incl = is_inp_key_lesser ? req.input_range().is_end_inclusive() : true; - req.trim_working_range(std::move(end_key), end_incl); + if (curr_idx < my_node->total_entries()) { + K child_end_key = my_node->get_nth_key< K >(curr_idx, true); + if (child_end_key.compare(req.working_range().end_key()) < 0) { + req.trim_working_range(std::move(child_end_key), true /* inclusive child key */); + } + } BT_NODE_LOG(DEBUG, my_node, "Subrange:idx=[{}-{}],c={},working={}", start_idx, end_idx, curr_idx, req.working_range().to_string()); @@ -206,42 +196,17 @@ template < typename ReqT > btree_status_t Btree< K, V >::mutate_write_leaf_node(const BtreeNodePtr& my_node, ReqT& req) { btree_status_t ret = btree_status_t::success; if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) { - const BtreeKeyRange< K >& subrange = req.working_range(); - - if (subrange.start_key().is_extent_key()) { - ret = mutate_extents_in_leaf(my_node, req); - } else { - auto const [start_found, start_idx] = my_node->find(subrange.start_key(), nullptr, false); - auto const [end_found, end_idx] = my_node->find(subrange.end_key(), nullptr, false); - if (req.m_put_type != btree_put_type::REPLACE_ONLY_IF_EXISTS) { - BT_DBG_ASSERT(false, "For non-extent keys range-update should be really update and cannot insert"); - ret = btree_status_t::not_supported; - } else { - if (!end_found) { - if (end_idx == my_node->total_entries() || end_idx == start_idx) { - return btree_status_t::not_found; - } - K tail_key = my_node->get_nth_key< K >(end_idx, false); - if (tail_key.compare(subrange.end_key()) == 1) { return btree_status_t::not_found; } - } - if (!start_found && !end_found && end_idx >= start_idx) { return btree_status_t::not_found; } - if (end_idx < start_idx) { return btree_status_t::not_found; } - const auto new_val_size{(*req.m_newval).serialized_size()}; - V tmp_v; - 
for (auto idx{start_idx}; idx <= end_idx; ++idx) { - my_node->get_nth_value(idx, &tmp_v, false); - if (my_node->available_size(m_bt_cfg) + tmp_v.serialized_size() < new_val_size) { - req.set_cursor_key(my_node->get_nth_key< K >(idx, false)); - return btree_status_t::has_more; - } - my_node->update(idx, *req.m_newval); - } - } - // update cursor in intermediate search state - req.set_cursor_key(subrange.end_key()); + K last_failed_key; + ret = to_variant_node(my_node)->multi_put(req.working_range(), req.input_range().start_key(), *req.m_newval, + req.m_put_type, &last_failed_key, req.m_filter_cb); + if (ret == btree_status_t::has_more) { + req.shift_working_range(std::move(last_failed_key), true /* make it including last_failed_key */); + } else if (ret == btree_status_t::success) { + req.shift_working_range(); } } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { - if (!my_node->put(req.key(), req.value(), req.m_put_type, req.m_existing_val)) { + if (!to_variant_node(my_node)->put(req.key(), req.value(), req.m_put_type, req.m_existing_val, + req.m_filter_cb)) { ret = btree_status_t::put_failed; } COUNTER_INCREMENT(m_metrics, btree_obj_count, 1); @@ -254,169 +219,6 @@ btree_status_t Btree< K, V >::mutate_write_leaf_node(const BtreeNodePtr& my_node return ret; } -template < typename K, typename V > -btree_status_t Btree< K, V >::mutate_extents_in_leaf(const BtreeNodePtr& node, BtreeRangePutRequest< K >& rpreq) { - if constexpr (std::is_base_of_v< ExtentBtreeKey< K >, K > && std::is_base_of_v< ExtentBtreeValue< V >, V >) { - const BtreeKeyRange< K >& subrange = rpreq.current_sub_range(); - const auto& start_key = static_cast< const ExtentBtreeKey< K >& >(subrange.start_key()); - const auto& end_key = static_cast< ExtentBtreeKey< K >& >(subrange.end_key()); - ExtentBtreeValue< V >* new_value = static_cast< ExtentBtreeValue< V >* >(rpreq.m_newval.get()); - btree_status_t ret{btree_status_t::success}; - - BT_DBG_ASSERT_EQ(start_key.extent_length(), 1, "Search range start key can't be multiple extents"); - BT_DBG_ASSERT_EQ(end_key.extent_length(), 1, "Search range end key can't be multiple extents"); - - if (!can_extents_auto_merge()) { - BT_REL_ASSERT(false, "Yet to support non-auto merge range of extents in range put"); - return btree_status_t::not_supported; - } - - bool retry{false}; - auto const [start_found, start_idx] = node->find(start_key, nullptr, false); - do { - auto const [end_found, end_idx] = node->find(end_key, nullptr, false); - ExtentBtreeKey const new_k = start_key.combine(end_key); - auto idx = start_idx; - - { // Scope this to avoid head_k and tail_k are used beyond - K h_k, t_k; - V h_v, t_v; - int64_t head_offset{0}; - int64_t tail_offset{0}; - ExtentBtreeKey< K >& head_k = static_cast< ExtentBtreeKey< K >& >(h_k); - ExtentBtreeKey< K >& tail_k = static_cast< ExtentBtreeKey< K >& >(t_k); - ExtentBtreeValue< V >& head_v = static_cast< ExtentBtreeValue< V >& >(h_v); - ExtentBtreeValue< V >& tail_v = static_cast< ExtentBtreeValue< V >& >(t_v); - - // Get the residue head and tail key first if it is present, before updating any fields, otherwise - // updating fields will modify the other entry. 
- if (start_found) { - head_k = node->get_nth_key< K >(start_idx, false); - head_offset = head_k.distance_start(start_key); - BT_NODE_DBG_ASSERT_GE(head_offset, 0, node, "Invalid start_key or head_k"); - if (head_offset > 0) { node->get_nth_value(start_idx, &head_v, false); } - } - if (end_found) { - tail_k = node->get_nth_key< K >(end_idx, false); - tail_offset = end_key.distance_end(tail_k); - BT_NODE_DBG_ASSERT_GE(tail_offset, 0, node, "Invalid end_key or tail_k"); - if (tail_offset > 0) { node->get_nth_value(end_idx, &tail_v, false); } - } - - // Shortcut to simple update of the existing range, which is a normal case. Its a simple update only - // if the value we are replacing is all equal sized for every extent piece (which is normal use - // cases of the extents) - if (start_found && end_found && (head_offset == 0) && (tail_offset == 0) && (start_idx == end_idx) && - new_value->is_equal_sized()) { - call_on_update_kv_cb(node, start_idx, new_k, rpreq); - node->update(start_idx, new_k, new_value->shift(new_k.extent_length(), false)); - break; - } - - // Do size check, first check if we can accomodate the keys if checked conservatively. Thats most - // common case and thus efficient. Next we go aggressively, the more aggressive the check, more - // performance impact. - // - // First level check: Try assuming the entire value + 2 keys + 2 records to be inserted. If there is - // a space available, no need any additional check. - auto const record_size = (2 * (new_k.serialized_size() + node->get_record_size())); - auto size_needed = new_value->extracted_size(0, new_k.extent_length()) + record_size; - - auto const available_space = node->available_size(m_bt_cfg); - if (size_needed > available_space) { - BT_NODE_DBG_ASSERT_EQ(retry, false, node, "Don't expect multiple attempts of size not available"); - - // Second level check: Take into account the head and tail overlapped space and see if it saves - // some - if (head_offset > 0) { - size_needed -= (head_v.serialized_size() - head_v.extracted_size(0, head_offset)); - } - if (tail_offset > 0) { size_needed -= tail_v.extracted_size(0, tail_offset); } - - if (size_needed > available_space) { - // Third level check: Walk through every entry in the about to remove list and account for - // theirs - V tmp_v; - for (auto i = start_idx; i < end_idx; ++i) { - node->get_nth_value(i, &tmp_v, false); - size_needed -= - (node->get_nth_key< K >(i, false).serialized_size() + tmp_v.serialized_size()); - } - - // If still size is not enough, no other option other than trimming down the keys and retry - if (size_needed > available_space) { - auto const nextents = new_value->num_extents_fit(available_space - record_size); - end_key = new_k.extract(0, nextents, true); - retry = true; - ret = btree_status_t::has_more; - continue; - } - } - } - retry = false; - - // Write partial head and tail kv. At this point we are committing and we can't go back and not - // update some of the extents. 
- if (end_idx == start_idx) { - // Special case - where there is a overlap and single entry is split into 3 - auto const tail_start = tail_k.extent_length() - tail_offset; - if (m_on_remove_cb) { - m_on_remove_cb(head_k.extract(head_offset, tail_start - head_offset, false), - head_v.extract(head_offset, tail_start - head_offset, false), rpreq); - } - - if (tail_offset > 0) { - node->insert(end_idx + 1, tail_k.extract(tail_start, tail_offset, false), - tail_v.extract(tail_start, tail_offset, false)); - COUNTER_INCREMENT(m_metrics, btree_obj_count, 1); - } - - if (head_offset > 0) { - node->update(idx++, head_k.extract(0, head_offset, false), - head_v.extract(0, head_offset, false)); - } - } else { - if (tail_offset > 0) { - auto const tail_start = tail_k.extent_length() - tail_offset; - auto const shrunk_k = tail_k.extract(tail_start, tail_offset, false); - call_on_update_kv_cb(node, end_idx, shrunk_k, rpreq); - node->update(end_idx, shrunk_k, tail_v.extract(tail_start, tail_offset, false)); - } else if (end_found) { - ++end_idx; - } - - if (head_offset > 0) { - auto const shrunk_k = head_k.extract(0, -head_offset, false); - call_on_update_kv_cb(node, idx, shrunk_k, rpreq); - node->update(idx++, shrunk_k, head_v.extract(0, -head_offset, false)); - } - } - } - - // Remove everything in-between - if (idx < end_idx) { - if (m_on_remove_cb) { - for (auto i{idx}; i <= end_idx; ++i) { - call_on_remove_kv_cb(node, i, rpreq); - } - } - node->remove(idx, end_idx - 1); - COUNTER_DECREMENT(m_metrics, btree_obj_count, end_idx - idx); - } - - // Now we should have enough room to insert the combined entry - node->insert(idx, new_k, new_value->shift(new_k.extent_length())); - COUNTER_INCREMENT(m_metrics, btree_obj_count, 1); - } while (retry); - - rpreq.set_cursor_key(end_key); - return ret; - } else { - BT_REL_ASSERT(false, "Don't expect mutate_extents to be called on non-extent code path"); - return btree_status_t::not_supported; - } -} - template < typename K, typename V > template < typename ReqT > btree_status_t Btree< K, V >::check_split_root(ReqT& req) { @@ -430,7 +232,7 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) { ret = read_and_lock_node(m_root_node_info.bnode_id(), root, locktype_t::WRITE, locktype_t::WRITE, req.m_op_context); if (ret != btree_status_t::success) { goto done; } - if (!is_split_needed(root, m_bt_cfg, req) && !is_repair_needed(root, m_root_node_info)) { + if (!is_split_needed(root, req) && !is_repair_needed(root, m_root_node_info)) { unlock_node(root, locktype_t::WRITE); goto done; } @@ -482,7 +284,7 @@ done: template < typename K, typename V > btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const BtreeNodePtr& child_node, - uint32_t parent_ind, BtreeKey* out_split_key, void* context) { + uint32_t parent_ind, K* out_split_key, void* context) { BtreeNodePtr child_node1 = child_node; BtreeNodePtr child_node2; child_node2.reset(child_node1->is_leaf() ? 
alloc_leaf_node().get() : alloc_interior_node().get()); @@ -494,7 +296,7 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const child_node2->set_next_bnode(child_node1->next_bnode()); child_node1->set_next_bnode(child_node2->node_id()); child_node2->set_level(child_node1->level()); - uint32_t child1_filled_size = m_bt_cfg.node_data_size() - child_node1->available_size(m_bt_cfg); + uint32_t child1_filled_size = m_bt_cfg.node_data_size() - child_node1->available_size(); auto split_size = m_bt_cfg.split_size(child1_filled_size); uint32_t res = child_node1->move_out_to_right_by_size(m_bt_cfg, *child_node2, split_size); @@ -506,35 +308,13 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const // Insert the last entry in first child to parent node *out_split_key = child_node1->get_last_key< K >(); - // In an unlikely case where parent node has no room to accomodate the child key, we need to un-split and then - // free up the new node. This situation could happen on variable key, where the key max size is purely - // an estimation. This logic allows the max size to be declared more optimistically than say 1/4 of node - // which will have substantially large number of splits and performance constraints. - if (!parent_node->can_accomodate(m_bt_cfg, out_split_key->serialized_size(), BtreeLinkInfo::get_fixed_size())) { - uint32_t move_in_res = child_node1->copy_by_entries(m_bt_cfg, *child_node2, 0, child_node2->total_entries()); - BT_NODE_REL_ASSERT_EQ(move_in_res, res, child_node1, - "The split key size is more than estimated parent available space, but when revert is " - "attempted it fails. Continuing can cause data loss, so crashing"); - free_node(child_node2, locktype_t::NONE, context); - - // Mark the parent_node itself to be split upon next retry. - bt_thread_vars()->force_split_node = parent_node; - return btree_status_t::retry; - } - BT_NODE_LOG(TRACE, parent_node, "Available space for split entry={}", parent_node->available_size(m_bt_cfg)); + BT_NODE_LOG(TRACE, parent_node, "Available space for split entry={}", parent_node->available_size()); -// child_node1->inc_link_version(); + // child_node1->inc_link_version(); // Update the existing parent node entry to point to second child ptr. 
parent_node->update(parent_ind, child_node2->link_info()); - - // If key is extent then we always insert the tail portion of the extent key in the parent node - if (out_split_key->is_extent_key()) { - parent_node->insert(parent_ind, ((ExtentBtreeKey< K >*)out_split_key)->extract_end(false), - child_node1->link_info()); - } else { - parent_node->insert(parent_ind, *out_split_key, child_node1->link_info()); - } + parent_node->insert(parent_ind, *out_split_key, child_node1->link_info()); BT_NODE_DBG_ASSERT_GT(child_node2->get_first_key< K >().compare(*out_split_key), 0, child_node2); BT_NODE_LOG(DEBUG, parent_node, "Split child_node={} with new_child_node={}, split_key={}", child_node1->node_id(), @@ -551,34 +331,16 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const template < typename K, typename V > template < typename ReqT > -bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, const BtreeConfig& cfg, ReqT& req) const { - if (bt_thread_vars()->force_split_node && (bt_thread_vars()->force_split_node == node)) { - bt_thread_vars()->force_split_node = nullptr; - return true; - } - - int64_t size_needed = 0; +bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const { if (!node->is_leaf()) { // if internal node, size is atmost one additional entry, size of K/V - size_needed = K::get_estimate_max_size() + BtreeLinkInfo::get_fixed_size() + node->get_record_size(); + return !node->has_room_for_put(btree_put_type::UPSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size()); } else if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) { - const BtreeKey& next_key = req.next_key(); - - if (next_key.is_extent_key()) { - // For extent keys we expect to write atleast first value in the req along with 2 possible keys - // in case of splitting existing key - auto val = static_cast< const ExtentBtreeValue< V >* >(req.m_newval); - size_needed = val->extracted_size(0, 1) + 2 * (next_key.serialized_size() + node->get_record_size()); - } else { - size_needed = req.m_newval->serialized_size(); - if (req.m_put_type != btree_put_type::REPLACE_ONLY_IF_EXISTS) { - size_needed += next_key.serialized_size() + node->get_record_size(); - } - } + return !node->has_room_for_put(req.m_put_type, req.first_key_size(), req.m_newval->serialized_size()); } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { - size_needed = req.key().serialized_size() + req.value().serialized_size() + node->get_record_size(); + return !node->has_room_for_put(req.m_put_type, req.key().serialized_size(), req.value().serialized_size()); + } else { + return false; } - int64_t alreadyFilledSize = cfg.node_data_size() - node->available_size(cfg); - return (alreadyFilledSize + size_needed >= cfg.ideal_fill_size()); } template < typename K, typename V > @@ -588,54 +350,4 @@ btree_status_t Btree< K, V >::repair_split(const BtreeNodePtr& parent_node, cons parent_node->insert(parent_split_idx, child_node1->get_last_key< K >(), child_node1->link_info()); return write_node(parent_node, context); } - -#if 0 -template < typename K, typename V > -int64_t Btree< K, V >::compute_single_put_needed_size(const V& current_val, const V& new_val) const { - return new_val.serialized_size() - current_val.serialized_size(); -} - -template < typename K, typename V > -int64_t Btree< K, V >::compute_range_put_needed_size(const std::vector< std::pair< K, V > >& existing_kvs, - const V& new_val) const { - return new_val.serialized_size() * existing_kvs.size(); -} - -template < typename 
K, typename V > -btree_status_t -Btree< K, V >::custom_kv_select_for_write(uint8_t node_version, const std::vector< std::pair< K, V > >& match_kv, - std::vector< std::pair< K, V > >& replace_kv, const BtreeKeyRange& range, - const BtreeRangePutRequest& rpreq) const { - for (const auto& [k, v] : match_kv) { - replace_kv.push_back(std::make_pair(k, (V&)rpreq.m_newval)); - } - return btree_status_t::success; -} -#endif - -#if 0 -template < typename K, typename V > -btree_status_t Btree< K, V >::get_start_and_end_idx(const BtreeNodePtr& node, BtreeMutateRequest& req, - int& start_idx, int& end_idx) { - btree_status_t ret = btree_status_t::success; - if (is_range_put_req(req)) { - /* just get start/end index from get_all. We don't release the parent lock until this - * key range is not inserted from start_idx to end_idx. - */ - node->template get_all< V >(to_range_put_req(req).next_range(), UINT32_MAX, (uint32_t&)start_idx, - (uint32_t&)end_idx); - } else { - auto [found, idx] = node->find(to_single_put_req(req).key(), nullptr, true); - ASSERT_IS_VALID_INTERIOR_CHILD_INDX(found, idx, node); - end_idx = start_idx = (int)idx; - } - - if (start_idx > end_idx) { - BT_NODE_LOG_ASSERT(false, node, "start ind {} greater than end ind {}", start_idx, end_idx); - ret = btree_status_t::retry; - } - return ret; -} -#endif - } // namespace homestore diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 05dc1a83e..5bbe0df7c 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -54,8 +54,7 @@ struct persistent_hdr_t { bnodeid_t node_id{empty_bnodeid}; bnodeid_t next_node{empty_bnodeid}; - uint32_t nentries : 27; - uint32_t node_type : 3; + uint32_t nentries : 30; uint32_t leaf : 1; uint32_t valid_node : 1; @@ -63,9 +62,11 @@ struct persistent_hdr_t { uint64_t link_version{0}; // Version of the link between its parent, updated if structure changes BtreeLinkInfo::bnode_link_info edge_info; // Edge entry information - uint16_t level; // Level of the node within the tree - uint16_t reserved1; - uint32_t reserved2; + uint16_t level; // Level of the node within the tree + uint8_t node_type; // Type of the node (simple vs varlen etc..) 
+ uint8_t reserved1;
+ uint16_t node_size;
+ uint16_t reserved2;
 persistent_hdr_t() : nentries{0}, leaf{0}, valid_node{1} {}
 std::string to_string() const {
@@ -78,7 +79,7 @@ struct persistent_hdr_t {
 #pragma pack()
 class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
- typedef std::pair< bool, uint32_t > node_find_result_t;
+ using node_find_result_t = std::pair< bool, uint32_t >;
 public:
 sisl::atomic_counter< int32_t > m_refcount{0};
@@ -86,12 +87,13 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
 uint8_t* m_phys_node_buf;
 public:
- ~BtreeNode() = default;
- BtreeNode(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf) : m_phys_node_buf{node_buf} {
+ BtreeNode(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, BtreeConfig const& cfg) :
+ m_phys_node_buf{node_buf} {
 if (init_buf) {
 new (node_buf) persistent_hdr_t{};
 set_node_id(id);
 set_leaf(is_leaf);
+ set_node_size(cfg.node_size());
 } else {
 DEBUG_ASSERT_EQ(node_id(), id);
 DEBUG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC);
@@ -99,11 +101,26 @@
 }
 m_trans_hdr.is_leaf_node = is_leaf;
 }
+ virtual ~BtreeNode() = default;
 // Identify if a node is a leaf node or not, from raw buffer, by just reading persistent_hdr_t
 static bool identify_leaf_node(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->leaf; }
- node_find_result_t find(const BtreeKey& key, BtreeValue* outval, bool copy_val) const {
+ /// @brief Finds the index of the entry with the specified key in the node.
+ ///
+ /// This method performs a binary search on the node to find the index of the entry with the specified key.
+ /// If the key is not found in the node, the method returns the index of the first entry greater than the key.
+ ///
+ /// @param key The key to search for.
+ /// @param outval [optional] A pointer to a BtreeValue object to store the value associated with the key.
+ /// @param copy_val If outval is non-null, indicates whether the value deserialized from the node needs to be a
+ /// copy of the btree internal buffer. The safest option is true; it is ok to pass false if the value is
+ /// accessed and used before any subsequent node modification.
+ /// @return A pair of values representing the result of the search.
+ /// The first value is a boolean indicating whether the key was found in the node.
+ /// The second value is an integer representing the index of the entry with the specified key or the index
+ /// of the first entry greater than the key.
+ node_find_result_t find(BtreeKey const& key, BtreeValue* outval, bool copy_val) const {
 LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}",
 get_persistent_header_const()->to_string());
@@ -120,134 +137,42 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
 return std::make_pair(found, idx);
 }
- template < typename K, typename V >
- uint32_t get_all(const BtreeKeyRange< K >& range, uint32_t max_count, uint32_t& start_idx, uint32_t& end_idx,
- std::vector< std::pair< K, V > >* out_values = nullptr) const {
- LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}",
- get_persistent_header_const()->to_string());
- auto count = 0U;
- bool sfound, efound;
- // Get the start index of the search range.
- std::tie(sfound, start_idx) = bsearch_node(range.start_key()); - if (sfound && !range.is_start_inclusive()) { - ++start_idx; - sfound = false; - } - if (start_idx == total_entries()) { - end_idx = start_idx; - if (is_leaf() || !has_valid_edge()) { - return 0; // No result found - } else { - goto out; - } - } - - std::tie(efound, end_idx) = bsearch_node(range.end_key()); - if (efound && !range.is_end_inclusive()) { - if (end_idx == 0) { return 0; } - --end_idx; - efound = false; - } - - // If we point to same start and end without any match, it is hitting unavailable range - if ((start_idx == end_idx) && is_leaf() && !sfound && !efound) { return 0; } - - if (end_idx == total_entries()) { - DEBUG_ASSERT_GT(end_idx, 0); // At this point end_idx should never have been zero - if (!has_valid_edge()) { --end_idx; } - } - - out: - count = std::min(end_idx - start_idx + 1, max_count); - if (out_values) { - /* get the keys and values */ - for (auto i{start_idx}; i < (start_idx + count); ++i) { - add_nth_obj_to_list< K, V >(i, out_values, true); - } - } - return count; - } - template < typename K > - std::pair< bool, uint32_t > get_any(const BtreeKeyRange< K >& range, BtreeKey* out_key, BtreeValue* out_val, - bool copy_key, bool copy_val) const { + bool match_range(BtreeKeyRange< K > const& range, uint32_t& start_idx, uint32_t& end_idx) const { LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}", get_persistent_header_const()->to_string()); - uint32_t result_idx; - const auto mm_opt = range.multi_option(); - bool efound; - uint32_t end_idx; + bool sfound, efound; // Get the start index of the search range. - auto [sfound, start_idx] = bsearch_node(range.start_key()); + std::tie(sfound, start_idx) = this->bsearch_node(range.start_key()); if (sfound && !range.is_start_inclusive()) { ++start_idx; sfound = false; } - if (sfound && ((mm_opt == MultiMatchOption::DO_NOT_CARE) || (mm_opt == MultiMatchOption::LEFT_MOST))) { - result_idx = start_idx; - goto found_result; - } else if (start_idx == total_entries()) { - DEBUG_ASSERT(is_leaf() || has_valid_edge(), "Invalid node"); - return std::make_pair(false, 0); // out_of_range - } - - std::tie(efound, end_idx) = bsearch_node(range.end_key()); - if (efound && !range.is_end_inclusive()) { - if (end_idx == 0) { return std::make_pair(false, 0); } - --end_idx; - efound = false; + if (start_idx == this->total_entries()) { + // We are already at the end of search, we should return this as the only entry + end_idx = start_idx; + return (!is_leaf() && this->has_valid_edge()); // No result found unless its a edge node } - if (end_idx > start_idx) { - if (mm_opt == MultiMatchOption::RIGHT_MOST) { - result_idx = end_idx; - } else if (mm_opt == MultiMatchOption::MID) { - result_idx = (end_idx - start_idx) / 2; - } else { - result_idx = start_idx; + // Get the end index of the search range. + std::tie(efound, end_idx) = this->bsearch_node(range.end_key()); + if (is_leaf() || ((end_idx == this->total_entries()) && !has_valid_edge())) { + // Binary search will always return the index as the first key that is >= given key (end_key in this + // case). Our goal here in leaf node is to find the last key that is less than in case of non_inclusive + // search or less than or equal in case of inclusive search. + if (!efound || !range.is_end_inclusive()) { + // If we are already on the first key, then obviously nothing has been matched. 
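+ // (Example: with keys {10, 20, 30} and end_key=20 exclusive, bsearch returns end_idx=1 with efound=true;
+ // we step back one slot so the matched range ends at idx=0, i.e. key 10.)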
+ if (end_idx == 0) { return false; } + --end_idx; } - } else if ((start_idx == end_idx) && ((sfound || efound))) { - result_idx = start_idx; - } else { - return std::make_pair(false, 0); - } - - found_result: - if (out_key) { get_nth_key_internal(result_idx, *out_key, copy_key); } - if (out_val) { get_nth_value(result_idx, out_val, copy_val); } - return std::make_pair(true, result_idx); - } - - bool put(const BtreeKey& key, const BtreeValue& val, btree_put_type put_type, BtreeValue* existing_val) { - LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}", - get_persistent_header_const()->to_string()); - bool ret = true; - - const auto [found, idx] = find(key, nullptr, false); - if (found && existing_val) { get_nth_value(idx, existing_val, true); } - if (put_type == btree_put_type::INSERT_ONLY_IF_NOT_EXISTS) { - if (found) { - LOGDEBUG("Attempt to insert duplicate entry {}", key.to_string()); - return false; - } - ret = (insert(idx, key, val) == btree_status_t::success); - } else if (put_type == btree_put_type::REPLACE_ONLY_IF_EXISTS) { - if (!found) return false; - update(idx, key, val); - } else if (put_type == btree_put_type::REPLACE_IF_EXISTS_ELSE_INSERT) { - (found) ? update(idx, key, val) : (void)insert(idx, key, val); - } else if (put_type == btree_put_type::APPEND_ONLY_IF_EXISTS) { - if (!found) return false; - append(idx, key, val); - } else if (put_type == btree_put_type::APPEND_IF_EXISTS_ELSE_INSERT) { - (found) ? append(idx, key, val) : (void)insert(idx, key, val); - } else { - DEBUG_ASSERT(false, "Wrong put_type {}", put_type); + // If we point to same start and end without any match, it is hitting unavailable range + if (start_idx > end_idx) { return false; } } - return ret; + + return true; } virtual btree_status_t insert(const BtreeKey& key, const BtreeValue& val) { @@ -314,59 +239,6 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } } - template < typename K > - K min_of(const K& cmp_key, uint32_t cmp_ind, bool& is_cmp_key_lesser) const { - K min_key; - int x{-1}; - is_cmp_key_lesser = false; - - if (cmp_ind < total_entries()) { - get_nth_key_internal(cmp_ind, min_key, false); - x = cmp_key.compare(min_key); - } - - if (x < 0) { - min_key = cmp_key; - is_cmp_key_lesser = true; - } - return min_key; - } - - /*BtreeKeyRange get_subrange(const BtreeKeyRange< K >& inp_range, int upto_ind) const { - #ifndef NDEBUG - if (upto_ind > 0) { - // start of input range should always be more then the key in curr_ind - 1 - DEBUG_ASSERT_LE(get_nth_key< K >(upto_ind - 1, false).compare(inp_range.start_key()), 0, "[node={}]", - to_string()); - } - #endif - - // find end of subrange - bool end_inc = true; - K end_key; - - if (upto_ind < int_cast(total_entries())) { - end_key = get_nth_key< K >(upto_ind, false); - if (end_key.compare(inp_range.end_key()) >= 0) { - // this is last index to process as end of range is smaller then key in this node - end_key = inp_range.end_key(); - end_inc = inp_range.is_end_inclusive(); - } else { - end_inc = true; - } - } else { - // it is the edge node. 
end key is the end of input range - LOGMSG_ASSERT_EQ(has_valid_edge(), true, "node={}", to_string()); - end_key = inp_range.end_key(); - end_inc = inp_range.is_end_inclusive(); - } - - BtreeKeyRangeSafe< K > subrange{inp_range.start_key(), inp_range.is_start_inclusive(), end_key, end_inc}; - RELEASE_ASSERT_LE(subrange.start_key().compare(subrange.end_key()), 0, "[node={}]", to_string()); - RELEASE_ASSERT_LE(subrange.start_key().compare(inp_range.end_key()), 0, "[node={}]", to_string()); - return subrange; - } */ - template < typename K > K get_nth_key(uint32_t idx, bool copy) const { K k; @@ -442,29 +314,17 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { void lock_acknowledge() { m_trans_hdr.upgraders.decrement(1); } bool any_upgrade_waiters() const { return (!m_trans_hdr.upgraders.testz()); } - bool can_accomodate(const BtreeConfig& cfg, uint32_t key_size, uint32_t value_size) const { - return ((key_size + value_size + get_record_size()) <= available_size(cfg)); - } - - template < typename K, typename V > - void add_nth_obj_to_list(uint32_t ind, std::vector< std::pair< K, V > >* vec, bool copy) const { - std::pair< K, V > kv; - vec->emplace_back(kv); - - auto* pkv = &vec->back(); - if (ind == total_entries() && !is_leaf()) { - pkv->second = edge_value_internal< V >(); - } else { - get_nth_key_internal(ind, pkv->first, copy); - get_nth_value(ind, &pkv->second, copy); - } - } - public: // Public method which needs to be implemented by variants + virtual btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0; + virtual void remove(uint32_t ind) { remove(ind, ind); } + virtual void remove(uint32_t ind_s, uint32_t ind_e) = 0; + virtual void remove_all(const BtreeConfig& cfg) = 0; + virtual void update(uint32_t ind, const BtreeValue& val) = 0; + virtual void update(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0; + virtual uint32_t move_out_to_right_by_entries(const BtreeConfig& cfg, BtreeNode& other_node, uint32_t nentries) = 0; virtual uint32_t move_out_to_right_by_size(const BtreeConfig& cfg, BtreeNode& other_node, uint32_t size) = 0; - virtual uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const = 0; virtual uint32_t copy_by_size(const BtreeConfig& cfg, const BtreeNode& other_node, uint32_t start_idx, uint32_t size) = 0; virtual uint32_t copy_by_entries(const BtreeConfig& cfg, const BtreeNode& other_node, uint32_t start_idx, @@ -472,23 +332,17 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { /*virtual uint32_t move_in_from_right_by_entries(const BtreeConfig& cfg, BtreeNode& other_node, uint32_t nentries) = 0; virtual uint32_t move_in_from_right_by_size(const BtreeConfig& cfg, BtreeNode& other_node, uint32_t size) = 0;*/ - virtual uint32_t available_size(const BtreeConfig& cfg) const = 0; - virtual std::string to_string(bool print_friendly = false) const = 0; - virtual std::string to_string_keys(bool print_friendly = false) const = 0; - virtual void get_nth_value(uint32_t ind, BtreeValue* out_val, bool copy) const = 0; - virtual void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copykey) const = 0; - virtual btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0; - virtual void remove(uint32_t ind) { remove(ind, ind); } - virtual void remove(uint32_t ind_s, uint32_t ind_e) = 0; - virtual void remove_all(const BtreeConfig& cfg) = 0; - virtual void update(uint32_t ind, const BtreeValue& val) = 0; - virtual void update(uint32_t ind, const BtreeKey& key, const 
BtreeValue& val) = 0; - virtual void append(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0; + virtual uint32_t available_size() const = 0; + virtual bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const = 0; + virtual uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const = 0; - virtual uint32_t get_nth_obj_size(uint32_t ind) const = 0; - virtual uint16_t get_record_size() const = 0; virtual int compare_nth_key(const BtreeKey& cmp_key, uint32_t ind) const = 0; + virtual void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copykey) const = 0; + virtual uint32_t get_nth_key_size(uint32_t ind) const = 0; + virtual void get_nth_value(uint32_t ind, BtreeValue* out_val, bool copy) const = 0; + virtual uint32_t get_nth_value_size(uint32_t ind) const = 0; + virtual uint32_t get_nth_obj_size(uint32_t ind) const { return get_nth_key_size(ind) + get_nth_value_size(ind); } virtual uint8_t* get_node_context() = 0; // Method just to please compiler @@ -497,7 +351,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { return V{edge_id()}; } -private: + virtual std::string to_string(bool print_friendly = false) const = 0; + virtual std::string to_string_keys(bool print_friendly = false) const = 0; + +protected: node_find_result_t bsearch_node(const BtreeKey& key) const { DEBUG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC); auto [found, idx] = bsearch(-1, total_entries(), key); @@ -575,7 +432,11 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { void set_leaf(bool leaf) { get_persistent_header()->leaf = leaf; } void set_node_type(btree_node_type t) { get_persistent_header()->node_type = uint32_cast(t); } + void set_node_size(uint32_t size) { get_persistent_header()->node_size = s_cast< uint16_t >(size - 1); } uint64_t node_gen() const { return get_persistent_header_const()->node_gen; } + uint32_t node_size() const { return s_cast< uint32_t >(get_persistent_header_const()->node_size) + 1; } + uint32_t node_data_size() const { return node_size() - sizeof(persistent_hdr_t); } + void inc_gen() { get_persistent_header()->node_gen++; } void set_gen(uint64_t g) { get_persistent_header()->node_gen = g; } uint64_t link_version() const { return get_persistent_header_const()->link_version; } @@ -587,21 +448,19 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { BtreeLinkInfo link_info() const { return BtreeLinkInfo{node_id(), link_version()}; } - virtual uint32_t occupied_size(const BtreeConfig& cfg) const { - return (cfg.node_data_size() - available_size(cfg)); - } + virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); } bool is_merge_needed(const BtreeConfig& cfg) const { #if 0 #ifdef _PRERELEASE - if (iomgr_flip::instance()->test_flip("btree_merge_node") && occupied_size(cfg) < node_area_size(cfg)) { + if (iomgr_flip::instance()->test_flip("btree_merge_node") && occupied_size() < node_data_size) { return true; } auto ret = iomgr_flip::instance()->get_test_flip< uint64_t >("btree_merge_node_pct"); - if (ret && occupied_size(cfg) < (ret.get() * node_area_size(cfg) / 100)) { return true; } + if (ret && occupied_size() < (ret.get() * node_data_size() / 100)) { return true; } #endif #endif - return (occupied_size(cfg) < cfg.suggested_min_size()); + return (occupied_size() < cfg.suggested_min_size()); } bnodeid_t next_bnode() const { return get_persistent_header_const()->next_node; } diff --git a/src/include/homestore/btree/detail/btree_node_mgr.ipp 
b/src/include/homestore/btree/detail/btree_node_mgr.ipp index cfd06165e..f1476574c 100644 --- a/src/include/homestore/btree/detail/btree_node_mgr.ipp +++ b/src/include/homestore/btree/detail/btree_node_mgr.ipp @@ -18,6 +18,7 @@ #include #include #include +#include #include // #include @@ -52,7 +53,7 @@ btree_status_t Btree< K, V >::read_and_lock_node(bnodeid_t id, BtreeNodePtr& nod locktype_t leaf_lock_type, void* context) const { auto ret = read_node_impl(id, node_ptr); if (node_ptr == nullptr) { - if (ret != btree_status_t::fast_path_not_possible) { BT_LOG(ERROR, "read failed, reason: {}", ret); } + BT_LOG(ERROR, "read failed, reason: {}", ret); return ret; } @@ -88,7 +89,7 @@ template < typename K, typename V > btree_status_t Btree< K, V >::write_node(const BtreeNodePtr& node, void* context) { COUNTER_INCREMENT_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_writes, btree_int_node_writes, 1); HISTOGRAM_OBSERVE_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_occupancy, btree_int_node_occupancy, - ((m_node_size - node->available_size(m_bt_cfg)) * 100) / m_node_size); + ((m_node_size - node->available_size()) * 100) / m_node_size); return (write_node_impl(node, context)); } @@ -289,6 +290,13 @@ BtreeNode* Btree< K, V >::init_node(uint8_t* node_buf, uint32_t node_ctx_size, b this->m_bt_cfg); break; + case btree_node_type::PREFIX: + n = is_leaf + ? create_node< FixedPrefixNode< K, V > >(node_ctx_size, node_buf, id, init_buf, true, this->m_bt_cfg) + : create_node< FixedPrefixNode< K, BtreeLinkInfo > >(node_ctx_size, node_buf, id, init_buf, false, + this->m_bt_cfg); + break; + default: BT_REL_ASSERT(false, "Unsupported node type {}", node_type); break; diff --git a/src/include/homestore/btree/detail/btree_query_impl.ipp b/src/include/homestore/btree/detail/btree_query_impl.ipp index c151f6e5e..8d21c26b7 100644 --- a/src/include/homestore/btree/detail/btree_query_impl.ipp +++ b/src/include/homestore/btree/detail/btree_query_impl.ipp @@ -36,12 +36,8 @@ btree_status_t Btree< K, V >::do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRe uint32_t start_ind{0}; uint32_t end_ind{0}; - auto cur_count = - my_node->template get_all< K, V >(qreq.next_range(), qreq.batch_size() - count, start_ind, end_ind); - for (auto idx{start_ind}; idx < (start_ind + cur_count); ++idx) { - call_on_read_kv_cb(my_node, idx, qreq); - my_node->add_nth_obj_to_list(idx, &out_values, true); - } + auto cur_count = to_variant_node(my_node)->multi_get(qreq.working_range(), qreq.batch_size() - count, + start_ind, end_ind, &out_values, qreq.filter()); count += cur_count; if (qreq.route_tracing) { @@ -49,7 +45,7 @@ btree_status_t Btree< K, V >::do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRe } // If this is not the last entry found, then surely we have reached the end of search criteria - if ((end_ind + 1) < my_node->total_entries()) { break; } + // if ((end_ind + 1) < my_node->total_entries()) { break; } // Keep querying sibling nodes if (count < qreq.batch_size()) { @@ -71,7 +67,7 @@ btree_status_t Btree< K, V >::do_sweep_query(BtreeNodePtr& my_node, BtreeQueryRe } BtreeLinkInfo start_child_info; - [[maybe_unused]] const auto [isfound, idx] = my_node->find(qreq.next_key(), &start_child_info, false); + [[maybe_unused]] const auto [isfound, idx] = my_node->find(qreq.first_key(), &start_child_info, false); ASSERT_IS_VALID_INTERIOR_CHILD_INDX(isfound, idx, my_node); if (qreq.route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, idx, idx); } @@ -92,17 +88,11 @@ btree_status_t Btree< K, V 
>::do_traversal_query(const BtreeNodePtr& my_node, Bt if (my_node->is_leaf()) { BT_NODE_LOG_ASSERT_GT(qreq.batch_size(), 0, my_node); - uint32_t start_ind = 0, end_ind = 0; - auto cur_count = my_node->get_all(qreq.next_range(), qreq.batch_size() - (uint32_t)out_values.size(), start_ind, - end_ind, &out_values); - - if (cur_count) { - for (auto idx{start_ind}; idx < (start_ind + cur_count); ++idx) { - call_on_read_kv_cb(my_node, idx, qreq); - // my_node->add_nth_obj_to_list(idx, &out_values, true); - } - } - + uint32_t start_ind{0}; + uint32_t end_ind{0}; + auto cur_count = to_variant_node(my_node)->multi_get(qreq.working_range(), + qreq.batch_size() - uint32_cast(out_values.size()), + start_ind, end_ind, &out_values, qreq.filter()); if (qreq.route_tracing) { append_route_trace(qreq, my_node, btree_event_t::READ, start_ind, start_ind + cur_count); } @@ -114,7 +104,7 @@ btree_status_t Btree< K, V >::do_traversal_query(const BtreeNodePtr& my_node, Bt return ret; } - const auto [start_isfound, start_idx] = my_node->find(qreq.next_key(), nullptr, false); + const auto [start_isfound, start_idx] = my_node->find(qreq.first_key(), nullptr, false); auto [end_is_found, end_idx] = my_node->find(qreq.input_range().end_key(), nullptr, false); bool unlocked_already = false; @@ -241,9 +231,7 @@ btree_status_t do_serialzable_query(const BtreeNodePtr& my_node, BtreeSerializab return ret; } } -#endif -#ifdef SERIALIZABLE_QUERY_IMPLEMENTATION btree_status_t sweep_query(BtreeQueryRequest< K >& qreq, std::vector< std::pair< K, V > >& out_values) { COUNTER_INCREMENT(m_metrics, btree_read_ops_count, 1); qreq.init_batch_range(); diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 60b983562..d0df9909e 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -34,24 +34,9 @@ btree_status_t Btree< K, V >::do_remove(const BtreeNodePtr& my_node, locktype_t if constexpr (std::is_same_v< ReqT, BtreeSingleRemoveRequest >) { if ((modified = my_node->remove_one(req.key(), nullptr, req.m_outval))) { ++removed_count; } } else if constexpr (std::is_same_v< ReqT, BtreeRangeRemoveRequest< K > >) { - if (req.next_key().is_extent_key()) { - modified = remove_extents_in_leaf(my_node, req); - } else { - if (my_node->total_entries()) { - auto& subrange = req.working_range(); - auto const [start_found, start_idx] = my_node->find(subrange.start_key(), nullptr, false); - auto [end_found, end_idx] = my_node->find(subrange.end_key(), nullptr, false); - if (end_found) { ++end_idx; } - - removed_count = end_idx - start_idx; - for (uint32_t count = 0; count < removed_count; ++count) { - call_on_remove_kv_cb(my_node, start_idx, req); - // since start_idx is getting updated, always call remove_start_idx - my_node->remove(start_idx); - modified = true; - } - } - } + removed_count = to_variant_node(my_node)->multi_remove(req.working_range(), req.m_filter_cb); + modified = (removed_count != 0); + req.shift_working_range(); } else if constexpr (std::is_same_v< ReqT, BtreeRemoveAnyRequest< K > >) { if ((modified = my_node->remove_any(req.m_range, req.m_outkey, req.m_outval))) { ++removed_count; } } @@ -68,7 +53,7 @@ btree_status_t Btree< K, V >::do_remove(const BtreeNodePtr& my_node, locktype_t return modified ? 
btree_status_t::success : btree_status_t::not_found; } - bool go_to_out = false; + // bool go_to_out = false; retry: locktype_t child_cur_lock = locktype_t::NONE; uint32_t curr_idx; @@ -80,21 +65,22 @@ retry: auto const [found, idx] = my_node->find(req.key(), nullptr, false); ASSERT_IS_VALID_INTERIOR_CHILD_INDX(found, idx, my_node); end_idx = start_idx = idx; + if (false) { goto out_return; } // Please the compiler } else if constexpr (std::is_same_v< ReqT, BtreeRangeRemoveRequest< K > >) { - const auto count = my_node->template get_all< K, V >(req.next_range(), UINT32_MAX, start_idx, end_idx); - if (!count) { + auto const matched = my_node->match_range< K >(req.working_range(), start_idx, end_idx); + if (!matched) { ret = btree_status_t::not_found; - go_to_out = true; + goto out_return; } - // BT_NODE_REL_ASSERT_NE(count, 0, my_node, "get_all returns 0 entries for interior node is not valid - // pattern"); - } else if constexpr (std::is_same_v< ReqT, BtreeRemoveAnyRequest< K > >) { - const auto count = my_node->template get_all< V >(req.m_range, UINT32_MAX, start_idx, end_idx); - BT_NODE_REL_ASSERT_NE(count, 0, my_node, "get_all returns 0 entries for interior node is not valid pattern"); + auto const matched = my_node->match_range< K >(req.m_range, start_idx, end_idx); + if (!matched) { + ret = btree_status_t::not_found; + goto out_return; + } end_idx = start_idx = (end_idx - start_idx) / 2; // Pick the middle, TODO: Ideally we need to pick random } - if (go_to_out) { goto out_return; } + // if (go_to_out) { goto out_return; } if (req.route_tracing) { append_route_trace(req, my_node, btree_event_t::READ, start_idx, end_idx); } curr_idx = start_idx; @@ -155,14 +141,15 @@ retry: if (child_node->is_leaf()) { // We get the trimmed range only for leaf because this is where we will be removing keys. In interior // nodes, keys are always propogated from the lower nodes. - bool is_inp_key_lesser = false; - K end_key = - my_node->min_of(s_cast< const K& >(req.input_range().end_key()), curr_idx, is_inp_key_lesser); - bool end_incl = is_inp_key_lesser ? req.input_range().is_end_inclusive() : true; - req.trim_working_range(std::move(end_key), end_incl); - - BT_NODE_LOG(DEBUG, my_node, "Subrange:idx=[{}-{}],c={},working={}", start_idx, end_idx, curr_idx, - req.working_range().to_string()); + if (curr_idx < my_node->total_entries()) { + K child_end_key = my_node->get_nth_key< K >(curr_idx, true); + if (child_end_key.compare(req.working_range().end_key()) < 0) { + req.trim_working_range(std::move(child_end_key), true /* inclusive child key */); + } + + BT_NODE_LOG(DEBUG, my_node, "Subrange:idx=[{}-{}],c={},working={}", start_idx, end_idx, curr_idx, + req.working_range().to_string()); + } } } @@ -200,91 +187,6 @@ out_return: return (at_least_one_child_modified == btree_status_t::success) ? 
btree_status_t::success : ret; } -template < typename K, typename V > -bool Btree< K, V >::remove_extents_in_leaf(const BtreeNodePtr& node, BtreeRangeRemoveRequest< K >& rrreq) { - if constexpr (std::is_base_of_v< ExtentBtreeKey< K >, K > && std::is_base_of_v< ExtentBtreeValue< V >, V >) { - const BtreeKeyRange< K >& subrange = rrreq.working_range(); - const auto& start_key = static_cast< const ExtentBtreeKey< K >& >(subrange.start_key()); - const auto& end_key = static_cast< ExtentBtreeKey< K >& >(subrange.end_key()); - - auto const [start_found, start_idx] = node->find(start_key, nullptr, false); - auto const [end_found, end_idx] = node->find(end_key, nullptr, false); - - K h_k, t_k; - V h_v, t_v; - int64_t head_offset{0}; - int64_t tail_offset{0}; - ExtentBtreeKey< K >& head_k = static_cast< ExtentBtreeKey< K >& >(h_k); - ExtentBtreeKey< K >& tail_k = static_cast< ExtentBtreeKey< K >& >(t_k); - ExtentBtreeValue< V >& head_v = static_cast< ExtentBtreeValue< V >& >(h_v); - ExtentBtreeValue< V >& tail_v = static_cast< ExtentBtreeValue< V >& >(t_v); - - if (start_found) { - head_k = node->get_nth_key< K >(start_idx, false); - head_offset = head_k.distance_start(start_key); - BT_NODE_DBG_ASSERT_GE(head_offset, 0, node, "Invalid start_key or head_k"); - if (head_offset > 0) { node->get_nth_value(start_idx, &head_v, false); } - } - if (end_found) { - tail_k = node->get_nth_key< K >(end_idx, false); - tail_offset = end_key.distance_end(tail_k); - BT_NODE_DBG_ASSERT_GE(tail_offset, 0, node, "Invalid end_key or tail_k"); - if (tail_offset > 0) { node->get_nth_value(end_idx, &tail_v, false); } - } - - // Write partial head and tail kv. At this point we are committing and we can't go back and not update - // some of the extents. - auto idx = start_idx; - if (end_idx == start_idx) { - // Special case - where there is a overlap and single entry is split into 3 - auto const tail_start = tail_k.extent_length() - tail_offset; - if (m_on_remove_cb) { - m_on_remove_cb(head_k.extract(head_offset, tail_start - head_offset, false), - head_v.extract(head_offset, tail_start - head_offset, false), rrreq); - } - - if (tail_offset > 0) { - node->insert(end_idx + 1, tail_k.extract(tail_start, tail_offset, false), - tail_v.extract(tail_start, tail_offset, false)); - COUNTER_INCREMENT(m_metrics, btree_obj_count, 1); - } - - if (head_offset > 0) { - node->update(idx++, head_k.extract(0, head_offset, false), head_v.extract(0, head_offset, false)); - } - } else { - if (tail_offset > 0) { - auto const tail_start = tail_k.extent_length() - tail_offset; - auto const shrunk_k = tail_k.extract(tail_start, tail_offset, false); - call_on_update_kv_cb(node, end_idx, shrunk_k, rrreq); - node->update(end_idx, shrunk_k, tail_v.extract(tail_start, tail_offset, false)); - } else if (end_found) { - ++end_idx; - } - if (head_offset > 0) { - auto const shrunk_k = head_k.extract(0, -head_offset, false); - call_on_update_kv_cb(node, idx, shrunk_k, rrreq); - node->update(idx++, shrunk_k, head_v.extract(0, -head_offset, false)); - } - } - - // Remove everything in-between - if (idx < end_idx) { - if (m_on_remove_cb) { - for (auto i{idx}; i <= end_idx; ++i) { - call_on_remove_kv_cb(node, i, rrreq); - } - } - node->remove(idx, end_idx - 1); - COUNTER_DECREMENT(m_metrics, btree_obj_count, end_idx - idx); - } - return true; - } else { - BT_REL_ASSERT(false, "Don't expect remove_extents to be called on non-extent code path"); - return false; - } -} - template < typename K, typename V > template < typename ReqT > btree_status_t Btree< K, V 
>::check_collapse_root(ReqT& req) { @@ -351,7 +253,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const _leftmost_src_info leftmost_src; _src_cursor_info src_cursor; - total_size = leftmost_node->occupied_size(m_bt_cfg); + total_size = leftmost_node->occupied_size(); for (auto indx = start_idx + 1; indx <= end_idx; ++indx) { if (indx == parent_node->total_entries()) { BT_NODE_LOG_ASSERT(parent_node->has_valid_edge(), parent_node, @@ -367,7 +269,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const BT_NODE_LOG_ASSERT_EQ(child->is_valid_node(), true, child); old_nodes.push_back(child); - total_size += child->occupied_size(m_bt_cfg); + total_size += child->occupied_size(); } // Determine if packing the nodes would result in reducing the number of nodes, if so go with that. If else @@ -382,7 +284,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const } balanced_size = (total_size == 0) ? 0 : (total_size - 1) / num_nodes + 1; - if (leftmost_node->occupied_size(m_bt_cfg) > balanced_size) { + if (leftmost_node->occupied_size() > balanced_size) { // If for some reason balancing increases the current size, give up. // TODO: Is this a real case, isn't happening would mean some sort of bug in calculation of is_merge_needed? BT_NODE_DBG_ASSERT(false, leftmost_node, @@ -394,7 +296,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // First try to see how many entries you can fit in the leftmost node within the balanced size. We are checking // leftmost node as special case without moving, because that is the only node which is modified in-place and hence // doing a dry run and if for some reason there is a problem in balancing the nodes, then it is easy to give up. - available_size = static_cast< int32_t >(balanced_size) - leftmost_node->occupied_size(m_bt_cfg); + available_size = static_cast< int32_t >(balanced_size) - leftmost_node->occupied_size(); src_cursor.ith_node = old_nodes.size(); for (uint32_t i{0}; (i < old_nodes.size() && available_size >= 0); ++i) { leftmost_src.ith_nodes.push_back(i); @@ -402,7 +304,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // node contains one entry and the value size is much bigger than available size auto const nentries = old_nodes[i]->num_entries_by_size(0, available_size); if ((old_nodes[i]->total_entries() - nentries) == 0) { // Entire node goes in - available_size -= old_nodes[i]->occupied_size(m_bt_cfg); + available_size -= old_nodes[i]->occupied_size(); if (i >= old_nodes.size() - 1) { src_cursor.ith_node = i + 1; src_cursor.nth_entry = std::numeric_limits< uint32_t >::max(); @@ -438,7 +340,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // Copied entire node ++src_cursor.ith_node; src_cursor.nth_entry = 0; - available_size = balanced_size - new_node->occupied_size(m_bt_cfg); + available_size = balanced_size - new_node->occupied_size(); } else { src_cursor.nth_entry += nentries; available_size = 0; @@ -455,8 +357,7 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const // There is a case where we are rebalancing and the second node which rebalanced didn't move any size, in that case // the first node is going to be exactly same and we will do again merge, so bail out here. 
- if ((new_nodes.size() == old_nodes.size()) &&
- (old_nodes[0]->occupied_size(m_bt_cfg) >= new_nodes[0]->occupied_size(m_bt_cfg))) {
+ if ((new_nodes.size() == old_nodes.size()) && (old_nodes[0]->occupied_size() >= new_nodes[0]->occupied_size())) {
 ret = btree_status_t::merge_not_required;
 goto out;
 }
@@ -480,11 +381,11 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
 if (node->total_entries()) { post_merge_size -= node->get_nth_obj_size(node->total_entries() - 1); }
 }
- if (post_merge_size > parent_node->available_size(m_bt_cfg)) {
+ if (post_merge_size > parent_node->available_size()) {
 BT_NODE_LOG(DEBUG, parent_node,
 "Merge is needed, however after merge it will add {} bytes which is more than "
 "available_size={}, so not proceeding with merge",
- post_merge_size, parent_node->available_size(m_bt_cfg));
+ post_merge_size, parent_node->available_size());
 ret = btree_status_t::merge_not_required;
 goto out;
 }
diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp
new file mode 100644
index 000000000..62003da7a
--- /dev/null
+++ b/src/include/homestore/btree/detail/prefix_node.hpp
@@ -0,0 +1,828 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+
+#pragma once
+
+#include
+#include
+#include "btree_node.hpp"
+#include
+#include
+
+SISL_LOGGING_DECL(btree)
+
+namespace homestore {
+
+// Internal format of prefix node:
+// [Persistent Header][prefix_node_header][prefix_area_bitset][KV Suffix][KV Suffix].. ... ... [KV Prefix][KV Prefix]
+//
+template < typename K, typename V >
+class FixedPrefixNode : public VariantNode< K, V > {
+ using BtreeNode::get_nth_key_internal;
+ using BtreeNode::get_nth_key_size;
+ using BtreeNode::get_nth_obj_size;
+ using BtreeNode::get_nth_value;
+ using BtreeNode::get_nth_value_size;
+ using BtreeNode::to_string;
+ using VariantNode< K, V >::get_nth_value;
+
+private:
+#pragma pack(1)
+ struct prefix_node_header {
+ uint16_t used_slots; // Number of slots actually used.
TODO: We can deduce from set_bit_count of bitset + uint16_t tail_slot; // What is the tail slot number being used + + std::string to_string() const { return fmt::format("slots_used={} tail_slot={} ", used_slots, tail_slot); } + + static constexpr uint16_t min_holes_to_compact = 10; + // Followed by bitset + }; + + struct prefix_entry { + uint16_t ref_count{0}; + // Followed by common prefix key + // Followed by common prefix value + + static constexpr uint32_t size() { return key_size() + value_size() + sizeof(prefix_entry); } + + static constexpr uint32_t key_size() { + if constexpr (std::is_base_of_v< BtreeIntervalKey, K >) { + return dummy_key< K >.serialized_prefix_size(); + } else { + return 0u; // There is no prefix for non interval key + } + } + + static constexpr uint32_t value_size() { + if constexpr (std::is_base_of_v< BtreeIntervalValue, V >) { + return dummy_value< V >.serialized_prefix_size(); + } else { + return 0u; // There is no prefix for non interval value + } + } + + void write_kv(BtreeKey const& key, BtreeValue const& val) { + if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { + sisl::blob const kblob = s_cast< K const& >(key).serialize_prefix(); + sisl::blob const vblob = s_cast< V const& >(val).serialize_prefix(); + + DEBUG_ASSERT_EQ(kblob.size, key_size(), "Prefix key size mismatch with serialized prefix size"); + DEBUG_ASSERT_EQ(vblob.size, value_size(), "Prefix value size mismatch with serialized prefix size"); + + uint8_t* cur_ptr = uintptr_cast(this) + sizeof(prefix_entry); + std::memcpy(cur_ptr, kblob.bytes, kblob.size); + cur_ptr += kblob.size; + std::memcpy(cur_ptr, vblob.bytes, vblob.size); + } + } + + sisl::blob key_buf() const { + return sisl::blob{const_cast< uint8_t* >(r_cast< uint8_t const* >(this) + sizeof(prefix_entry)), + key_size()}; + } + sisl::blob val_buf() const { return sisl::blob{key_buf().bytes + key_buf().size, value_size()}; } + }; + + struct suffix_entry { + uint16_t prefix_slot; + // Followed by suffix key + // Followed by suffix value + + static constexpr uint32_t size() { return key_size() + value_size() + sizeof(suffix_entry); } + + static constexpr uint32_t key_size() { + if constexpr (std::is_base_of_v< BtreeIntervalKey, K >) { + return dummy_key< K >.serialized_suffix_size(); + } else { + return dummy_key< K >.serialized_size(); + } + } + + static constexpr uint32_t value_size() { + if constexpr (std::is_base_of_v< BtreeIntervalValue, V >) { + return dummy_value< V >.serialized_suffix_size(); + } else { + return dummy_value< V >.serialized_size(); + } + } + + void write_kv(BtreeKey const& key, BtreeValue const& val) { + sisl::blob kblob; + sisl::blob vblob; + + uint8_t* cur_ptr = uintptr_cast(this) + sizeof(suffix_entry); + if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) { + kblob = s_cast< K const& >(key).serialize_suffix(); + vblob = s_cast< V const& >(val).serialize_suffix(); + } else { + kblob = key.serialize(); + vblob = val.serialize(); + } + DEBUG_ASSERT_EQ(kblob.size, key_size(), "Suffix key size mismatch with serialized suffix size"); + DEBUG_ASSERT_EQ(vblob.size, value_size(), "Suffix value size mismatch with serialized suffix size"); + + std::memcpy(cur_ptr, kblob.bytes, kblob.size); + cur_ptr += kblob.size; + std::memcpy(cur_ptr, vblob.bytes, vblob.size); + } + + sisl::blob key_buf() const { + return sisl::blob{const_cast< uint8_t* >(r_cast< uint8_t const* >(this) + sizeof(suffix_entry)), + key_size()}; + } + 
sisl::blob val_buf() const { return sisl::blob{key_buf().bytes + key_buf().size, value_size()}; }
+ };
+#pragma pack()
+
+ sisl::CompactBitSet prefix_bitset_;
+
+public:
+ FixedPrefixNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) :
+ VariantNode< K, V >(node_buf, id, init, is_leaf, cfg),
+ prefix_bitset_{sisl::blob{bitset_area(), reqd_bitset_size(cfg)}, init} {
+ if (init) {
+ auto phdr = prefix_header();
+ phdr->used_slots = 0;
+ phdr->tail_slot = 0;
+ }
+ }
+
+ virtual ~FixedPrefixNode() = default;
+
+ ///////////////////////////// All overrides of BtreeIntervalNode ///////////////////////////////////
+ /// @brief Upserts a batch of entries into a prefix node.
+ ///
+ /// This method upserts all entries in the node that have keys within the specified range.
+ /// The method is supported only for leaf nodes.
+ ///
+ /// @param keys The range of keys to upsert.
+ /// @param first_input_key The first key of the original input range; each upserted value is shifted by the
+ /// distance between its key and this key.
+ /// @param val The value to upsert.
+ /// @param put_type The put variant (insert/update/upsert), controlling how existing entries are treated.
+ /// @param last_failed_key [optional] On a partial put, set to the key from which the caller should resume.
+ /// @param filter_cb The callback function to be called for each entry found within the range.
+ /// The function takes three arguments: the key, the existing value and the new value, and
+ /// returns a put_filter_decision value. If the function returns:
+ /// put_filter_decision::replace, the entry is upserted with the new value.
+ /// put_filter_decision::remove, the entry is removed from the node.
+ /// put_filter_decision::keep, the entry is not modified and the method moves on to the
+ /// next entry.
+ /// @return btree_status_t::success if all keys were upserted; btree_status_t::space_not_avail if there was no
+ /// room for even the first entry; btree_status_t::has_more if the node ran out of space part-way
+ /// (last_failed_key is set accordingly); btree_status_t::not_supported for non-interval key/value types.
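+ ///
+ /// A hypothetical illustration (not part of the API contract): a filter that only replaces entries whose
+ /// current value satisfies some caller-side predicate, giving compare-and-set-like behavior:
+ ///
+ /// put_filter_cb_t filter = [&](BtreeKey const& key, BtreeValue const& cur_val, BtreeValue const& new_val) {
+ /// return matches_expected(cur_val) ? put_filter_decision::replace : put_filter_decision::keep;
+ /// };
+ ///
+ /// where matches_expected() is a caller-supplied predicate, named here only for illustration.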
+ btree_status_t multi_put(BtreeKeyRange< K > const& keys, BtreeKey const& first_input_key, BtreeValue const& val,
+ btree_put_type put_type, K* last_failed_key,
+ put_filter_cb_t const& filter_cb = nullptr) override {
+ DEBUG_ASSERT_EQ(this->is_leaf(), true, "Multi put entries on node are supported only for leaf nodes");
+ if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) {
+ uint32_t modified{0};
+
+ uint16_t prefix_slot{std::numeric_limits< uint16_t >::max()};
+ K cur_key = keys.start_key();
+
+ if (!keys.is_start_inclusive()) { cur_key.shift(1); }
+ if (!has_room(1u)) { return btree_status_t::space_not_avail; }
+ bool upserted_all{false};
+
+ auto [found, idx] = this->find(cur_key, nullptr, false);
+ do {
+ auto x = cur_key.compare(keys.end_key());
+ if ((x > 0) || ((x == 0) && !keys.is_end_inclusive())) {
+ upserted_all = true;
+ break;
+ }
+
+ put_filter_decision decision{put_filter_decision::replace};
+ if (found) {
+ if (put_type == btree_put_type::INSERT) { // Insert operation should skip existing entries
+ decision = put_filter_decision::keep;
+ } else if (filter_cb) {
+ decision = filter_cb(cur_key, get_nth_value(idx, false), val);
+ if (decision == put_filter_decision::remove) {
+ ++modified;
+ remove(idx);
+ }
+ }
+
+ // We found the entry and it will be replaced in the next step; for now we need to deref the
+ // prefix corresponding to this suffix entry
+ if (decision == put_filter_decision::replace) {
+ deref_remove_prefix(get_suffix_entry_c(idx)->prefix_slot);
+ }
+ } else {
+ if (put_type == btree_put_type::UPDATE) { // Update would need existing entries found
+ decision = put_filter_decision::keep;
+ } else {
+ std::memmove(get_suffix_entry(idx + 1), get_suffix_entry(idx),
+ (this->total_entries() - idx) * suffix_entry::size());
+ this->inc_entries();
+ }
+ }
+
+ if (decision == put_filter_decision::replace) {
+ if (prefix_slot == std::numeric_limits< uint16_t >::max()) {
+ prefix_slot = add_prefix(cur_key, val);
+ }
+ V new_val{s_cast< V const& >(val)};
+ new_val.shift(s_cast< K const& >(cur_key).distance(first_input_key));
+ write_suffix(idx, prefix_slot, cur_key, new_val);
+ }
+
+ cur_key.shift(1);
+ if (!has_room(1u)) { break; }
+
+ if (decision != put_filter_decision::remove) { ++idx; }
+ found =
+ (idx < this->total_entries() && (BtreeNode::get_nth_key< K >(idx, false).compare(cur_key) == 0));
+ } while (true);
+
+ if (modified) { this->inc_gen(); }
+#ifndef NDEBUG
+ validate_sanity();
+#endif
+ if (!upserted_all) {
+ if (last_failed_key) { *last_failed_key = cur_key; }
+ return btree_status_t::has_more;
+ } else {
+ return btree_status_t::success;
+ }
+ } else {
+ return btree_status_t::not_supported;
+ }
+ }
+
+ /**
+ * @brief Removes a batch of entries from a prefix node.
+ *
+ * This method removes all entries in the node that have keys within the specified range.
+ * The method is supported only for leaf nodes.
+ *
+ * @param keys The range of keys to remove.
+ * @param filter_cb The callback function to be called for each entry found within the range. The function takes
+ * two arguments: a key and a value, and returns a boolean. If it returns true, or if there is no callback
+ * function, the entry is removed from the node. If it returns false, the entry is not removed and the method
+ * moves on to the next entry.
+ *
+ * @return Returns the number of entries removed
+ */
+ uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr) override {
+ DEBUG_ASSERT_EQ(this->is_leaf(), true, "remove_batch api is supported only for leaf node");
+ if constexpr (std::is_base_of_v< BtreeIntervalKey, K > && std::is_base_of_v< BtreeIntervalValue, V >) {
+ K cur_key = keys.start_key();
+ if (!keys.is_start_inclusive()) { cur_key.shift(1); }
+ uint32_t num_removed{0};
+
+ auto [_, idx] = this->find(cur_key, nullptr, false);
+ while (idx < this->total_entries()) {
+ cur_key = BtreeNode::get_nth_key< K >(idx, false);
+ auto x = cur_key.compare(keys.end_key());
+ if ((x > 0) || ((x == 0) && !keys.is_end_inclusive())) { break; }
+
+ bool remove{true};
+ if (!filter_cb || filter_cb(cur_key, get_nth_value(idx, false))) {
+ suffix_entry* sentry = get_suffix_entry(idx);
+ deref_remove_prefix(sentry->prefix_slot);
+ std::memmove(uintptr_cast(sentry), uintptr_cast(get_suffix_entry(idx + 1)),
+ (this->total_entries() - idx - 1) * suffix_entry::size());
+ this->dec_entries();
+ ++num_removed;
+ } else {
+ ++idx;
+ }
+ }
+ if (num_removed) { this->inc_gen(); }
+
+#ifndef NDEBUG
+ validate_sanity();
+#endif
+ return num_removed;
+ } else {
+ return 0;
+ }
+ }
+
+ ///////////////////////////// All overrides of BtreeNode ///////////////////////////////////
+ void get_nth_key_internal(uint32_t idx, BtreeKey& out_key, bool) const override {
+ suffix_entry const* sentry = get_suffix_entry_c(idx);
+ prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot);
+ DEBUG_ASSERT(prefix_bitset_.is_bit_set(sentry->prefix_slot),
+ "Prefix slot number is in suffix entry, but corresponding bit is not set");
+ s_cast< BtreeIntervalKey& >(out_key).deserialize(pentry->key_buf(), sentry->key_buf(), true);
+ }
+
+ void get_nth_value(uint32_t idx, BtreeValue* out_val, bool) const override {
+ if (idx == this->total_entries()) {
+ DEBUG_ASSERT_EQ(this->is_leaf(), false, "get_nth_value out-of-bound");
+ DEBUG_ASSERT_EQ(this->has_valid_edge(), true, "get_nth_value out-of-bound");
+ *(r_cast< BtreeLinkInfo* >(out_val)) = this->get_edge_value();
+ } else {
+ suffix_entry const* sentry = get_suffix_entry_c(idx);
+ prefix_entry const* pentry = get_prefix_entry_c(sentry->prefix_slot);
+ DEBUG_ASSERT(prefix_bitset_.is_bit_set(sentry->prefix_slot),
+ "Prefix slot number is in suffix entry, but corresponding bit is not set");
+ s_cast< BtreeIntervalValue* >(out_val)->deserialize(pentry->val_buf(), sentry->val_buf(), true);
+ }
+ }
+
+ uint32_t available_size() const override {
+ auto num_holes = num_prefix_holes();
+ if (num_holes > prefix_node_header::min_holes_to_compact) {
+ return available_size_without_compaction() + (num_holes * prefix_entry::size());
+ } else {
+ return available_size_without_compaction();
+ }
+ }
+
+ bool has_room_for_put(btree_put_type, uint32_t, uint32_t) const override { return has_room(1u); }
+
+ uint32_t get_nth_key_size(uint32_t) const override { return dummy_key< K >.serialized_size(); }
+
+ uint32_t get_nth_value_size(uint32_t) const override { return dummy_value< V >.serialized_size(); }
+
+ uint32_t move_out_to_right_by_size(const BtreeConfig& cfg, BtreeNode& on, uint32_t size_to_move) override {
+ return move_out_to_right_internal(cfg, on, true /* by_size*/, size_to_move);
+ }
+
+ uint32_t move_out_to_right_by_entries(const BtreeConfig& cfg, BtreeNode& on, uint32_t num_entries) override {
+ return move_out_to_right_internal(cfg, on, false /* by_size*/, num_entries);
+ }
+
+ uint32_t move_out_to_right_internal(const BtreeConfig& cfg, BtreeNode& on, bool by_size, uint32_t limit) {
+ FixedPrefixNode& dst_node = s_cast< FixedPrefixNode& >(on);
+
+ uint32_t dst_node_size = dst_node.occupied_size();
+ uint32_t num_moved{0};
+
+ // Nothing to move
+ if (this->total_entries() == 0) { return by_size ? 0 : dst_node_size; }
+
+ // Step 1: Walk from the last idx towards the first and map each source prefix slot to a new prefix slot in
+ // the destination (e.g. source slots {5, 9} may map to destination slots {0, 1}). This map is used both to
+ // translate slot numbers and to track whether a prefix slot is shared by multiple suffixes. At the end of
+ // this step, all prefixes that need to be moved have been moved, with the bitsets correctly updated on both
+ // source and destination.
+ std::map< uint16_t, uint16_t > this_to_dst_prefix;
+ uint16_t idx = this->total_entries() - 1;
+ do {
+ if (by_size) {
+ if (dst_node_size > limit) { break; }
+ } else {
+ if (num_moved == limit) { break; }
+ }
+ suffix_entry* this_sentry = get_suffix_entry(idx);
+
+ auto const this_prefix_slot = this_sentry->prefix_slot;
+ auto const it = this_to_dst_prefix.find(this_prefix_slot);
+
+ if (it == this_to_dst_prefix.cend()) {
+ // Have not seen this prefix before: allocate a prefix in the dest node, copy the prefix to the dst
+ // node and update our suffix entry to point to that slot temporarily. The suffix entries themselves
+ // will be moved to the dst node all at once later.
+ uint16_t dst_prefix_slot = dst_node.alloc_prefix();
+ prefix_entry* dst_pentry = dst_node.get_prefix_entry(dst_prefix_slot);
+
+ std::memcpy(voidptr_cast(dst_pentry), c_voidptr_cast(get_prefix_entry_c(this_prefix_slot)),
+ prefix_entry::size());
+
+ dst_pentry->ref_count = 1;
+ this_sentry->prefix_slot = dst_prefix_slot;
+
+ this_to_dst_prefix.insert(std::pair(this_prefix_slot, dst_prefix_slot));
+ dst_node_size += prefix_entry::size();
+ } else {
+ prefix_entry* new_pentry = dst_node.get_prefix_entry(it->second);
+ ++new_pentry->ref_count;
+ this_sentry->prefix_slot = it->second;
+ }
+
+ // Remove a reference to this prefix slot, since the suffix will eventually be moved to the dst node
+ deref_remove_prefix(this_prefix_slot);
+ dst_node_size += suffix_entry::size();
+ ++num_moved;
+ } while (idx-- > 0);
+
+ // Step 2: Move the suffixes and adjust the num_entries in source and destination. All to-be-moved suffixes
+ // have already been adjusted to their new prefix slot as part of Step 1
+ std::memmove(uintptr_cast(dst_node.get_suffix_entry(0)), uintptr_cast(get_suffix_entry(idx + 1)),
+ num_moved * suffix_entry::size());
+ this->sub_entries(num_moved);
+ dst_node.add_entries(num_moved);
+
+ // Step 3: Adjust the header parameters for old and new nodes. For the old node, the other header parameters
+ // were already adjusted as part of Step 1, except the generation count
+ this->inc_gen();
+ dst_node.inc_gen();
+ auto new_phdr = dst_node.prefix_header();
+
+ if (!this->is_leaf() && (dst_node.total_entries() != 0)) {
+ // In case this node is an edge node, move the edge link to the right hand side node
+ dst_node.set_edge_info(this->edge_info());
+ this->invalidate_edge();
+ }
+
+ // Step 4: Use this opportunity to compact the source node if needed. The destination node is written in
+ // compacted state anyway
+ if (is_compaction_suggested()) { compact(); }
+
+#ifndef NDEBUG
+ validate_sanity();
+ dst_node.validate_sanity();
+#endif
+ return by_size ? num_moved : dst_node_size;
+ }
+
+ btree_status_t insert(uint32_t idx, BtreeKey const& key, BtreeValue const& val) override {
+ if (!has_room(1u)) { return btree_status_t::space_not_avail; }
+
+ std::memmove(get_suffix_entry(idx + 1), get_suffix_entry(idx),
+ (this->total_entries() - idx) * suffix_entry::size());
+
+ write_suffix(idx, add_prefix(key, val), key, val);
+ this->inc_entries();
+ this->inc_gen();
+
+#ifndef NDEBUG
+ validate_sanity();
+#endif
+ return btree_status_t::success;
+ }
+
+ void update(uint32_t idx, BtreeValue const& val) override {
+ update(idx, BtreeNode::get_nth_key< K >(idx, false), val);
+ }
+
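+ // Unlike fixed/varlen nodes, an in-place update here may first need to reclaim holes in the prefix area:
+ // has_room_after_compaction() below also counts the space freed by fully dereferenced prefix slots.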
+ void update(uint32_t idx, BtreeKey const& key, BtreeValue const& val) override {
+ // If we are updating the edge value, none of the other logic matters. Just update the edge value and move on
+ if (idx == this->total_entries()) {
+ DEBUG_ASSERT_EQ(this->is_leaf(), false);
+ this->set_edge_value(val);
+ this->inc_gen();
+ return;
+ }
+
+ if (!has_room(1u)) {
+ if (has_room_after_compaction(1u)) {
+ compact();
+ } else {
+ LOGMSG_ASSERT(false, "Even after compaction there is no room for update");
+ return;
+ }
+ }
+ write_suffix(idx, add_prefix(key, val), key, val);
+ this->inc_gen();
+
+#ifndef NDEBUG
+ validate_sanity();
+#endif
+ }
+
+ void remove(uint32_t idx) override {
+ if (idx == this->total_entries()) {
+ DEBUG_ASSERT(!this->is_leaf() && this->has_valid_edge(),
+ "idx={} == num_entries={} for leaf or non-edge node", idx, this->total_entries());
+
+ if (idx == 0) {
+ this->invalidate_edge();
+ } else {
+ V last_1_val;
+ get_nth_value(idx - 1, &last_1_val, false);
+ this->set_edge_value(last_1_val);
+ }
+ } else {
+ suffix_entry* sentry = get_suffix_entry(idx);
+ deref_remove_prefix(sentry->prefix_slot);
+ std::memmove(uintptr_cast(sentry), uintptr_cast(get_suffix_entry(idx + 1)),
+ (this->total_entries() - idx - 1) * suffix_entry::size());
+ this->dec_entries();
+ }
+ this->inc_gen();
+ }
+
+ void remove(uint32_t idx_s, uint32_t idx_e) override {
+ // The range is inclusive; each removal shifts subsequent entries left, so keep removing at idx_s
+ for (auto idx{idx_s}; idx <= idx_e; ++idx) {
+ remove(idx_s);
+ }
+ }
+
+ void remove_all(BtreeConfig const& cfg) override {
+ this->sub_entries(this->total_entries());
+ this->invalidate_edge();
+ this->inc_gen();
+ prefix_bitset_ = sisl::CompactBitSet{sisl::blob{bitset_area(), reqd_bitset_size(cfg)}, true};
+
+#ifndef NDEBUG
+ validate_sanity();
+#endif
+ }
+
+ uint8_t* get_node_context() override { return uintptr_cast(this) + sizeof(FixedPrefixNode< K, V >); }
+
+ uint32_t get_nth_obj_size(uint32_t) const override { return get_key_size() + get_value_size(); }
+
+ uint32_t num_entries_by_size(uint32_t start_idx, uint32_t size) const {
+ uint32_t num_entries{0};
+ uint32_t cum_size{0};
+
+ std::unordered_set< uint16_t > prefixes;
+ for (auto idx{start_idx}; idx < this->total_entries(); ++idx) {
+ suffix_entry const* sentry = get_suffix_entry_c(idx);
+ if (prefixes.find(sentry->prefix_slot) == prefixes.cend()) {
+ prefixes.insert(sentry->prefix_slot);
+ cum_size += prefix_entry::size();
+ }
+ cum_size += suffix_entry::size();
+
+ if (cum_size > size) { return num_entries; }
+ ++num_entries;
+ }
+ return num_entries;
+ }
+
+ uint32_t copy_by_size(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx, uint32_t size) override {
+ return copy_internal(cfg, o, start_idx, true /* by_size*/, size);
+ }
+
+ uint32_t copy_by_entries(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx,
+ uint32_t nentries) override {
+ return copy_internal(cfg, o, start_idx, false /* by_size*/, nentries);
+ }
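+ // Both copy variants funnel into copy_internal() below, which deduplicates shared prefixes while copying
+ // entries in, mirroring what move_out_to_right_internal() does on the split path.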
+ uint32_t copy_internal(BtreeConfig const& cfg, BtreeNode const& o, uint32_t start_idx, bool by_size,
+ uint32_t limit) {
+ FixedPrefixNode const& src_node = s_cast< FixedPrefixNode const& >(o);
+
+ // Adjust the size_to_move to cover the new node's reqd header space.
+ uint32_t copied_size{0};
+
+ // Step 1: Walk forward from start_idx and map each source prefix slot to a prefix slot in this node. This
+ // map is used both to translate slot numbers and to track whether a prefix slot is shared by multiple
+ // suffixes. At the end of this step, all prefixes that need to be copied are copied, with the bitsets
+ // correctly updated on both source and destination.
+ std::map< uint16_t, uint16_t > src_to_my_prefix;
+ uint16_t src_idx{s_cast< uint16_t >(start_idx)};
+ uint16_t my_prefix_slot{0};
+ uint16_t my_idx = this->total_entries();
+ uint32_t num_copied{0};
+
+ while ((src_idx < src_node.total_entries()) && has_room(1u)) {
+ if (!by_size && num_copied >= limit) { break; }
+
+ suffix_entry const* src_sentry = src_node.get_suffix_entry_c(src_idx);
+ auto const src_prefix_slot = src_sentry->prefix_slot;
+
+ // Map the prefix slot from the src node to my node. If we don't have a prefix slot yet, we need to
+ // allocate one for the remote prefix slot and copy the prefix entry from the src node to my node. If we
+ // have one, just continue to use that by incrementing the ref_count.
+ auto const it = src_to_my_prefix.find(src_prefix_slot);
+ if (it == src_to_my_prefix.cend()) {
+ copied_size += prefix_entry::size() + suffix_entry::size();
+ if (by_size && (copied_size > limit)) { break; }
+
+ my_prefix_slot = alloc_prefix();
+ prefix_entry* my_pentry = get_prefix_entry(my_prefix_slot);
+ std::memcpy(voidptr_cast(my_pentry), c_voidptr_cast(src_node.get_prefix_entry_c(src_prefix_slot)),
+ prefix_entry::size());
+ my_pentry->ref_count = 1;
+
+ src_to_my_prefix.insert(std::pair(src_prefix_slot, my_prefix_slot));
+ } else {
+ copied_size += suffix_entry::size();
+ if (by_size && (copied_size > limit)) { break; }
+
+ my_prefix_slot = it->second;
+ prefix_entry* my_pentry = get_prefix_entry(it->second);
+ ++my_pentry->ref_count;
+ }
+
+ suffix_entry* my_sentry = get_suffix_entry(my_idx++);
+ std::memcpy(voidptr_cast(my_sentry), c_voidptr_cast(src_sentry), suffix_entry::size());
+ my_sentry->prefix_slot = my_prefix_slot;
+
+ ++src_idx;
+ ++num_copied;
+ }
+
+ this->add_entries(num_copied);
+ this->inc_gen();
+
+ // If we copied everything from start_idx till the end and it's an edge node, we need to copy the edge id as
+ // well.
+ if (src_node.has_valid_edge() && ((start_idx + num_copied) == src_node.total_entries())) {
+ this->set_edge_info(src_node.edge_info());
+ }
+
+#ifndef NDEBUG
+ validate_sanity();
+#endif
+ return by_size ? num_copied : copied_size;
+ }
+
+ std::string to_string(bool print_friendly = false) const override {
+ auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ",
+ (print_friendly ? "------------------------------------------------------------\n" : ""),
+ this->node_id(), this->level(), this->total_entries(),
+ (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode());
+ if (!this->is_leaf() && (this->has_valid_edge())) {
+ fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
+ this->edge_info().m_link_version);
+ }
+
+ fmt::format_to(std::back_inserter(str), "{}Prefix_Hdr={}, Prefix_Bitmap=[{}]\n",
+ (print_friendly ? "\n\t" : " "), cprefix_header()->to_string(), prefix_bitset_.to_string());
+
+ for (uint32_t i{0}; i < this->total_entries(); ++i) {
+ fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ?
"\n\t" : " "), i + 1, + BtreeNode::get_nth_key< K >(i, false).to_string(), + this->get_nth_value(i, false).to_string()); + } + return str; + } + + std::string to_string_keys(bool print_friendly = false) const override { return "NOT Supported"; } + +private: + uint16_t add_prefix(BtreeKey const& key, BtreeValue const& val) { + auto const slot_num = alloc_prefix(); + + // Layout the prefix key/value into the prefix slot allocated + prefix_entry* pentry = get_prefix_entry(slot_num); + pentry->ref_count = 0; // Num suffix referencing this prefix + pentry->write_kv(key, val); + + return slot_num; + } + + uint16_t alloc_prefix() { + auto const slot_num = prefix_bitset_.get_next_reset_bit(0); + if (slot_num == std::numeric_limits< uint16_t >::max()) { + DEBUG_ASSERT(false, "Unable to alloc slot, shouldn't be mutating in this node without splitting"); + return std::numeric_limits< uint16_t >::max(); + } + prefix_bitset_.set_bit(slot_num); + + auto phdr = prefix_header(); + ++phdr->used_slots; + if (slot_num > phdr->tail_slot) { phdr->tail_slot = slot_num; } + return slot_num; + } + + void ref_prefix(uint16_t slot_num) { ++(get_prefix_entry(slot_num)->ref_count); } + + void deref_remove_prefix(uint16_t slot_num) { + auto phdr = prefix_header(); + auto pentry = get_prefix_entry(slot_num); + DEBUG_ASSERT_GT(pentry->ref_count, 0, "Deref of prefix slot={} error: ref_count already 0", slot_num); + DEBUG_ASSERT_GT(phdr->used_slots, 0, "Deref of prefix slot={} error: used slot count is already 0", slot_num); + + if (--pentry->ref_count == 0) { + --phdr->used_slots; + prefix_bitset_.reset_bit(slot_num); + if ((slot_num != 0) && (slot_num == phdr->tail_slot)) { + uint16_t prev_slot = prefix_bitset_.get_prev_set_bit(slot_num); + if (prev_slot != std::numeric_limits< uint16_t >::max()) { phdr->tail_slot = prev_slot; } + } + } + } + + void write_suffix(uint16_t idx, uint16_t prefix_slot, BtreeKey const& key, BtreeValue const& val) { + suffix_entry* sentry = get_suffix_entry(idx); + sentry->prefix_slot = prefix_slot; + sentry->write_kv(key, val); + ref_prefix(prefix_slot); + } + + uint32_t available_size_without_compaction() const { + uint8_t const* suffix = r_cast< uint8_t const* >(get_suffix_entry_c(this->total_entries())); + uint8_t const* prefix = r_cast< uint8_t const* >(get_prefix_entry_c(cprefix_header()->tail_slot)); + + if (suffix <= prefix) { + return prefix - suffix; + } else { + DEBUG_ASSERT(false, "Node data is corrupted, suffix area is overlapping prefix area"); + return 0; + } + } + + uint32_t available_size_with_compaction() const { + return available_size_without_compaction() + (num_prefix_holes() * prefix_entry::size()); + } + + bool has_room(uint16_t for_nentries) const { + return (available_size_without_compaction() >= (prefix_entry::size() + (for_nentries * suffix_entry::size()))); + } + + bool has_room_after_compaction(uint16_t for_nentries) const { + return (available_size_with_compaction() >= (prefix_entry::size() + (for_nentries * suffix_entry::size()))); + } + + uint32_t num_prefix_holes() const { + auto phdr = cprefix_header(); + return (phdr->tail_slot + 1 - phdr->used_slots); + } + + bool is_compaction_suggested() const { return (num_prefix_holes() > prefix_node_header::min_holes_to_compact); } + + void compact() { + // Build reverse map from prefix to suffix + std::multimap< uint16_t, uint16_t > prefix_to_suffix; + for (uint16_t idx{0}; idx < this->total_entries(); ++idx) { + suffix_entry const* sentry = get_suffix_entry_c(idx); + 
prefix_to_suffix.insert(std::pair(sentry->prefix_slot, idx));
+        }
+
+        // Move every live slot that lies at or beyond used_slots into a free slot inside the
+        // compacted area [0, used_slots), patching all suffixes that reference each moved slot.
+        uint16_t from_slot{prefix_header()->used_slots};
+        while (true) {
+            from_slot = prefix_bitset_.get_next_set_bit(from_slot);
+            if (from_slot == std::numeric_limits< uint16_t >::max()) { break; }
+
+            auto const to_slot = prefix_bitset_.get_next_reset_bit(0u);
+            DEBUG_ASSERT_NE(to_slot, std::numeric_limits< uint16_t >::max(),
+                            "Didn't find a free location on to compaction side, not expected");
+            DEBUG_ASSERT_LT(to_slot, prefix_header()->used_slots,
+                            "Couldn't find enough slots inside compactable area, not expected");
+
+            std::memcpy(uintptr_cast(get_prefix_entry(to_slot)), (void*)get_prefix_entry(from_slot),
+                        prefix_entry::size());
+            prefix_bitset_.reset_bit(from_slot);
+            prefix_bitset_.set_bit(to_slot);
+
+            // Move all the suffixes that are referencing this prefix to the new location
+            auto range = prefix_to_suffix.equal_range(from_slot);
+            for (auto it = range.first; it != range.second; ++it) {
+                suffix_entry* sentry = get_suffix_entry(it->second);
+                sentry->prefix_slot = to_slot;
+            }
+        }
+
+        // Finally adjust the tail slot. tail_slot tracks the highest slot in use, so after
+        // compaction it is used_slots - 1.
+        auto phdr = prefix_header();
+        phdr->tail_slot = (phdr->used_slots > 0) ? (phdr->used_slots - 1) : 0;
+    }
+
+#ifdef _DEBUG
+    void validate_sanity() {
+        uint32_t i{0};
+        // validate if keys are in ascending order
+        K prevKey;
+        while (i < this->total_entries()) {
+            K key = BtreeNode::get_nth_key< K >(i, false);
+            uint64_t kp = *(uint64_t*)key.serialize().bytes;
+            if (i > 0 && prevKey.compare(key) > 0) {
+                DEBUG_ASSERT(false, "Found non-sorted entry: {} -> {}", kp, to_string());
+            }
+            prevKey = key;
+            ++i;
+        }
+    }
+#endif
+
+    //////////////////////// All Helper methods section ////////////////////////
+    static uint32_t reqd_bitset_size(BtreeConfig const& cfg) {
+        return sisl::round_up(cfg.node_data_size() / (prefix_entry::key_size() + prefix_entry::value_size()) / 8,
+                              sisl::CompactBitSet::size_multiples());
+    }
+
+    prefix_node_header* prefix_header() { return r_cast< prefix_node_header* >(this->node_data_area()); }
+    prefix_node_header const* cprefix_header() const {
+        return r_cast< prefix_node_header const* >(this->node_data_area_const());
+    }
+
+    uint8_t* bitset_area() { return this->node_data_area() + sizeof(prefix_node_header); }
+    uint8_t const* cbitset_area() const { return this->node_data_area_const() + sizeof(prefix_node_header); }
+
+    uint8_t* suffix_kv_area() { return bitset_area() + (prefix_bitset_.size() / 8); }
+    uint8_t const* csuffix_kv_area() const { return cbitset_area() + (prefix_bitset_.size() / 8); }
+
+    prefix_entry* get_prefix_entry(uint16_t slot_num) {
+        return r_cast< prefix_entry* >(this->node_data_area() +
+                                       (this->node_data_size() - ((slot_num + 1) * prefix_entry::size())));
+    }
+
+    prefix_entry const* get_prefix_entry_c(uint16_t slot_num) const {
+        return r_cast< prefix_entry const* >(this->node_data_area_const() +
+                                             (this->node_data_size() - ((slot_num + 1) * prefix_entry::size())));
+    }
+
+    suffix_entry* get_suffix_entry(uint16_t idx) {
+        return r_cast< suffix_entry* >(suffix_kv_area() + (idx * suffix_entry::size()));
+    }
+    suffix_entry const* get_suffix_entry_c(uint16_t idx) const {
+        return r_cast< suffix_entry const* >(csuffix_kv_area() + (idx * suffix_entry::size()));
+    }
+
+    static constexpr uint32_t get_key_size() { return prefix_entry::key_size() + suffix_entry::key_size(); }
+
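+    // Layout recap for the accessor arithmetic above (widths are illustrative, not to scale):
+    //
+    //   [prefix_node_header][prefix bitset][suffix_0][suffix_1] -->  ...  <-- [prefix_1][prefix_0]
+    //   ^node_data_area()                  ^suffix_kv_area()                  prefixes grow down
+    //                                       suffixes grow up                  from the node end
+    //
+    //   get_suffix_entry(i) = suffix_kv_area() + i * suffix_entry::size()
+    //   get_prefix_entry(s) = node_data_area() + node_data_size() - (s + 1) * prefix_entry::size()
+    //   free space (without compaction) = get_prefix_entry(tail_slot) - get_suffix_entry(total_entries)
+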
static constexpr uint32_t get_value_size() { return prefix_entry::value_size() + suffix_entry::value_size(); } +}; +} // namespace homestore diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index 8dbbc62a6..39f930bbb 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -16,8 +16,8 @@ #pragma once #include -#include "btree_node.hpp" -#include "btree_internal.hpp" +#include +#include #include "homestore/index/index_internal.hpp" using namespace std; @@ -28,13 +28,21 @@ SISL_LOGGING_DECL(btree) namespace homestore { template < typename K, typename V > -class SimpleNode : public BtreeNode { +class SimpleNode : public VariantNode< K, V > { public: SimpleNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) : - BtreeNode(node_buf, id, init, is_leaf) { + VariantNode< K, V >(node_buf, id, init, is_leaf, cfg) { this->set_node_type(btree_node_type::FIXED); } + using BtreeNode::get_nth_key_internal; + using BtreeNode::get_nth_key_size; + using BtreeNode::get_nth_obj_size; + using BtreeNode::get_nth_value; + using BtreeNode::get_nth_value_size; + using BtreeNode::to_string; + using VariantNode< K, V >::get_nth_value; + // Insert the key and value in provided index // Assumption: Node lock is already taken btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) override { @@ -83,7 +91,7 @@ class SimpleNode : public BtreeNode { // Set the last key/value as edge entry and by decrementing entry count automatically removed the last // entry. BtreeLinkInfo new_edge; - get_nth_value(ind_s - 1, &new_edge, false); + this->get_nth_value(ind_s - 1, &new_edge, false); this->set_nth_value(total_entries, new_edge); this->sub_entries(total_entries - ind_s + 1); } else { @@ -107,15 +115,11 @@ class SimpleNode : public BtreeNode { #endif } - void append(uint32_t ind, const BtreeKey& key, const BtreeValue& val) override { - RELEASE_ASSERT(false, "Append operation is not supported on simple node"); - } - uint32_t move_out_to_right_by_entries(const BtreeConfig& cfg, BtreeNode& o, uint32_t nentries) override { auto& other_node = s_cast< SimpleNode< K, V >& >(o); // Minimum of whats to be moved out and how many slots available in other node - nentries = std::min({nentries, this->total_entries(), other_node.get_available_entries(cfg)}); + nentries = std::min({nentries, this->total_entries(), other_node.get_available_entries()}); uint32_t sz = nentries * get_nth_obj_size(0); if (sz != 0) { @@ -161,7 +165,7 @@ class SimpleNode : public BtreeNode { auto& other = s_cast< const SimpleNode< K, V >& >(o); nentries = std::min(nentries, other.total_entries() - start_idx); - nentries = std::min(nentries, this->get_available_entries(cfg)); + nentries = std::min(nentries, this->get_available_entries()); uint32_t sz = nentries * get_nth_obj_size(0); if (sz != 0) { std::memcpy(get_nth_obj(this->total_entries()), other.get_nth_obj_const(start_idx), sz); } this->add_entries(nentries); @@ -174,51 +178,15 @@ class SimpleNode : public BtreeNode { return nentries; } - /*uint32_t move_in_from_right_by_entries(const BtreeConfig& cfg, BtreeNode& o, uint32_t nentries) override { - auto& other_node = s_cast< SimpleNode< K, V >& >(o); - - // Minimum of whats to be moved and how many slots available - nentries = std::min({nentries, other_node.total_entries(), get_available_entries(cfg)}); - uint32_t sz = nentries * get_nth_obj_size(0); - if (sz != 0) { - 
uint32_t othersz = (other_node.total_entries() - nentries) * other_node.get_nth_obj_size(0); - std::memmove(get_nth_obj(this->total_entries()), other_node.get_nth_obj(0), sz); - std::memmove(other_node.get_nth_obj(0), other_node.get_nth_obj(nentries), othersz); - } - - other_node.sub_entries(nentries); - this->add_entries(nentries); - - // If next node does not have any more entries, but only a edge entry - // we need to move that to us, so that if need be next node could be freed. - if ((other_node.total_entries() == 0) && other_node.has_valid_edge()) { - DEBUG_ASSERT_EQ(this->has_valid_edge(), false, "node={}", to_string()); - this->set_edge_id(other_node.edge_id()); - other_node.invalidate_edge(); - } - - other_node.inc_gen(); - this->inc_gen(); - -#ifndef NDEBUG - validate_sanity(); -#endif - return nentries; - } - - uint32_t move_in_from_right_by_size(const BtreeConfig& cfg, BtreeNode& o, uint32_t size) override { - return (get_nth_obj_size(0) * move_in_from_right_by_entries(cfg, o, size / get_nth_obj_size(0))); - } */ - - uint32_t available_size(const BtreeConfig& cfg) const override { - return (cfg.node_data_size() - (this->total_entries() * get_nth_obj_size(0))); + uint32_t available_size() const override { + return (this->node_data_size() - (this->total_entries() * get_nth_obj_size(0))); } void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copy) const override { DEBUG_ASSERT_LT(ind, this->total_entries(), "node={}", to_string()); sisl::blob b; b.bytes = (uint8_t*)(this->node_data_area_const() + (get_nth_obj_size(ind) * ind)); - b.size = get_obj_key_size(ind); + b.size = get_nth_key_size(ind); out_key.deserialize(b, copy); } @@ -228,19 +196,18 @@ class SimpleNode : public BtreeNode { DEBUG_ASSERT_EQ(this->has_valid_edge(), true, "node={}", to_string()); *(BtreeLinkInfo*)out_val = this->get_edge_value(); } else { - sisl::blob b; - b.bytes = const_cast< uint8_t* >(reinterpret_cast< const uint8_t* >( - this->node_data_area_const() + (get_nth_obj_size(ind) * ind) + get_obj_key_size(ind))); - b.size = V::get_fixed_size(); + sisl::blob b{const_cast< uint8_t* >(this->node_data_area_const() + (get_nth_obj_size(ind) * ind) + + get_nth_key_size(ind)), + dummy_value< V >.serialized_size()}; out_val->deserialize(b, copy); } } - /*V get_nth_value(uint32_t ind, bool copy) const { - V val; - get_nth_value(ind, &val, copy); - return val; - }*/ + bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override { + return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT)) + ? (get_available_entries() > 0) + : true; + } std::string to_string(bool print_friendly = false) const override { auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ", @@ -253,10 +220,9 @@ class SimpleNode : public BtreeNode { } for (uint32_t i{0}; i < this->total_entries(); ++i) { - V val; - get_nth_value(i, &val, false); fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? 
"\n\t" : " "), i + 1, - get_nth_key< K >(i, false).to_string(), val.to_string()); + BtreeNode::get_nth_key< K >(i, false).to_string(), + this->get_nth_value(i, false).to_string()); } return str; } @@ -278,15 +244,15 @@ class SimpleNode : public BtreeNode { if (!this->is_leaf()) { fmt::format_to(std::back_inserter(str), " ["); for (uint32_t i{0}; i < this->total_entries(); ++i) { - uint32_t cur_key = get_nth_key< K >(i, false).key(); + uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key(); fmt::format_to(std::back_inserter(str), "{}{}", cur_key, i == this->total_entries() - 1 ? "" : ", "); } fmt::format_to(std::back_inserter(str), "]"); return str; } - uint32_t prev_key = get_nth_key< K >(0, false).key(); + uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key(); uint32_t cur_key = prev_key; - uint32_t last_key = get_nth_key< K >(this->total_entries() - 1, false).key(); + uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key(); if (last_key - prev_key == this->total_entries() - 1) { if (this->total_entries() == 1) fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key); @@ -297,7 +263,7 @@ class SimpleNode : public BtreeNode { fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key); uint32_t start_interval_key = prev_key; for (uint32_t i{1}; i < this->total_entries(); ++i) { - cur_key = get_nth_key< K >(i, false).key(); + cur_key = BtreeNode::get_nth_key< K >(i, false).key(); if (cur_key != prev_key + 1) { if (start_interval_key == prev_key) { fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key); @@ -327,10 +293,10 @@ class SimpleNode : public BtreeNode { // validate if keys are in ascending order uint32_t i{1}; - K prevKey = get_nth_key< K >(0, false); + K prevKey = BtreeNode::get_nth_key< K >(0, false); while (i < this->total_entries()) { - K key = get_nth_key< K >(i, false); + K key = BtreeNode::get_nth_key< K >(i, false); if (i > 0 && prevKey.compare(key) > 0) { LOGINFO("non sorted entry : {} -> {} ", prevKey.to_string(), key.to_string()); DEBUG_ASSERT(false, "node={}", to_string()); @@ -342,16 +308,9 @@ class SimpleNode : public BtreeNode { #endif inline uint32_t get_nth_obj_size(uint32_t ind) const override { - return (get_obj_key_size(ind) + get_obj_value_size(ind)); + return (get_nth_key_size(ind) + get_nth_value_size(ind)); } - int compare_nth_key(const BtreeKey& cmp_key, uint32_t ind) const override { - return get_nth_key< K >(ind, false).compare(cmp_key); - } - - // Simple/Fixed node doesn't need a record to point key/value object - uint16_t get_record_size() const override { return 0; } - /*int compare_nth_key_range(const BtreeKeyRange& range, uint32_t ind) const override { return get_nth_key(ind, false).compare_range(range); }*/ @@ -370,11 +329,11 @@ class SimpleNode : public BtreeNode { } } - uint32_t get_available_entries(const BtreeConfig& cfg) const { return available_size(cfg) / get_nth_obj_size(0); } + uint32_t get_available_entries() const { return available_size() / get_nth_obj_size(0); } - inline uint32_t get_obj_key_size(uint32_t ind) const { return K::get_fixed_size(); } + uint32_t get_nth_key_size(uint32_t ind) const override { return dummy_key< K >.serialized_size(); } - inline uint32_t get_obj_value_size(uint32_t ind) const { return V::get_fixed_size(); } + uint32_t get_nth_value_size(uint32_t ind) const override { return dummy_value< V >.serialized_size(); } uint8_t* get_nth_obj(uint32_t ind) { return (this->node_data_area() + 
(get_nth_obj_size(ind) * ind)); } const uint8_t* get_nth_obj_const(uint32_t ind) const { @@ -395,7 +354,7 @@ class SimpleNode : public BtreeNode { "Invalid value size being set for non-leaf node"); this->set_edge_info(*r_cast< BtreeLinkInfo::bnode_link_info* >(b.bytes)); } else { - uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind) + get_obj_key_size(ind); + uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind) + get_nth_key_size(ind); std::memcpy(entry, b.bytes, b.size); } } diff --git a/src/include/homestore/btree/detail/variant_node.hpp b/src/include/homestore/btree/detail/variant_node.hpp new file mode 100644 index 000000000..b54dca2f8 --- /dev/null +++ b/src/include/homestore/btree/detail/variant_node.hpp @@ -0,0 +1,311 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +#include +#include + +namespace homestore { +template < typename K > +static K dummy_key; + +template < typename V > +static V dummy_value; + +template < typename K, typename V > +class VariantNode : public BtreeNode { +public: + using BtreeNode::get_nth_key_size; + using BtreeNode::get_nth_value; + + VariantNode(uint8_t* node_buf, bnodeid_t id, bool init_buf, bool is_leaf, BtreeConfig const& cfg) : + BtreeNode(node_buf, id, init_buf, is_leaf, cfg) {} + + ///////////////////////////////////////// Get related APIs of the node ///////////////////////////////////////// + + /// @brief Gets all entries in the node that have keys within the specified range. + /// + /// This method returns all entries in the node that have keys within the specified range. The method searches the + /// node using a binary search algorithm to find the first and last entries that have keys within the range. The + /// method returns the indices of these entries in the node and optionally returns the key-value pairs of the + /// entries. + /// + /// @tparam K The type of the keys in the node. + /// @tparam V The type of the values in the node. + /// @param range The range of keys to search for. + /// @param max_count The maximum number of entries to return. + /// @param start_idx [out] A reference to an integer to store the index of the first entry that has a key within the + /// range. + /// @param end_idx [out] A reference to an integer to store the index of the last entry that has a key within the + /// range. + /// @param out_values [optional] A pointer to a vector to store the key-value pairs of the entries if provided. Can + /// be nullptr + /// @param filter_cb [optional] A callback function to be called for each entry found in the node that has a key. + /// The callback is expected to return true if the entry should be included in the result and false otherwise. 
+    /// @return The number of entries in the range that were accepted by the filter callback (if
+    /// any) and hence included in the result.
+    virtual uint32_t multi_get(BtreeKeyRange< K > const& range, uint32_t max_count, uint32_t& start_idx,
+                               uint32_t& end_idx, std::vector< std::pair< K, V > >* out_values = nullptr,
+                               get_filter_cb_t const& filter_cb = nullptr) const {
+        if (!match_range(range, start_idx, end_idx)) { return 0; }
+
+        uint32_t count = std::min(end_idx - start_idx + 1, max_count);
+        if (out_values || filter_cb) {
+            /* get the keys and values */
+            auto const upto_idx = start_idx + count;
+            for (auto i{start_idx}; i < upto_idx; ++i) {
+                K key = get_nth_key< K >(i, (out_values != nullptr) /* copy */);
+                V val = get_nth_value(i, (out_values != nullptr) /* copy */);
+                if (!filter_cb || filter_cb(key, val)) {
+                    if (out_values) { out_values->emplace_back(std::move(key), std::move(val)); }
+                } else {
+                    --count;
+                }
+            }
+        }
+        return count;
+    }
+
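+    // The filter lets a caller drop entries in the same pass that collects them. A self-contained
+    // analogue of the multi_get() contract over a sorted std::map (a sketch; the homestore key,
+    // value and range types are replaced by uint64_t here):
+    //
+    //   using Filter = std::function< bool(uint64_t, uint64_t) >;
+    //   uint32_t multi_get_like(std::map< uint64_t, uint64_t > const& m, uint64_t start, uint64_t end,
+    //                           uint32_t max_count, std::vector< std::pair< uint64_t, uint64_t > >& out,
+    //                           Filter const& filter = nullptr) {
+    //       uint32_t count = 0;
+    //       for (auto it = m.lower_bound(start); (it != m.end()) && (it->first <= end); ++it) {
+    //           if (count == max_count) { break; }
+    //           if (!filter || filter(it->first, it->second)) {
+    //               out.emplace_back(*it);
+    //               ++count;
+    //           }
+    //       }
+    //       return count;
+    //   }
+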
+    /// @brief Gets any entry in the node that has a key within the specified range.
+    ///
+    /// This method returns any entry in the node that has a key within the specified range. The method does a binary
+    /// search to find the first entry that has a key within the range. It returns the index of the entry in the node
+    /// and optionally returns the key and value of the entry.
+    ///
+    /// @param range The range of keys to search for.
+    /// @param out_key [optional] A pointer to a key to store the key of the entry if desired.
+    /// @param out_val [optional] A pointer to a value to store the value of the entry if desired.
+    /// @param copy_key Whether to copy the key of the entry to the output key. If not copied, the output key points
+    /// into the node's internal memory, so it is not advisable to set this to false if the key is accessed after any
+    /// subsequent mutation of the node.
+    /// @param copy_val Whether to copy the value of the entry to the output value. If not copied, the output value
+    /// points into the node's internal memory, so it is not advisable to set this to false if the value is accessed
+    /// after any subsequent mutation of the node.
+    /// @param filter_cb [optional] A callback function to be called for the entry found in the node that has a key
+    /// within the range. The callback is expected to return true if the entry should be included in the result and
+    /// false otherwise.
+    /// @return A pair of a boolean and an integer.
+    ///         The boolean indicates whether an entry was found within the range.
+    ///         The integer is the index of the entry in the node.
+    virtual std::pair< bool, uint32_t > get_any(BtreeKeyRange< K > const& range, BtreeKey* out_key, BtreeValue* out_val,
+                                                bool copy_key, bool copy_val,
+                                                get_filter_cb_t const& filter_cb = nullptr) const {
+        LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}",
+                         get_persistent_header_const()->to_string());
+        uint32_t result_idx;
+        const auto mm_opt = range.multi_option();
+        bool efound;
+        uint32_t end_idx;
+
+        // Get the start index of the search range.
+        auto [sfound, start_idx] = bsearch_node(range.start_key());
+        if (sfound && !range.is_start_inclusive()) {
+            ++start_idx;
+            sfound = false;
+        }
+
+        if (sfound && ((mm_opt == MultiMatchOption::DO_NOT_CARE) || (mm_opt == MultiMatchOption::LEFT_MOST))) {
+            result_idx = start_idx;
+            goto found_result;
+        } else if (start_idx == total_entries()) {
+            DEBUG_ASSERT(is_leaf() || has_valid_edge(), "Invalid node");
+            return std::make_pair(false, 0); // out_of_range
+        }
+
+        std::tie(efound, end_idx) = bsearch_node(range.end_key());
+        if (efound && !range.is_end_inclusive()) {
+            if (end_idx == 0) { return std::make_pair(false, 0); }
+            --end_idx;
+            efound = false;
+        }
+
+        if (end_idx > start_idx) {
+            if (mm_opt == MultiMatchOption::RIGHT_MOST) {
+                result_idx = end_idx;
+            } else if (mm_opt == MultiMatchOption::MID) {
+                result_idx = start_idx + ((end_idx - start_idx) / 2); // midpoint of the matched range
+            } else {
+                result_idx = start_idx;
+            }
+        } else if ((start_idx == end_idx) && (sfound || efound)) {
+            result_idx = start_idx;
+        } else {
+            return std::make_pair(false, 0);
+        }
+
+    found_result:
+        K tmp_key;
+        if (filter_cb && !out_key) {
+            out_key = &tmp_key;
+            copy_key = false;
+        }
+
+        V tmp_val;
+        if (filter_cb && !out_val) {
+            out_val = &tmp_val;
+            copy_val = false;
+        }
+
+        if (out_key) { get_nth_key_internal(result_idx, *out_key, copy_key); }
+        if (out_val) { get_nth_value(result_idx, out_val, copy_val); }
+
+        return (!filter_cb || filter_cb(*out_key, *out_val)) ? std::make_pair(true, result_idx)
+                                                             : std::make_pair(false, 0u);
+    }
+
+    V get_nth_value(uint32_t idx, bool copy) const {
+        V out_val;
+        get_nth_value(idx, &out_val, copy);
+        return out_val;
+    }
+
+    int compare_nth_key(const BtreeKey& cmp_key, uint32_t ind) const override {
+        return get_nth_key< K >(ind, false).compare(cmp_key);
+    }
+
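+    // The start/end resolution in get_any() above is two binary searches plus inclusivity
+    // adjustments. The same bound arithmetic on a sorted vector (a sketch, not bsearch_node()):
+    //
+    //   // Returns [lo, hi) index bounds; the range is empty when lo >= hi.
+    //   std::pair< size_t, size_t > resolve_bounds(std::vector< uint64_t > const& keys,
+    //                                              uint64_t start, bool start_incl,
+    //                                              uint64_t end, bool end_incl) {
+    //       auto lo = start_incl ? std::lower_bound(keys.begin(), keys.end(), start)
+    //                            : std::upper_bound(keys.begin(), keys.end(), start);
+    //       auto hi = end_incl ? std::upper_bound(keys.begin(), keys.end(), end)
+    //                          : std::lower_bound(keys.begin(), keys.end(), end);
+    //       return {size_t(lo - keys.begin()), size_t(hi - keys.begin())};
+    //   }
+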
+    ///////////////////////////////////////// Put related APIs of the node /////////////////////////////////////////
+    /// @brief Inserts or updates an entry with the specified key and value in the node.
+    ///
+    /// This method inserts or updates an entry with the specified key and value in the node. It binary searches
+    /// the node to find the index of the entry with the specified key. If an entry with the specified key is found,
+    /// it updates the value for the key according to the specified put type. If an entry with the specified key is
+    /// not found, it inserts a new entry with the specified key and value. The method optionally returns the value
+    /// of the existing entry if it was updated.
+    ///
+    /// NOTE: The operation fails if the put type is INSERT and an entry with the specified key already exists in the
+    /// node.
+    ///
+    /// @param key The key of the entry to insert or update.
+    /// @param val The value of the entry to insert or update.
+    /// @param put_type The type of put operation to perform if an entry with the specified key is found. put_type
+    /// translates into one of "Insert", "Update" or "Upsert".
+    /// @param existing_val [optional] A pointer to a value to store the value of the existing entry if it was updated.
+    /// @param filter_cb [optional] A callback function to be called for the entry found with the matching key. It is
+    /// used as a filter to skip entries that need not be updated.
+    /// @return A boolean indicating whether the operation was successful.
+    ///
+    virtual bool put(BtreeKey const& key, BtreeValue const& val, btree_put_type put_type, BtreeValue* existing_val,
+                     put_filter_cb_t const& filter_cb = nullptr) {
+        LOGMSG_ASSERT_EQ(magic(), BTREE_NODE_MAGIC, "Magic mismatch on btree_node {}",
+                         get_persistent_header_const()->to_string());
+        bool ret = true;
+
+        DEBUG_ASSERT_EQ(this->is_leaf(), true,
+                        "Put operation on a node is supported only for leaf nodes; interior nodes use the "
+                        "index-specific insert/update APIs");
+
+        const auto [found, idx] = find(key, nullptr, false);
+        if (found) {
+            if (existing_val) { get_nth_value(idx, existing_val, true); }
+            if (filter_cb &&
+                filter_cb(get_nth_key< K >(idx, false), get_nth_value(idx, false), val) !=
+                    put_filter_decision::replace) {
+                return false;
+            }
+        }
+
+        if (put_type == btree_put_type::INSERT) {
+            if (found) {
+                LOGDEBUG("Attempt to insert duplicate entry {}", key.to_string());
+                return false;
+            }
+            ret = (insert(idx, key, val) == btree_status_t::success);
+        } else if (put_type == btree_put_type::UPDATE) {
+            if (!found) { return false; }
+            update(idx, key, val);
+        } else if (put_type == btree_put_type::UPSERT) {
+            (found) ? update(idx, key, val) : (void)insert(idx, key, val);
+        } else {
+            DEBUG_ASSERT(false, "Wrong put_type {}", put_type);
+        }
+        return ret;
+    }
+
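+    // As the commit message notes, the put filter is what provides an atomic compare_and_set: the
+    // callback observes the current value while the node lock is held and can veto the
+    // replacement. A hypothetical caller-side filter (value types simplified; not a homestore API):
+    //
+    //   auto make_cas_filter = [](uint64_t expected) {
+    //       return [expected](auto const& /*key*/, auto const& current, auto const& /*new_val*/) {
+    //           return (current == expected) ? put_filter_decision::replace
+    //                                        : put_filter_decision::keep;
+    //       };
+    //   };
+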
+    /// @brief Put a batch of key/values into this node
+    ///
+    /// This method updates all entries in the node that have keys within the specified range.
+    /// NOTE: The method is supported only for leaf nodes.
+    /// NOTE: This base class version only supports range updates.
+    ///
+    /// @param keys The range of keys to update.
+    /// @param val The value to put.
+    /// @param last_failed_key [optional] If non-null and there is not enough room to put all the entries, it is set
+    /// to the key at which the operation could not proceed.
+    /// @param filter_cb The callback function to be called for each entry found within the range. The function takes
+    /// three arguments: a key, the current value and the new value, and returns a put_filter_decision value. If the
+    /// function returns:
+    ///     put_filter_decision::replace, the entry is updated with the new value.
+    ///     put_filter_decision::remove, the entry is removed from the node.
+    ///     put_filter_decision::keep, the entry is not modified and the method moves on to the next entry.
+    /// @return btree_status_t::success if all keys were updated; btree_status_t::has_more if the node ran out of
+    /// room (with last_failed_key, if provided, set to the first key that was not put); btree_status_t::not_found if
+    /// no key within the range exists in this node.
+    virtual btree_status_t multi_put(BtreeKeyRange< K > const& keys, BtreeKey const&, BtreeValue const& val,
+                                     btree_put_type put_type, K* last_failed_key,
+                                     put_filter_cb_t const& filter_cb = nullptr) {
+        if (put_type != btree_put_type::UPDATE) {
+            DEBUG_ASSERT(false, "For non-interval keys, multi_put can only be an update; it cannot insert");
+            return btree_status_t::not_supported;
+        }
+        DEBUG_ASSERT_EQ(this->is_leaf(), true, "Multi put entries on node are supported only for leaf nodes");
+
+        // Match the key range to get the start and end idx. If nothing in the range matches, return not_found.
+        uint32_t start_idx;
+        uint32_t end_idx;
+        if (!this->match_range(keys, start_idx, end_idx)) { return btree_status_t::not_found; }
+
+        const auto new_val_size = val.serialized_size();
+        for (auto idx{start_idx}; idx <= end_idx; ++idx) {
+            if (!has_room_for_put(put_type, get_nth_key_size(idx), new_val_size)) {
+                if (last_failed_key) { this->get_nth_key_internal(idx, *last_failed_key, true); }
+                return btree_status_t::has_more;
+            }
+            if (filter_cb) {
+                auto decision = filter_cb(get_nth_key< K >(idx, false), get_nth_value(idx, false), val);
+                if (decision == put_filter_decision::replace) {
+                    this->update(idx, val);
+                } else if (decision == put_filter_decision::remove) {
+                    this->remove(idx);
+                    if (end_idx == 0) { break; } // removed the only remaining in-range entry
+                    --end_idx; // the removal shifted subsequent entries left, so shrink the range end too
+                    --idx;
+                }
+            } else {
+                update(idx, val);
+            }
+        }
+        return btree_status_t::success;
+    }
+
+    ///////////////////////////////////////// Remove related APIs of the node /////////////////////////////////////////
+    virtual uint32_t multi_remove(BtreeKeyRange< K > const& keys, remove_filter_cb_t const& filter_cb = nullptr) {
+        DEBUG_ASSERT_EQ(this->is_leaf(), true, "Multi remove entries on node are supported only for leaf nodes");
+
+        // Match the key range to get the start and end idx. If nothing in the range matches, there is nothing to
+        // remove.
+        uint32_t start_idx{0};
+        uint32_t end_idx{0};
+        if (!this->match_range(keys, start_idx, end_idx)) { return 0u; }
+
+        auto removed_count = end_idx - start_idx + 1;
+        auto ret = removed_count;
+        for (uint32_t count = 0; count < removed_count; ++count) {
+            if (!filter_cb || filter_cb(get_nth_key< K >(start_idx, false), get_nth_value(start_idx, false))) {
+                this->remove(start_idx);
+            } else {
+                ++start_idx; // Skipping the entry
+                --ret;
+            }
+        }
+        return ret;
+    }
+};
+} // namespace homestore
\ No newline at end of file
diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp
index 9c89a89c0..8582f639b 100644
--- a/src/include/homestore/btree/detail/varlen_node.hpp
+++ b/src/include/homestore/btree/detail/varlen_node.hpp
@@ -17,7 +17,7 @@
 #pragma once
 #include
-#include "btree_node.hpp"
+#include
 #include
 #include "homestore/index/index_internal.hpp"
@@ -47,10 +47,18 @@ struct var_node_header {
 // [Persistent Header][var node header][Record][Record].. ... ... [key][value][key][value]
 //
 template < typename K, typename V >
-class VariableNode : public BtreeNode {
+class VariableNode : public VariantNode< K, V > {
 public:
+    using BtreeNode::get_nth_key_internal;
+    using BtreeNode::get_nth_key_size;
+    using BtreeNode::get_nth_obj_size;
+    using BtreeNode::get_nth_value;
+    using BtreeNode::get_nth_value_size;
+    using BtreeNode::to_string;
+    using VariantNode< K, V >::get_nth_value;
+
     VariableNode(uint8_t* node_buf, bnodeid_t id, bool init, bool is_leaf, const BtreeConfig& cfg) :
-        BtreeNode(node_buf, id, init, is_leaf) {
+        VariantNode< K, V >(node_buf, id, init, is_leaf, cfg) {
         if (init) {
             // Tail arena points to the edge of the node as data arena grows backwards.
Entire space is now available // except for the header itself @@ -63,10 +71,6 @@ class VariableNode : public BtreeNode { virtual ~VariableNode() = default; - uint32_t occupied_size(const BtreeConfig& cfg) const override { - return (cfg.node_data_size() - sizeof(var_node_header) - available_size(cfg)); - } - /* Insert the key and value in provided index * Assumption: Node lock is already taken */ btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) override { @@ -75,8 +79,7 @@ class VariableNode : public BtreeNode { #ifndef NDEBUG validate_sanity(); #endif - if (sz == 0) { return btree_status_t::insert_failed; } - return btree_status_t::success; + return (sz == 0) ? btree_status_t::space_not_avail : btree_status_t::success; } #ifndef NDEBUG @@ -85,7 +88,7 @@ class VariableNode : public BtreeNode { // validate if keys are in ascending order K prevKey; while (i < this->total_entries()) { - K key = get_nth_key< K >(i, false); + K key = BtreeNode::get_nth_key< K >(i, false); uint64_t kp = *(uint64_t*)key.serialize().bytes; if (i > 0 && prevKey.compare(key) > 0) { DEBUG_ASSERT(false, "Found non sorted entry: {} -> {}", kp, to_string()); @@ -105,7 +108,7 @@ class VariableNode : public BtreeNode { this->set_edge_value(val); this->inc_gen(); } else { - K key = get_nth_key< K >(ind, true); + K key = BtreeNode::get_nth_key< K >(ind, true); update(ind, key, val); } } @@ -167,13 +170,13 @@ class VariableNode : public BtreeNode { this->set_edge_value(last_1_val); for (uint32_t i = ind_s; i < total_entries; i++) { - get_var_node_header()->m_available_space += get_nth_key_len(i) + get_nth_value_len(i) + recSize; + get_var_node_header()->m_available_space += get_nth_key_size(i) + get_nth_value_size(i) + recSize; } this->sub_entries(total_entries - ind_s + 1); } else { // claim available memory for (uint32_t i = ind_s; i <= ind_e; i++) { - get_var_node_header()->m_available_space += get_nth_key_len(i) + get_nth_value_len(i) + recSize; + get_var_node_header()->m_available_space += get_nth_key_size(i) + get_nth_value_size(i) + recSize; } uint8_t* rec_ptr = get_nth_record_mutable(ind_s); memmove(rec_ptr, rec_ptr + recSize * no_of_elem, (this->total_entries() - ind_e - 1) * recSize); @@ -223,11 +226,11 @@ class VariableNode : public BtreeNode { // Get the ith key and value blob and then remove the entry from here and insert to the other node sisl::blob kb; kb.bytes = (uint8_t*)get_nth_obj(ind); - kb.size = get_nth_key_len(ind); + kb.size = get_nth_key_size(ind); sisl::blob vb; vb.bytes = kb.bytes + kb.size; - vb.size = get_nth_value_len(ind); + vb.size = get_nth_value_size(ind); auto sz = other.insert(0, kb, vb); if (!sz) { break; } @@ -264,11 +267,11 @@ class VariableNode : public BtreeNode { while (ind > 0) { sisl::blob kb; kb.bytes = (uint8_t*)get_nth_obj(ind); - kb.size = get_nth_key_len(ind); + kb.size = get_nth_key_size(ind); sisl::blob vb; vb.bytes = kb.bytes + kb.size; - vb.size = get_nth_value_len(ind); + vb.size = get_nth_value_size(ind); auto sz = other.insert(0, kb, vb); // Keep on inserting on the first index, thus moving everything to right if (!sz) break; @@ -303,7 +306,7 @@ class VariableNode : public BtreeNode { uint32_t cum_size{0}; while (idx < this->total_entries()) { - uint32_t const rec_size = this->get_record_size() + get_nth_key_len(idx) + get_nth_value_len(idx); + uint32_t const rec_size = this->get_record_size() + get_nth_key_size(idx) + get_nth_value_size(idx); cum_size += rec_size; if (cum_size > size) { break; } ++idx; @@ -319,8 +322,8 @@ class 
VariableNode : public BtreeNode {
         auto idx = start_idx;
         uint32_t n = 0;
         while (idx < other.total_entries()) {
-            sisl::blob kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_len(idx)};
-            sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_len(idx)};
+            sisl::blob kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)};
+            sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_size(idx)};
 
             // We reached threshold of how much we could move
             if ((kb.size + vb.size + other.get_record_size()) > copy_size) { break; }
@@ -348,8 +351,8 @@ class VariableNode : public BtreeNode {
         auto idx = start_idx;
         uint32_t n = 0;
         while (n < nentries) {
-            sisl::blob kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_len(idx)};
-            sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_len(idx)};
+            sisl::blob kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)};
+            sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_size(idx)};
 
             auto sz = insert(this->total_entries(), kb, vb);
             if (sz == 0) { break; }
@@ -377,11 +380,11 @@ class VariableNode : public BtreeNode {
             // Get the ith key and value blob and then remove the entry from here and insert to the other node
             sisl::blob kb;
             kb.bytes = (uint8_t*)other.get_nth_obj(other_ind);
-            kb.size = other.get_nth_key_len(other_ind);
+            kb.size = other.get_nth_key_size(other_ind);
 
             sisl::blob vb;
             vb.bytes = kb.bytes + kb.size;
-            vb.size = other.get_nth_value_len(other_ind);
+            vb.size = other.get_nth_value_size(other_ind);
 
             auto sz = insert(this->total_entries(), kb, vb);
             if (!sz) { break; }
@@ -418,11 +421,11 @@ class VariableNode : public BtreeNode {
         while (ind < this->total_entries()) {
             sisl::blob kb;
             kb.bytes = (uint8_t*)other.get_nth_obj(ind);
-            kb.size = other.get_nth_key_len(ind);
+            kb.size = other.get_nth_key_size(ind);
 
             sisl::blob vb;
             vb.bytes = kb.bytes + kb.size;
-            vb.size = other.get_nth_value_len(ind);
+            vb.size = other.get_nth_value_size(ind);
 
             if ((kb.size + vb.size + other.get_record_size()) > size_to_move) {
                 // We reached threshold of how much we could move
@@ -452,31 +455,30 @@ class VariableNode : public BtreeNode {
         return moved_size;
     } */
 
-    void append(uint32_t ind, const BtreeKey& key, const BtreeValue& val) override {
-        RELEASE_ASSERT(false, "Append operation is not supported on var node");
-    }
-
-    uint32_t available_size(const BtreeConfig& cfg) const override {
-        return get_var_node_header_const()->m_available_space;
-    }
-
-    uint32_t get_nth_obj_size(uint32_t ind) const override { return get_nth_key_len(ind) + get_nth_value_len(ind); }
+    uint32_t available_size() const override { return get_var_node_header_const()->m_available_space; }
 
     void set_nth_key(uint32_t ind, const BtreeKey& key) {
         const auto kb = key.serialize();
         assert(ind < this->total_entries());
-        assert(kb.size == get_nth_key_len(ind));
+        assert(kb.size == get_nth_key_size(ind));
         memcpy(uintptr_cast(get_nth_obj(ind)), kb.bytes, kb.size);
     }
 
-    virtual uint16_t get_nth_key_len(uint32_t ind) const = 0;
-    virtual uint16_t get_nth_value_len(uint32_t ind) const = 0;
-    virtual void set_nth_key_len(uint8_t* rec_ptr, uint16_t key_len) = 0;
-    virtual void set_nth_value_len(uint8_t* rec_ptr, uint16_t value_len) = 0;
+    bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override {
+        auto needed_size = key_size + value_size;
+        if ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT)) {
+            needed_size += get_record_size();
+        }
+        return (available_size() >= needed_size);
+    }
+
+    virtual uint32_t get_record_size() const = 0;
+
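+    // Worked example of the accounting in has_room_for_put() above: if the record is 4 bytes
+    // (say two 16-bit length fields, as in var_obj_record), an INSERT of an 8-byte key with a
+    // 16-byte value needs 8 + 16 + 4 = 28 bytes of free space, while an UPDATE of an existing
+    // entry reuses its record and needs only 8 + 16 = 24 bytes. (Sizes here are illustrative.)
+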
virtual void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) = 0; + virtual void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) = 0; void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copy) const override { assert(ind < this->total_entries()); - sisl::blob b{const_cast< uint8_t* >(get_nth_obj(ind)), get_nth_key_len(ind)}; + sisl::blob b{const_cast< uint8_t* >(get_nth_obj(ind)), get_nth_key_size(ind)}; out_key.deserialize(b, copy); } @@ -486,17 +488,11 @@ class VariableNode : public BtreeNode { DEBUG_ASSERT_EQ(this->has_valid_edge(), true, "get_nth_value out-of-bound"); *(BtreeLinkInfo*)out_val = this->get_edge_value(); } else { - sisl::blob b{const_cast< uint8_t* >(get_nth_obj(ind)) + get_nth_key_len(ind), get_nth_value_len(ind)}; + sisl::blob b{const_cast< uint8_t* >(get_nth_obj(ind)) + get_nth_key_size(ind), get_nth_value_size(ind)}; out_val->deserialize(b, copy); } } - /*V get_nth_value(uint32_t ind, bool copy) const { - assert(ind < this->total_entries()); - sisl::blob b{const_cast< uint8_t* >(get_nth_obj(ind)) + get_nth_key_len(ind), get_nth_value_len(ind)}; - return V{b, copy}; - }*/ - std::string to_string(bool print_friendly = false) const override { auto str = fmt::format( "{}id={} level={} nEntries={} {} free_space={}{} ", @@ -512,7 +508,7 @@ class VariableNode : public BtreeNode { V val; get_nth_value(i, &val, false); fmt::format_to(std::back_inserter(str), "{}Entry{} [Key={} Val={}]", (print_friendly ? "\n\t" : " "), i + 1, - get_nth_key< K >(i, false).to_string(), val.to_string()); + BtreeNode::get_nth_key< K >(i, false).to_string(), val.to_string()); } return str; } @@ -534,15 +530,15 @@ class VariableNode : public BtreeNode { if (!this->is_leaf()) { fmt::format_to(std::back_inserter(str), " ["); for (uint32_t i{0}; i < this->total_entries(); ++i) { - uint32_t cur_key = get_nth_key< K >(i, false).key(); + uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key(); fmt::format_to(std::back_inserter(str), "{}{}", cur_key, i == this->total_entries() - 1 ? 
"" : ", "); } fmt::format_to(std::back_inserter(str), "]"); return str; } - uint32_t prev_key = get_nth_key< K >(0, false).key(); + uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key(); uint32_t cur_key = prev_key; - uint32_t last_key = get_nth_key< K >(this->total_entries() - 1, false).key(); + uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key(); if (last_key - prev_key == this->total_entries() - 1) { if (this->total_entries() == 1) fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key); @@ -553,7 +549,7 @@ class VariableNode : public BtreeNode { fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key); uint32_t start_interval_key = prev_key; for (uint32_t i{1}; i < this->total_entries(); ++i) { - cur_key = get_nth_key< K >(i, false).key(); + cur_key = BtreeNode::get_nth_key< K >(i, false).key(); if (cur_key != prev_key + 1) { if (start_interval_key == prev_key) { fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key); @@ -577,10 +573,6 @@ class VariableNode : public BtreeNode { uint8_t* get_node_context() override { return uintptr_cast(this) + sizeof(VariableNode< K, V >); } - int compare_nth_key(const BtreeKey& cmp_key, uint32_t ind) const { - return get_nth_key< K >(ind, false).compare(cmp_key); - } - /*int compare_nth_key_range(const BtreeKeyRange& range, uint32_t ind) const { return get_nth_key(ind, false).compare_range(range); }*/ @@ -680,7 +672,7 @@ class VariableNode : public BtreeNode { // loop records while (ind < no_of_entries) { uint16_t total_key_value_len = - get_nth_key_len(rec[ind].orig_record_index) + get_nth_value_len(rec[ind].orig_record_index); + get_nth_key_size(rec[ind].orig_record_index) + get_nth_value_size(rec[ind].orig_record_index); sparce_space = last_offset - (rec[ind].m_obj_offset + total_key_value_len); if (sparce_space > 0) { // do compaction @@ -752,16 +744,18 @@ class VarKeySizeNode : public VariableNode< K, V > { } virtual ~VarKeySizeNode() = default; - uint16_t get_nth_key_len(uint32_t ind) const override { + uint32_t get_nth_key_size(uint32_t ind) const override { return r_cast< const var_key_record* >(this->get_nth_record(ind))->m_key_len; } - uint16_t get_nth_value_len(uint32_t ind) const override { return V::get_fixed_size(); } - uint16_t get_record_size() const override { return sizeof(var_key_record); } + uint32_t get_nth_value_size(uint32_t ind) const override { return dummy_value< V >.serialized_size(); } + uint32_t get_record_size() const override { return sizeof(var_key_record); } - void set_nth_key_len(uint8_t* rec_ptr, uint16_t key_len) override { + void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) override { r_cast< var_key_record* >(rec_ptr)->m_key_len = key_len; } - void set_nth_value_len(uint8_t* rec_ptr, uint16_t value_len) override { assert(value_len == V::get_fixed_size()); } + void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) override { + assert(value_len == dummy_value< V >.serialized_size()); + } private: #pragma pack(1) @@ -782,14 +776,16 @@ class VarValueSizeNode : public VariableNode< K, V > { } virtual ~VarValueSizeNode() = default; - uint16_t get_nth_key_len(uint32_t ind) const override { return K::get_fixed_size(); } - uint16_t get_nth_value_len(uint32_t ind) const override { + uint32_t get_nth_key_size(uint32_t ind) const override { return dummy_key< K >.serialized_size(); } + uint32_t get_nth_value_size(uint32_t ind) const override { return r_cast< const var_value_record* 
>(this->get_nth_record(ind))->m_value_len; } - uint16_t get_record_size() const override { return sizeof(var_value_record); } + uint32_t get_record_size() const override { return sizeof(var_value_record); } - void set_nth_key_len(uint8_t* rec_ptr, uint16_t key_len) override { assert(key_len == K::get_fixed_size()); } - void set_nth_value_len(uint8_t* rec_ptr, uint16_t value_len) override { + void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) override { + assert(key_len == dummy_key< K >.serialized_size()); + } + void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) override { r_cast< var_value_record* >(rec_ptr)->m_value_len = value_len; } @@ -812,18 +808,18 @@ class VarObjSizeNode : public VariableNode< K, V > { } virtual ~VarObjSizeNode() = default; - uint16_t get_nth_key_len(uint32_t ind) const override { + uint32_t get_nth_key_size(uint32_t ind) const override { return r_cast< const var_obj_record* >(this->get_nth_record(ind))->m_key_len; } - uint16_t get_nth_value_len(uint32_t ind) const override { + uint32_t get_nth_value_size(uint32_t ind) const override { return r_cast< const var_obj_record* >(this->get_nth_record(ind))->m_value_len; } - uint16_t get_record_size() const override { return sizeof(var_obj_record); } + uint32_t get_record_size() const override { return sizeof(var_obj_record); } - void set_nth_key_len(uint8_t* rec_ptr, uint16_t key_len) override { + void set_nth_key_len(uint8_t* rec_ptr, uint32_t key_len) override { r_cast< var_obj_record* >(rec_ptr)->m_key_len = key_len; } - void set_nth_value_len(uint8_t* rec_ptr, uint16_t value_len) override { + void set_nth_value_len(uint8_t* rec_ptr, uint32_t value_len) override { r_cast< var_obj_record* >(rec_ptr)->m_value_len = value_len; } diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 7818490da..246557a62 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -34,9 +34,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { superblk< index_table_sb > m_sb; public: - IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg, - on_kv_read_t read_cb = nullptr, on_kv_update_t update_cb = nullptr, on_kv_remove_t remove_cb = nullptr) : - Btree< K, V >{cfg, std::move(read_cb), std::move(update_cb), std::move(remove_cb)}, m_sb{"index"} { + IndexTable(uuid_t uuid, uuid_t parent_uuid, uint32_t user_sb_size, const BtreeConfig& cfg) : + Btree< K, V >{cfg}, m_sb{"index"} { m_sb.create(sizeof(index_table_sb)); m_sb->uuid = uuid; m_sb->parent_uuid = parent_uuid; @@ -46,9 +45,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { if (status != btree_status_t::success) { throw std::runtime_error(fmt::format("Unable to create root node")); } } - IndexTable(const superblk< index_table_sb >& sb, const BtreeConfig& cfg, on_kv_read_t read_cb = nullptr, - on_kv_update_t update_cb = nullptr, on_kv_remove_t remove_cb = nullptr) : - Btree< K, V >{cfg, std::move(read_cb), std::move(update_cb), std::move(remove_cb)} { + IndexTable(const superblk< index_table_sb >& sb, const BtreeConfig& cfg) : Btree< K, V >{cfg} { m_sb = sb; Btree< K, V >::set_root_node_info(BtreeLinkInfo{m_sb->root_node, m_sb->link_version}); } @@ -160,7 +157,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { return BtreeNodePtr{n}; }); return btree_status_t::success; - } catch (std::exception& e) { return btree_status_t::read_failed; } + } catch (std::exception& e) { return 
btree_status_t::node_read_failed; }
     }
 
     btree_status_t refresh_node(const BtreeNodePtr& node, bool for_read_modify_write, void* context) const override {
diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp
new file mode 100644
index 000000000..3bc943fc0
--- /dev/null
+++ b/src/tests/btree_helpers/btree_test_helper.hpp
@@ -0,0 +1,383 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include "test_common/range_scheduler.hpp"
+#include "shadow_map.hpp"
+#include "btree_test_kvs.hpp"
+
+static constexpr uint32_t g_node_size{4096};
+
+template < typename TestType >
+struct BtreeTestHelper : public testing::Test {
+    using T = TestType;
+    using K = typename TestType::KeyType;
+    using V = typename TestType::ValueType;
+    using mutex = iomgr::FiberManagerLib::shared_mutex;
+    using op_func_t = std::function< void(void) >;
+
+    BtreeTestHelper() : testing::Test(), m_range_scheduler{SISL_OPTIONS["num_entries"].as< uint32_t >()} {}
+
+    void SetUp() override {
+        m_cfg.m_leaf_node_type = T::leaf_node_type;
+        m_cfg.m_int_node_type = T::interior_node_type;
+        m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >();
+        if (SISL_OPTIONS.count("disable_merge")) { m_cfg.m_merge_turned_on = false; }
+
+        if (m_is_multi_threaded) {
+            std::mutex mtx;
+            iomanager.run_on_wait(iomgr::reactor_regex::all_io, [this, &mtx]() {
+                auto fv = iomanager.sync_io_capable_fibers();
+                std::unique_lock lg(mtx);
+                m_fibers.insert(m_fibers.end(), fv.begin(), fv.end());
+            });
+        }
+
+        m_operations["put"] = std::bind(&BtreeTestHelper::put_random, this);
+        m_operations["remove"] = std::bind(&BtreeTestHelper::remove_random, this);
+        m_operations["range_put"] = std::bind(&BtreeTestHelper::range_put_random, this);
+        m_operations["range_remove"] = std::bind(&BtreeTestHelper::range_remove_existing_random, this);
+        m_operations["query"] = std::bind(&BtreeTestHelper::query_random, this);
+    }
+
+    void TearDown() override {}
+
+protected:
+    std::shared_ptr< typename T::BtreeType > m_bt;
+    ShadowMap< K, V > m_shadow_map;
+    BtreeConfig m_cfg{g_node_size};
+    RangeScheduler m_range_scheduler;
+    uint32_t m_max_range_input{1000};
+    bool m_is_multi_threaded{false};
+
+    std::map< std::string, op_func_t > m_operations;
+    std::vector< iomgr::io_fiber_t > m_fibers;
+    std::mutex m_test_done_mtx;
+    std::condition_variable m_test_done_cv;
+
+    std::random_device m_re;
+
+public:
+    void preload(uint32_t preload_size) {
+        const auto chunk_size = preload_size / m_fibers.size();
+        // The last fiber also takes the remainder, so that exactly preload_size keys get inserted.
+        const auto last_chunk_size = chunk_size + (preload_size % m_fibers.size());
+        auto test_count = m_fibers.size();
+
+        for (std::size_t i = 0; i < m_fibers.size(); ++i) {
+            const auto
start_range = i * chunk_size; + const auto end_range = start_range + ((i == m_fibers.size() - 1) ? last_chunk_size : chunk_size); + iomanager.run_on_forget(m_fibers[i], [this, start_range, end_range, &test_count]() { + for (uint32_t i = start_range; i < end_range; i++) { + put(i, btree_put_type::INSERT); + m_range_scheduler.put_key(i); + } + { + std::unique_lock lg(m_test_done_mtx); + if (--test_count == 0) { m_test_done_cv.notify_one(); } + } + }); + } + + { + std::unique_lock< std::mutex > lk(m_test_done_mtx); + m_test_done_cv.wait(lk, [&]() { return test_count == 0; }); + } + LOGINFO("Preload Done"); + } + + ////////////////////// All put operation variants /////////////////////////////// + void put(uint64_t k, btree_put_type put_type) { do_put(k, put_type, V::generate_rand()); } + + void put_random() { + auto [start_k, end_k] = m_range_scheduler.pick_random_non_existing_keys(1); + RELEASE_ASSERT_EQ(start_k, end_k, "Range scheduler pick_random_non_existing_keys issue"); + + do_put(start_k, btree_put_type::INSERT, V::generate_rand()); + } + + void range_put(uint32_t start_k, uint32_t end_k, V const& value, bool update) { + K start_key = K{start_k}; + K end_key = K{end_k}; + auto const nkeys = end_k - start_k + 1; + + auto preq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}, + update ? btree_put_type::UPDATE : btree_put_type::UPSERT, &value}; + preq.enable_route_tracing(); + ASSERT_EQ(m_bt->put(preq), btree_status_t::success) << "range_put failed for " << start_k << "-" << end_k; + + if (update) { + m_shadow_map.range_update(start_key, nkeys, value); + m_range_scheduler.remove_keys_from_working(start_k, end_k); + } else { + m_shadow_map.range_upsert(start_k, nkeys, value); + m_range_scheduler.put_keys(start_k, end_k); + } + } + + void range_put_random() { + bool is_update{true}; + if constexpr (std::is_same_v< V, TestIntervalValue >) { is_update = false; } + + static thread_local std::uniform_int_distribution< uint32_t > s_rand_range_generator{1, 50}; + + auto const [start_k, end_k] = is_update + ? 
m_range_scheduler.pick_random_existing_keys(s_rand_range_generator(m_re))
+                      : m_range_scheduler.pick_random_non_working_keys(s_rand_range_generator(m_re));
+
+        range_put(start_k, end_k, V::generate_rand(), is_update);
+    }
+
+    ////////////////////// All remove operation variants ///////////////////////////////
+    void remove_one(uint32_t k) {
+        auto existing_v = std::make_unique< V >();
+        auto pk = std::make_unique< K >(k);
+
+        auto rreq = BtreeSingleRemoveRequest{pk.get(), existing_v.get()};
+        bool removed = (m_bt->remove(rreq) == btree_status_t::success);
+
+        ASSERT_EQ(removed, m_shadow_map.exists(*pk))
+            << "Removal of key " << pk->key() << " status doesn't match with shadow";
+
+        if (removed) {
+            m_shadow_map.validate_data(rreq.key(), (const V&)rreq.value());
+            m_shadow_map.erase(rreq.key());
+        }
+        m_range_scheduler.remove_key(k);
+    }
+
+    void remove_random() {
+        auto const [start_k, end_k] = m_range_scheduler.pick_random_existing_keys(1);
+        RELEASE_ASSERT_EQ(start_k, end_k, "Range scheduler pick_random_existing_keys issue");
+
+        // remove_one() already releases the key back to the range scheduler.
+        remove_one(start_k);
+    }
+
+    void range_remove_existing(uint32_t start_k, uint32_t count) {
+        auto [start_key, end_key] = m_shadow_map.pick_existing_range(K{start_k}, count);
+        do_range_remove(start_k, end_key.key(), true /* removing_all_existing */);
+    }
+
+    void range_remove_existing_random() {
+        static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5};
+
+        auto const [start_k, end_k] = m_range_scheduler.pick_random_existing_keys(s_rand_range_generator(m_re));
+        do_range_remove(start_k, end_k, true /* only_existing */);
+    }
+
+    void range_remove_any(uint32_t start_k, uint32_t end_k) {
+        do_range_remove(start_k, end_k, false /* removing_all_existing */);
+    }
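+    // The put/remove/range primitives above are normally driven through multi_op_execute()
+    // (defined further below) with a weighted operation list; an illustrative mix, not a
+    // prescribed test:
+    //
+    //   std::vector< std::pair< std::string, int > > ops{
+    //       {"put", 40}, {"remove", 20}, {"range_put", 15}, {"range_remove", 15}, {"query", 10}};
+    //   helper.multi_op_execute(ops); // preloads, then runs ops weighted 40:20:15:15:10 on all fibers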
m_range_scheduler.remove_keys_from_working(start_k, std::min(end_k, m_max_range_input - 1)); + } + } + + void query_random() { + static thread_local std::uniform_int_distribution< uint32_t > s_rand_range_generator{1, 100}; + + auto const [start_k, end_k] = m_range_scheduler.pick_random_non_working_keys(s_rand_range_generator(m_re)); + do_query(start_k, end_k, 79); + } + + ////////////////////// All get operation variants /////////////////////////////// + void get_all() const { + for (const auto& [key, value] : m_shadow_map.map_const()) { + auto copy_key = std::make_unique< K >(); + *copy_key = key; + auto out_v = std::make_unique< V >(); + auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; + + const auto ret = m_bt->get(req); + ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; + ASSERT_EQ((const V&)req.value(), value) + << "Found value in btree doesn't return correct data for key=" << key; + } + } + + void get_specific(uint32_t k) const { + auto pk = std::make_unique< K >(k); + auto out_v = std::make_unique< V >(); + auto req = BtreeSingleGetRequest{pk.get(), out_v.get()}; + + const auto status = m_bt->get(req); + if (status == btree_status_t::success) { + m_shadow_map.validate_data(req.key(), (const V&)req.value()); + } else { + ASSERT_EQ(m_shadow_map.exists(req.key()), false) << "Key " << k << " is missing in the btree but present in the shadow map"; + } + } + + void get_any(uint32_t start_k, uint32_t end_k) const { + auto out_k = std::make_unique< K >(); + auto out_v = std::make_unique< V >(); + auto req = + BtreeGetAnyRequest< K >{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, out_k.get(), out_v.get()}; + const auto status = m_bt->get(req); + if (status == btree_status_t::success) { + ASSERT_EQ(m_shadow_map.exists_in_range(*(K*)req.m_outkey, start_k, end_k), true) + << "Get Any returned key=" << *(K*)req.m_outkey << " which is not in range " << start_k << "-" << end_k + << " according to shadow map"; + m_shadow_map.validate_data(*(K*)req.m_outkey, *(V*)req.m_outval); + } else { + ASSERT_EQ(m_shadow_map.exists_in_range(*(K*)req.m_outkey, start_k, end_k), false) + << "Get Any couldn't find key in the range " << start_k << "-" << end_k + << " but it is present in the shadow map"; + } + } + + void multi_op_execute(const std::vector< std::pair< std::string, int > >& op_list) { + preload(SISL_OPTIONS["preload_size"].as< uint32_t >()); + print_keys(); + run_in_parallel(op_list); + print_keys(); + } + + void print(const std::string& file = "") const { m_bt->print_tree(file); } + void print_keys() const { m_bt->print_tree_keys(); } + + void compare_files(const std::string& before, const std::string& after) { + std::ifstream b(before); + std::ifstream a(after); + std::ostringstream ss_before, ss_after; + ss_before << b.rdbuf(); + ss_after << a.rdbuf(); + std::string s1 = ss_before.str(); + std::string s2 = ss_after.str(); + ASSERT_EQ(s1, s2) << "Mismatch in btree structure"; + } + +private: + void do_put(uint64_t k, btree_put_type put_type, V const& value) { + auto existing_v = std::make_unique< V >(); + K key = K{k}; + auto sreq = BtreeSinglePutRequest{&key, &value, put_type, existing_v.get()}; + bool done = (m_bt->put(sreq) == btree_status_t::success); + + if (put_type == btree_put_type::INSERT) { + ASSERT_EQ(done, !m_shadow_map.exists(key)); + } else { + ASSERT_EQ(done, m_shadow_map.exists(key)); + } + + m_shadow_map.put_and_check(key, value, *existing_v, done); + m_range_scheduler.put_key(k); + } + + void do_range_remove(uint64_t start_k, uint64_t 
end_k, bool all_existing) { + K start_key = K{start_k}; + K end_key = K{end_k}; + + auto rreq = BtreeRangeRemoveRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}}; + auto const ret = m_bt->remove(rreq); + m_shadow_map.range_erase(start_key, end_key); + + if (all_existing) { + ASSERT_EQ((ret == btree_status_t::success), true) + << "not a successful remove op for range " << start_k << "-" << end_k; + } + + if (start_k < m_max_range_input) { + m_range_scheduler.remove_keys(start_k, std::min(end_k, uint64_cast(m_max_range_input - 1))); + } + } + + void run_in_parallel(const std::vector< std::pair< std::string, int > >& op_list) { + auto test_count = m_fibers.size(); + for (auto it = m_fibers.begin(); it < m_fibers.end(); ++it) { + iomanager.run_on_forget(*it, [this, &test_count, op_list]() { + std::random_device g_rd{}; + std::default_random_engine re{g_rd()}; + const auto num_iters_per_thread = + sisl::round_up(SISL_OPTIONS["num_iters"].as< uint32_t >() / m_fibers.size(), m_fibers.size()); + std::vector< uint32_t > weights; + std::transform(op_list.begin(), op_list.end(), std::back_inserter(weights), + [](const auto& pair) { return pair.second; }); + + // Construct a weighted distribution based on the input frequencies + std::discrete_distribution< uint32_t > s_rand_op_generator(weights.begin(), weights.end()); + + for (uint32_t i = 0; i < num_iters_per_thread; i++) { + uint32_t op_idx = s_rand_op_generator(re); + (this->m_operations[op_list[op_idx].first])(); + } + { + std::unique_lock lg(m_test_done_mtx); + if (--test_count == 0) { m_test_done_cv.notify_one(); } + } + }); + } + + { + std::unique_lock< std::mutex > lk(m_test_done_mtx); + m_test_done_cv.wait(lk, [&]() { return test_count == 0; }); + } + LOGINFO("ALL parallel jobs joined"); + } +}; \ No newline at end of file diff --git a/src/tests/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp similarity index 56% rename from src/tests/btree_test_kvs.hpp rename to src/tests/btree_helpers/btree_test_kvs.hpp index 21c283bc6..ffd45ed9c 100644 --- a/src/tests/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -60,19 +60,19 @@ using namespace homestore; class TestFixedKey : public BtreeKey { private: - uint32_t m_key{0}; + uint64_t m_key{0}; public: TestFixedKey() = default; - TestFixedKey(uint32_t k) : m_key{k} {} + TestFixedKey(uint64_t k) : m_key{k} {} TestFixedKey(const TestFixedKey& other) : TestFixedKey(other.serialize(), true) {} TestFixedKey(const BtreeKey& other) : TestFixedKey(other.serialize(), true) {} - TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint32_t* >(b.bytes))} {} - TestFixedKey& operator=(const TestFixedKey& other) { - clone(other); + TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.bytes))} {} + TestFixedKey& operator=(const TestFixedKey& other) = default; + TestFixedKey& operator=(BtreeKey const& other) { + m_key = s_cast< TestFixedKey const& >(other).m_key; return *this; - }; - virtual void clone(const BtreeKey& other) override { m_key = ((TestFixedKey&)other).m_key; } + } virtual ~TestFixedKey() = default; @@ -102,16 +102,16 @@ class TestFixedKey : public BtreeKey { }*/ sisl::blob serialize() const override { - return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_key)), uint32_cast(sizeof(uint32_t))}; + return sisl::blob{uintptr_cast(const_cast< uint64_t* >(&m_key)), uint32_cast(sizeof(uint64_t))}; } uint32_t serialized_size() const override { return get_fixed_size(); } static bool 
is_fixed_size() { return true; } - static uint32_t get_fixed_size() { return (sizeof(uint32_t)); } + static uint32_t get_fixed_size() { return (sizeof(uint64_t)); } std::string to_string() const { return fmt::format("{}", m_key); } - void deserialize(const sisl::blob& b, bool copy) override { m_key = *(r_cast< const uint32_t* >(b.bytes)); } + void deserialize(const sisl::blob& b, bool copy) override { m_key = *(r_cast< const uint64_t* >(b.bytes)); } - static uint32_t get_estimate_max_size() { return get_fixed_size(); } + static uint32_t get_max_size() { return get_fixed_size(); } friend std::ostream& operator<<(std::ostream& os, const TestFixedKey& k) { os << k.to_string(); return os; @@ -120,12 +120,12 @@ class TestFixedKey : public BtreeKey { bool operator<(const TestFixedKey& o) const { return (compare(o) < 0); } bool operator==(const TestFixedKey& other) const { return (compare(other) == 0); } - uint32_t key() const { return m_key; } - uint32_t start_key(const BtreeKeyRange< TestFixedKey >& range) const { + uint64_t key() const { return m_key; } + uint64_t start_key(const BtreeKeyRange< TestFixedKey >& range) const { const TestFixedKey& k = (const TestFixedKey&)(range.start_key()); return k.m_key; } - uint32_t end_key(const BtreeKeyRange< TestFixedKey >& range) const { + uint64_t end_key(const BtreeKeyRange< TestFixedKey >& range) const { const TestFixedKey& k = (const TestFixedKey&)(range.end_key()); return k.m_key; } @@ -133,10 +133,10 @@ class TestFixedKey : public BtreeKey { class TestVarLenKey : public BtreeKey { private: - uint32_t m_key{0}; + uint64_t m_key{0}; - static uint32_t rand_key_size() { - return (uint32_cast(std::abs(std::round(g_randkeysize_generator(g_re)))) % g_max_keysize) + 1; + static uint64_t rand_key_size() { + return (uint64_cast(std::abs(std::round(g_randkeysize_generator(g_re)))) % g_max_keysize) + 1; } static std::shared_ptr< std::string > idx_to_key(uint32_t idx) { @@ -154,7 +154,7 @@ class TestVarLenKey : public BtreeKey { public: TestVarLenKey() = default; - TestVarLenKey(uint32_t k) : BtreeKey(), m_key{k} {} + TestVarLenKey(uint64_t k) : BtreeKey(), m_key{k} {} TestVarLenKey(const BtreeKey& other) : TestVarLenKey(other.serialize(), true) {} TestVarLenKey(const TestVarLenKey& other) = default; TestVarLenKey(TestVarLenKey&& other) = default; @@ -164,8 +164,6 @@ class TestVarLenKey : public BtreeKey { TestVarLenKey(const sisl::blob& b, bool copy) : BtreeKey() { deserialize(b, copy); } virtual ~TestVarLenKey() = default; - virtual void clone(const BtreeKey& other) override { m_key = ((TestVarLenKey&)other).m_key; } - sisl::blob serialize() const override { const auto& data = idx_to_key(m_key); return sisl::blob{(uint8_t*)(data->c_str()), (uint32_t)data->size()}; @@ -187,7 +185,7 @@ class TestVarLenKey : public BtreeKey { } // Add 8 bytes for preamble. 
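The 8 bytes budget for the per-entry record a var-len node keeps in front of each serialized key, so the maximum size must cover payload plus bookkeeping. A hedged sketch of that sizing arithmetic; the record layout below is illustrative, not the actual varlen_node.hpp definition:

    struct demo_varlen_record {  // hypothetical per-entry preamble
        uint32_t m_obj_offset;   // where the serialized key begins inside the node
        uint32_t m_obj_size;     // length of the serialized key in bytes
    };
    static_assert(sizeof(demo_varlen_record) == 8, "matches the 8-byte preamble noted above");

    constexpr uint32_t demo_max_key_size(uint32_t max_serialized_key) {
        return max_serialized_key + sizeof(demo_varlen_record); // i.e. g_max_keysize + 8 below
    }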
- static uint32_t get_estimate_max_size() { return g_max_keysize + 8; } + static uint32_t get_max_size() { return g_max_keysize + 8; } int compare(const BtreeKey& o) const override { const TestVarLenKey& other = s_cast< const TestVarLenKey& >(o); @@ -224,17 +222,136 @@ class TestVarLenKey : public BtreeKey { bool operator<(const TestVarLenKey& o) const { return (compare(o) < 0); } bool operator==(const TestVarLenKey& other) const { return (compare(other) == 0); } - uint32_t key() const { return m_key; } - uint32_t start_key(const BtreeKeyRange< TestVarLenKey >& range) const { + uint64_t key() const { return m_key; } + uint64_t start_key(const BtreeKeyRange< TestVarLenKey >& range) const { const TestVarLenKey& k = (const TestVarLenKey&)(range.start_key()); return k.m_key; } - uint32_t end_key(const BtreeKeyRange< TestVarLenKey >& range) const { + uint64_t end_key(const BtreeKeyRange< TestVarLenKey >& range) const { const TestVarLenKey& k = (const TestVarLenKey&)(range.end_key()); return k.m_key; } }; +class TestIntervalKey : public BtreeIntervalKey { +private: +#pragma pack(1) + uint32_t m_base{0}; + uint32_t m_offset{0}; +#pragma pack() + +public: + TestIntervalKey() = default; + TestIntervalKey(uint64_t k) { + m_base = uint32_cast(k >> 32); + m_offset = uint32_cast(k & 0xFFFFFFFF); + } + TestIntervalKey(uint32_t b, uint32_t o) : m_base{b}, m_offset{o} {} + TestIntervalKey(const TestIntervalKey& other) = default; + TestIntervalKey(const BtreeKey& other) : TestIntervalKey(other.serialize(), true) {} + TestIntervalKey(const sisl::blob& b, bool copy) : BtreeIntervalKey() { + TestIntervalKey* other = r_cast< TestIntervalKey* >(b.bytes); + m_base = other->m_base; + m_offset = other->m_offset; + } + + TestIntervalKey& operator=(TestIntervalKey const& other) { + m_base = other.m_base; + m_offset = other.m_offset; + return *this; + }; + virtual ~TestIntervalKey() = default; + + /////////////////// Overriding methods of BtreeKey ///////////////// + int compare(BtreeKey const& o) const override { + TestIntervalKey const& other = s_cast< TestIntervalKey const& >(o); + if (m_base < other.m_base) { + return -1; + } else if (m_base > other.m_base) { + return 1; + } else if (m_offset < other.m_offset) { + return -1; + } else if (m_offset > other.m_offset) { + return 1; + } else { + return 0; + } + } + + sisl::blob serialize() const override { + return sisl::blob{uintptr_cast(const_cast< TestIntervalKey* >(this)), uint32_cast(sizeof(TestIntervalKey))}; + } + + uint32_t serialized_size() const override { return sizeof(TestIntervalKey); } + + void deserialize(sisl::blob const& b, bool copy) override { + assert(b.size == sizeof(TestIntervalKey)); + TestIntervalKey* other = r_cast< TestIntervalKey* >(b.bytes); + m_base = other->m_base; + m_offset = other->m_offset; + } + + std::string to_string() const override { return fmt::format("{}.{}", m_base, m_offset); } + + static uint32_t get_max_size() { return sizeof(TestIntervalKey); } + + static bool is_fixed_size() { return true; } + + static uint32_t get_fixed_size() { return sizeof(TestIntervalKey); } + + /////////////////// Overriding methods of BtreeIntervalKey ///////////////// + void shift(int n) override { m_offset += n; } + + int distance(BtreeKey const& f) const override { + TestIntervalKey const& from = s_cast< TestIntervalKey const& >(f); + DEBUG_ASSERT_EQ(m_base, from.m_base, "Invalid from key for distance"); + DEBUG_ASSERT_GE(m_offset, from.m_offset, "Invalid from key for distance"); + return m_offset - from.m_offset; + } + + bool is_interval_key() 
const override { return true; } + + sisl::blob serialize_prefix() const override { + return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_base)), uint32_cast(sizeof(uint32_t))}; + } + + sisl::blob serialize_suffix() const override { + return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_offset)), uint32_cast(sizeof(uint32_t))}; + } + + uint32_t serialized_prefix_size() const override { return uint32_cast(sizeof(uint32_t)); } + + uint32_t serialized_suffix_size() const override { return uint32_cast(sizeof(uint32_t)); }; + + void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) { + DEBUG_ASSERT_EQ(prefix.size, sizeof(uint32_t), "Invalid prefix size on deserialize"); + DEBUG_ASSERT_EQ(suffix.size, sizeof(uint32_t), "Invalid suffix size on deserialize"); + uint32_t* other_p = r_cast< uint32_t* >(prefix.bytes); + m_base = *other_p; + + uint32_t* other_s = r_cast< uint32_t* >(suffix.bytes); + m_offset = *other_s; + } + + /////////////////// Local methods for helping tests ////////////////// + bool operator<(const TestIntervalKey& o) const { return (compare(o) < 0); } + bool operator==(const TestIntervalKey& other) const { return (compare(other) == 0); } + + uint64_t key() const { return (uint64_cast(m_base) << 32) | m_offset; } + uint64_t start_key(const BtreeKeyRange< TestIntervalKey >& range) const { + const TestIntervalKey& k = (const TestIntervalKey&)(range.start_key()); + return k.key(); + } + uint64_t end_key(const BtreeKeyRange< TestIntervalKey >& range) const { + const TestIntervalKey& k = (const TestIntervalKey&)(range.end_key()); + return k.key(); + } + friend std::ostream& operator<<(std::ostream& os, const TestIntervalKey& k) { + os << k.to_string(); + return os; + } +}; + class TestFixedValue : public BtreeValue { private: public: @@ -327,3 +444,71 @@ class TestVarLenValue : public BtreeValue { private: std::string m_val; }; + +class TestIntervalValue : public BtreeIntervalValue { +private: +#pragma pack(1) + uint32_t m_base_val{0}; + uint16_t m_offset{0}; +#pragma pack() + +public: + TestIntervalValue(bnodeid_t val) { assert(0); } + TestIntervalValue(uint32_t val, uint16_t o) : BtreeIntervalValue(), m_base_val{val}, m_offset{o} {} + TestIntervalValue() = default; + TestIntervalValue(const TestIntervalValue& other) : + BtreeIntervalValue(), m_base_val{other.m_base_val}, m_offset{other.m_offset} {} + TestIntervalValue(const sisl::blob& b, bool copy) : BtreeIntervalValue() { this->deserialize(b, copy); } + virtual ~TestIntervalValue() = default; + + static TestIntervalValue generate_rand() { + return TestIntervalValue{g_randval_generator(g_re), s_cast< uint16_t >(0)}; + } + + ///////////////////////////// Overriding methods of BtreeValue ////////////////////////// + TestIntervalValue& operator=(const TestIntervalValue& other) = default; + sisl::blob serialize() const override { + sisl::blob b; + b.bytes = uintptr_cast(const_cast< TestIntervalValue* >(this)); + b.size = sizeof(TestIntervalValue); + return b; + } + + uint32_t serialized_size() const override { return sizeof(TestIntervalValue); } + static uint32_t get_fixed_size() { return sizeof(TestIntervalValue); } + void deserialize(const sisl::blob& b, bool) { + TestIntervalValue const* other = r_cast< TestIntervalValue const* >(b.bytes); + m_base_val = other->m_base_val; + m_offset = other->m_offset; + } + + std::string to_string() const override { return fmt::format("{}.{}", m_base_val, m_offset); } + + friend std::ostream& operator<<(std::ostream& os, const TestIntervalValue& v) { + os << 
v.to_string(); + return os; + } + + ///////////////////////////// Overriding methods of BtreeIntervalValue ////////////////////////// + void shift(int n) override { m_offset += n; } + + sisl::blob serialize_prefix() const override { + return sisl::blob{uintptr_cast(const_cast< uint32_t* >(&m_base_val)), uint32_cast(sizeof(uint32_t))}; + } + sisl::blob serialize_suffix() const override { + return sisl::blob{uintptr_cast(const_cast< uint16_t* >(&m_offset)), uint32_cast(sizeof(uint16_t))}; + } + uint32_t serialized_prefix_size() const override { return uint32_cast(sizeof(uint32_t)); } + uint32_t serialized_suffix_size() const override { return uint32_cast(sizeof(uint16_t)); } + + void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) override { + DEBUG_ASSERT_EQ(prefix.size, sizeof(uint32_t), "Invalid prefix size on deserialize"); + DEBUG_ASSERT_EQ(suffix.size, sizeof(uint16_t), "Invalid suffix size on deserialize"); + m_base_val = *(r_cast< uint32_t* >(prefix.bytes)); + m_offset = *(r_cast< uint16_t* >(suffix.bytes)); + } + + bool operator==(TestIntervalValue const& other) const { + return ((m_base_val == other.m_base_val) && (m_offset == other.m_offset)); + } +}; diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp new file mode 100644 index 000000000..1e7418122 --- /dev/null +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -0,0 +1,96 @@ +#include +#include + +#include "btree_test_kvs.hpp" + +template < typename K, typename V > +class ShadowMap { +private: + std::map< K, V > m_map; + +public: + void put_and_check(const K& key, const V& val, const V& old_val, bool expected_success) { + auto const [it, happened] = m_map.insert(std::make_pair(key, val)); + ASSERT_EQ(happened, expected_success) << "Testcase issue, expected inserted slots to be in shadow map"; + if (!happened) { + ASSERT_EQ(old_val, it->second) << "Put: Existing value doesn't return correct data for key: " << it->first; + } + } + + void range_upsert(uint64_t start_k, uint32_t count, const V& val) { + for (uint32_t i{0}; i < count; ++i) { + K key{start_k + i}; + V range_value{val}; + if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i); } + m_map.insert_or_assign(key, range_value); + } + } + + void range_update(const K& start_key, uint32_t count, const V& new_val) { + auto const start_it = m_map.lower_bound(start_key); + auto it = start_it; + uint32_t c = 0; + while ((it != m_map.end()) && (++c <= count)) { + it->second = new_val; + ++it; + } + } + + std::pair< K, K > pick_existing_range(const K& start_key, uint32_t max_count) const { + auto const start_it = m_map.lower_bound(start_key); + auto it = start_it; + uint32_t count = 0; + while ((it != m_map.cend()) && (++count < max_count)) { + ++it; + } + return std::pair(start_it->first, it->first); + } + + bool exists(const K& key) const { return m_map.find(key) != m_map.end(); } + + bool exists_in_range(const K& key, uint64_t start_k, uint64_t end_k) const { + const auto itlower = m_map.lower_bound(K{start_k}); + const auto itupper = m_map.upper_bound(K{end_k}); + auto it = itlower; + while (it != itupper) { + if (it->first == key) { return true; } + ++it; + } + return false; + } + + uint64_t size() const { return m_map.size(); } + + uint32_t num_elems_in_range(uint64_t start_k, uint64_t end_k) const { + const auto itlower = m_map.lower_bound(K{start_k}); + const auto itupper = m_map.upper_bound(K{end_k}); + return std::distance(itlower, itupper); + } + + void validate_data(const K& key, const V& 
btree_val) const { + const auto r = m_map.find(key); + ASSERT_NE(r, m_map.end()) << "Key " << key.to_string() << " is not present in shadow map"; + ASSERT_EQ(btree_val, r->second) << "Found value in btree doesn't return correct data for key=" << r->first; + } + + void erase(const K& key) { m_map.erase(key); } + + void range_erase(const K& start_key, uint32_t count) { + auto it = m_map.lower_bound(start_key); + uint32_t i{0}; + while ((it != m_map.cend()) && (i++ < count)) { + it = m_map.erase(it); + } + } + + void range_erase(const K& start_key, const K& end_key) { + auto it = m_map.lower_bound(start_key); + auto const end_it = m_map.upper_bound(end_key); + while ((it != m_map.cend()) && (it != end_it)) { + it = m_map.erase(it); + } + } + + std::map< K, V >& map() { return m_map; } + const std::map< K, V >& map_const() const { return m_map; } +}; diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 1f06d8bdf..23109cd58 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -23,7 +23,8 @@ #include #include #include -#include "btree_test_kvs.hpp" +#include +#include "btree_helpers/btree_test_kvs.hpp" static constexpr uint32_t g_node_size{4096}; static constexpr uint32_t g_max_keys{6000}; @@ -56,6 +57,12 @@ struct VarObjSizeNodeTest { using ValueType = TestVarLenValue; }; +struct PrefixIntervalBtreeTest { + using NodeType = FixedPrefixNode< TestIntervalKey, TestIntervalValue >; + using KeyType = TestIntervalKey; + using ValueType = TestIntervalValue; +}; + template < typename TestType > struct NodeTest : public testing::Test { using T = TestType; @@ -85,9 +92,7 @@ struct NodeTest : public testing::Test { bool done = m_node1->put(key, value, put_type, &existing_v); bool expected_done{true}; - if (m_shadow_map.find(key) != m_shadow_map.end()) { - expected_done = (put_type != btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); - } + if (m_shadow_map.find(key) != m_shadow_map.end()) { expected_done = (put_type != btree_put_type::INSERT); } ASSERT_EQ(done, expected_done) << "Expected put of key " << k << " of put_type " << enum_name(put_type) << " to be " << expected_done; if (expected_done) { @@ -100,6 +105,39 @@ struct NodeTest : public testing::Test { } } + void put_range(uint32_t k, uint32_t count) { + btree_put_type put_type; + if constexpr (!std::is_same_v< V, TestIntervalValue >) { + // For non-interval values, range put supports only update, so seed the keys first + for (uint32_t i{0}; i < count; ++i) { + this->put(k + i, btree_put_type::UPSERT); + } + put_type = btree_put_type::UPDATE; + } else { + put_type = btree_put_type::UPSERT; + } + + K start_key{k}; + K end_key{k + count - 1}; + V value{V::generate_rand()}; + auto status = m_node1->multi_put(BtreeKeyRange{start_key, true, end_key, true}, start_key, value, put_type, + nullptr /* last_failed_key */); + ASSERT_EQ(status, btree_status_t::success) << "Expected range put of key " << k << " to " << k + count - 1 + << " of put_type " << enum_name(put_type) << " to be successful"; + + for (uint32_t i{0}; i < count; ++i) { + K key{k + i}; + V range_value{value}; + if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i); } + + if (m_shadow_map.find(key) != m_shadow_map.end()) { + if (put_type != btree_put_type::INSERT) { m_shadow_map.insert_or_assign(key, range_value); } + } else { + m_shadow_map.insert(std::make_pair(key, range_value)); + } + } + } + void update(uint32_t k, bool validate_update = true) { K key{k}; V value{V::generate_rand()}; @@ -140,6 +178,7 @@ 
struct NodeTest : public testing::Test { if (validate_remove) { validate_specific(k); } } +#if 0 void remove_range(uint32_t start_idx, uint32_t end_idx) { ASSERT_LT(end_idx, m_node1->total_entries()); ASSERT_LT(start_idx, m_node1->total_entries()); @@ -168,15 +207,40 @@ struct NodeTest : public testing::Test { << "end index key= " << head_k << " key[" << i << "]= " << m_node1->template get_nth_key< K >(i, false); } } +#endif + + void remove_range(uint32_t start_idx, uint32_t end_idx) { + ASSERT_LT(end_idx, m_node1->total_entries()); + ASSERT_LT(start_idx, m_node1->total_entries()); + ASSERT_GE(start_idx, 0); + ASSERT_GE(end_idx, start_idx); + + auto num_entries = m_node1->total_entries(); + auto expected_nremoved = std::distance(m_shadow_map.lower_bound(start_idx), m_shadow_map.upper_bound(end_idx)); + + uint32_t nremoved = m_node1->multi_remove(BtreeKeyRange< K >{K{start_idx}, true, K{end_idx}, true}); + ASSERT_EQ(nremoved, expected_nremoved) << "multi_remove count doesn't match the expected count"; + auto new_num_entries = m_node1->total_entries(); + + ASSERT_EQ(new_num_entries, num_entries - nremoved) + << "Total deleted objects does not match! start_idx= " << start_idx << " end_idx= " << end_idx + << " expected delete: " << end_idx - start_idx + 1 << " original node entries: " << num_entries + << " current node entries: " << new_num_entries; + + // Keep the shadow map in sync by erasing the keys removed from the node. + for (uint32_t i = start_idx; i <= end_idx; ++i) { + m_shadow_map.erase(K{i}); + } + } void validate_get_all() const { uint32_t start_ind{0}; uint32_t end_ind{0}; std::vector< std::pair< K, V > > out_vector; - auto ret = m_node1->get_all(BtreeKeyRange< K >{K{0u}, true, K{g_max_keys}, false}, g_max_keys, start_ind, - end_ind, &out_vector); - ret += m_node2->get_all(BtreeKeyRange< K >{K{0u}, true, K{g_max_keys}, false}, g_max_keys, start_ind, end_ind, - &out_vector); + auto ret = m_node1->multi_get(BtreeKeyRange< K >{K{0u}, true, K{g_max_keys}, false}, g_max_keys, start_ind, + end_ind, &out_vector); + ret += m_node2->multi_get(BtreeKeyRange< K >{K{0u}, true, K{g_max_keys}, false}, g_max_keys, start_ind, end_ind, + &out_vector); ASSERT_EQ(ret, m_shadow_map.size()) << "Expected number of entries to be same with shadow_map size"; ASSERT_EQ(out_vector.size(), m_shadow_map.size()) @@ -243,7 +307,7 @@ struct NodeTest : public testing::Test { void put_list(const std::vector< uint32_t >& keys) { for (const auto& k : keys) { if (!this->has_room()) { break; } - put(k, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + put(k, btree_put_type::INSERT); } } @@ -252,7 +316,7 @@ struct NodeTest : public testing::Test { LOGDEBUG("Node2:\n {}", m_node2->to_string(true)); } - uint32_t remaining_space() const { return m_node1->available_size(m_cfg); } + uint32_t remaining_space() const { return m_node1->available_size(); } bool has_room() const { return remaining_space() > (g_max_keysize + g_max_valsize + 32); } private: @@ -263,12 +327,13 @@ struct NodeTest : public testing::Test { } }; -using NodeTypes = testing::Types< FixedLenNodeTest, VarKeySizeNodeTest, VarValueSizeNodeTest, VarObjSizeNodeTest >; +using NodeTypes = testing::Types< FixedLenNodeTest, VarKeySizeNodeTest, VarValueSizeNodeTest, VarObjSizeNodeTest, + PrefixIntervalBtreeTest >; TYPED_TEST_SUITE(NodeTest, NodeTypes); TYPED_TEST(NodeTest, SequentialInsert) { for (uint32_t i{0}; (i < 100 && this->has_room()); ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } this->print(); 
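The RangePutGet case added further below exercises the new interval semantics: put_range() writes a single base value across [k, k+count) and expects the entry at position i to carry that value shifted by i. A minimal sketch of the shift arithmetic using the test types (the concrete numbers are arbitrary):

    TestIntervalValue base{1000u, s_cast< uint16_t >(0)}; // base_val = 1000, offset = 0
    TestIntervalValue third{base};
    third.shift(2);                                       // the value expected at key k + 2
    assert(third == TestIntervalValue{1000u, s_cast< uint16_t >(2)});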
this->validate_get_all(); @@ -279,7 +344,7 @@ TYPED_TEST(NodeTest, SequentialInsert) { TYPED_TEST(NodeTest, ReverseInsert) { for (uint32_t i{100}; (i > 0 && this->has_room()); --i) { - this->put(i - 1, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i - 1, btree_put_type::INSERT); } this->print(); this->validate_get_all(); @@ -302,9 +367,17 @@ TYPED_TEST(NodeTest, Remove) { this->validate_get_any(g_max_keys / 2, g_max_keys - 1); } +TYPED_TEST(NodeTest, RangePutGet) { + for (uint32_t i = 0; i < 40; i += 5) { + this->put_range(i, 5); + } + + this->validate_get_all(); +} + TYPED_TEST(NodeTest, RemoveRangeIndex) { for (uint32_t i = 0; i < 20; i++) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } this->print(); this->remove_range(5, 10); // size = 14 EXPECT: 0 1 2 3 4 [5 6 7 8 9 10] 11 12 13 14 15 16 17 18 19 @@ -329,7 +402,7 @@ TYPED_TEST(NodeTest, Update) { TYPED_TEST(NodeTest, RandomInsertRemoveUpdate) { uint32_t num_inserted{0}; while (this->has_room()) { - this->put(g_randkey_generator(g_re), btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(g_randkey_generator(g_re), btree_put_type::INSERT); ++num_inserted; } LOGDEBUG("After random insertion of {} objects", num_inserted); diff --git a/src/tests/test_common/range_scheduler.hpp b/src/tests/test_common/range_scheduler.hpp index 57732aa51..5dc2e4d1b 100644 --- a/src/tests/test_common/range_scheduler.hpp +++ b/src/tests/test_common/range_scheduler.hpp @@ -20,19 +20,44 @@ #pragma once -#include -#include -#include -#include +#include #include + namespace homestore { -using namespace boost::icl; -typedef interval_set< uint32_t > set_t; -typedef set_t::interval_type ival; using mutex = iomgr::FiberManagerLib::shared_mutex; +static std::pair< uint64_t, uint64_t > get_next_contiguous_set_bits(const sisl::Bitset& bm, uint64_t search_start_bit, + uint64_t max_count) { + uint64_t first_set_bit{sisl::Bitset::npos}; + uint64_t set_count{0}; + uint64_t b; + while (((b = bm.get_next_set_bit(search_start_bit)) != sisl::Bitset::npos) && (set_count < max_count)) { + if (first_set_bit == sisl::Bitset::npos) { + first_set_bit = b; + } else if (b > search_start_bit) { + break; + } + ++set_count; + search_start_bit = b + 1; + } + + return std::pair(first_set_bit, set_count); +} + class RangeScheduler { +private: + sisl::Bitset m_existing_keys; + sisl::Bitset m_working_keys; + mutex m_set_lock; + std::uniform_int_distribution< uint32_t > m_rand_start_key_generator; + + std::random_device m_rd; + public: + RangeScheduler(uint32_t num_keys) : m_existing_keys{num_keys}, m_working_keys{num_keys} { + m_rand_start_key_generator = std::uniform_int_distribution< uint32_t >(0, num_keys - 1); + } + void remove_keys_from_working(uint32_t s, uint32_t e) { std::unique_lock< mutex > lk(m_set_lock); remove_from_working(s, e); @@ -62,165 +87,109 @@ class RangeScheduler { remove_from_working(start_key, end_key); } - int pick_random_non_existing_keys(uint32_t n_keys = 1, uint32_t max_range = 0) { + std::pair< uint32_t, uint32_t > pick_random_non_existing_keys(uint32_t max_keys) { + std::pair< uint32_t, uint32_t > ret; + do { + ret = try_pick_random_non_existing_keys(max_keys); + if (ret.first != UINT32_MAX) { break; } + } while (true); + + return ret; + } + + std::pair< uint32_t, uint32_t > pick_random_existing_keys(uint32_t max_keys) { + std::pair< uint32_t, uint32_t > ret; + do { + ret = try_pick_random_existing_keys(max_keys); + if (ret.first != UINT32_MAX) { break; } + } while (true); + + return ret; + } + + 
std::pair< uint32_t, uint32_t > pick_random_non_working_keys(uint32_t max_keys) { + std::pair< uint32_t, uint32_t > ret; + do { + ret = try_pick_random_non_working_keys(max_keys); + if (ret.first != UINT32_MAX) { break; } + } while (true); + + return ret; + } + +private: + std::pair< uint32_t, uint32_t > try_pick_random_non_existing_keys(uint32_t max_keys) { std::unique_lock< mutex > lk(m_set_lock); - uint32_t working_range = max_range <= 0 ? std::numeric_limits< uint32_t >::max() : max_range; - uint32_t num_retry = 0; - - auto num_intervals = static_cast< uint32_t >(m_existing_keys.iterative_size()); - std::uniform_int_distribution< uint32_t > s_rand_interval_generator{0, num_intervals - 1}; - uint32_t start_key = std::numeric_limits< uint32_t >::max(); - - while (num_retry < max_retries) { - // find a random interval - uint32_t next_lower = working_range; - uint32_t previous_upper = 0; - auto it = m_existing_keys.begin(); - // if the selected interval is the last ... check size between this one and the working_range, rand n keys - // in (previous_upper, working_range] = [previous_upper+1, working_range] choose the gap between this upper - // and the next begin. and check the size! rand nkeys in [previous_upper, next_lower] - if (num_intervals != 0) { - uint32_t cur_interval_idx = s_rand_interval_generator(m_re); - std::advance(it, cur_interval_idx); - previous_upper = last(*it) + 1; // to be inclusivelast - it++; - if (it != m_existing_keys.end()) { next_lower = first(*it) - 1; } - } - if ((next_lower + 1) < (n_keys + previous_upper)) { // check < or <= - num_retry++; - continue; - } - - // choose randomly n keys in [previous_upper, next_lower] - std::uniform_int_distribution< uint32_t > rand_key_generator{ - previous_upper, next_lower - n_keys + 1}; // n_keys or n_keys +- (1) - start_key = rand_key_generator(m_re); - auto found = (m_working_keys & ival::closed(start_key, start_key + n_keys - 1)); - if (found.empty()) { - auto validate = m_existing_keys & ival::closed(start_key, start_key + n_keys - 1); - assert(validate.empty()); - break; - } - num_retry++; - continue; + if ((m_existing_keys.size() - m_existing_keys.get_set_count()) == 0) { + throw std::out_of_range("All keys are being worked on right now"); + } + + uint32_t const search_start = m_rand_start_key_generator(m_rd); + auto bb = m_existing_keys.get_next_contiguous_n_reset_bits(search_start, max_keys); + if (bb.nbits && m_working_keys.is_bits_reset(bb.start_bit, bb.nbits)) { + uint32_t const start = uint32_cast(bb.start_bit); + uint32_t const end = uint32_cast(bb.start_bit + bb.nbits - 1); + add_to_working(start, end); + return std::pair(start, end); + } else { + return std::pair(UINT32_MAX, UINT32_MAX); } - if (num_retry == max_retries) { return -1; } - // add from working keys and return the start_key; - this->add_to_working(start_key, start_key + n_keys - 1); - assert(start_key + n_keys - 1 <= working_range); - return static_cast< int >(start_key); } - int pick_random_existing_keys(uint32_t n_keys = 1, uint32_t max_range = 0) { + std::pair< uint32_t, uint32_t > try_pick_random_existing_keys(uint32_t max_keys) { std::unique_lock< mutex > lk(m_set_lock); - uint32_t working_range = max_range <= 0 ? 
std::numeric_limits< uint32_t >::max() : max_range; - uint32_t num_retry = 0; - - auto num_intervals = static_cast< uint32_t >(m_existing_keys.iterative_size()); - // empty keys - if (num_intervals == 0) { return -1; } - std::uniform_int_distribution< uint32_t > s_rand_interval_generator{0, num_intervals - 1}; - uint32_t start_key = std::numeric_limits< uint32_t >::max(); - - while (num_retry < max_retries) { - // find a random interval - auto it = m_existing_keys.begin(); - uint32_t cur_interval_idx = s_rand_interval_generator(m_re); - std::advance(it, cur_interval_idx); - uint32_t upper = last(*it); - uint32_t lower = first(*it); - if ((upper + 1) < (n_keys + lower)) { - num_retry++; - continue; - } - // choose randomly n keys in [lower, upper] - std::uniform_int_distribution< uint32_t > rand_key_generator{lower, upper - n_keys + 1}; - start_key = rand_key_generator(m_re); - auto found = (m_working_keys & ival::closed(start_key, start_key + n_keys - 1)); - if (found.empty()) { - auto validate = m_existing_keys & ival::closed(start_key, start_key + n_keys - 1); - assert(!validate.empty()); - break; - } - num_retry++; - continue; + if (m_existing_keys.get_set_count() == 0) { + DEBUG_ASSERT(false, "Couldn't find one existing keys"); + throw std::out_of_range("Couldn't find one existing keys"); + } + + uint32_t const search_start = m_rand_start_key_generator(m_rd); + auto [s, count] = get_next_contiguous_set_bits(m_existing_keys, search_start, max_keys); + + if (count && m_working_keys.is_bits_reset(s, count)) { + uint32_t const start = uint32_cast(s); + uint32_t const end = uint32_cast(s + count - 1); + add_to_working(start, end); + return std::pair(start, end); + } else { + return std::pair(UINT32_MAX, UINT32_MAX); } - if (num_retry == max_retries) { return -1; } - // add from working keys and return the start_key; - this->add_to_working(start_key, start_key + n_keys - 1); - assert(start_key + n_keys - 1 <= working_range); - return static_cast< int >(start_key); } - int pick_random_non_working_keys(uint32_t n_keys = 1, uint32_t max_range = 0) { + std::pair< uint32_t, uint32_t > try_pick_random_non_working_keys(uint32_t max_keys) { std::unique_lock< mutex > lk(m_set_lock); - uint32_t working_range = max_range <= 0 ? 
std::numeric_limits< uint32_t >::max() : max_range; - uint32_t num_retry = 0; - - auto num_intervals = static_cast< uint32_t >(m_working_keys.iterative_size()); - // empty keys - if (num_intervals == 0) { return -1; } - std::uniform_int_distribution< uint32_t > s_rand_interval_generator{0, num_intervals - 1}; - uint32_t start_key = std::numeric_limits< uint32_t >::max(); - - while (num_retry < max_retries) { - // find a random interval - uint32_t next_lower = working_range; - uint32_t previous_upper = 0; - auto it = m_working_keys.begin(); - if (num_intervals != 0) { - uint32_t cur_interval_idx = s_rand_interval_generator(m_re); - std::advance(it, cur_interval_idx); - previous_upper = last(*it) + 1; // to be inclusivelast - it++; - if (it != m_working_keys.end()) { next_lower = first(*it) - 1; } - } - if ((next_lower + 1) < (n_keys + previous_upper)) { // check < or <= - num_retry++; - continue; - } - - // choose randomly n keys in [previous_upper, next_lower] - std::uniform_int_distribution< uint32_t > rand_key_generator{ - previous_upper, next_lower - n_keys + 1}; // n_keys or n_keys +- (1) - start_key = rand_key_generator(m_re); - break; + + uint32_t const search_start = m_rand_start_key_generator(m_rd); + auto bb = m_working_keys.get_next_contiguous_n_reset_bits(search_start, max_keys); + + if (bb.nbits) { + uint32_t const start = uint32_cast(bb.start_bit); + uint32_t const end = uint32_cast(bb.start_bit + bb.nbits - 1); + add_to_working(start, end); + return std::pair(start, end); + } else { + return std::pair(UINT32_MAX, UINT32_MAX); } - if (num_retry == max_retries) { return -1; } - // add from working keys and return the start_key; - this->add_to_working(start_key, start_key + n_keys - 1); - assert(start_key + n_keys - 1 <= working_range); - return static_cast< int >(start_key); } -private: void add_to_existing(uint32_t s) { add_to_existing(s, s); } void add_to_working(uint32_t s) { add_to_working(s, s); } - void add_to_existing(uint32_t s, uint32_t e) { m_existing_keys += ival::closed(s, e); } + void add_to_existing(uint32_t s, uint32_t e) { m_existing_keys.set_bits(s, e - s + 1); } - void add_to_working(uint32_t s, uint32_t e) { m_working_keys += ival::closed(s, e); } + void add_to_working(uint32_t s, uint32_t e) { m_working_keys.set_bits(s, e - s + 1); } - void remove_from_existing(uint32_t s, uint32_t e) { m_existing_keys -= ival::closed(s, e); } + void remove_from_existing(uint32_t s, uint32_t e) { m_existing_keys.reset_bits(s, e - s + 1); } void remove_from_existing(uint32_t s) { remove_from_existing(s, s); } void remove_from_working(uint32_t s) { remove_from_working(s, s); } - void remove_from_working(uint32_t s, uint32_t e) { m_working_keys -= ival::closed(s, e); } + void remove_from_working(uint32_t s, uint32_t e) { m_working_keys.reset_bits(s, e - s + 1); } - bool is_working(uint32_t cur_key) { return m_working_keys.find(cur_key) != m_working_keys.end(); } - - bool is_existing(uint32_t cur_key) { return m_existing_keys.find(cur_key) != m_existing_keys.end(); } - -private: - set_t m_existing_keys; - set_t m_working_keys; - mutex m_set_lock; + bool is_working(uint32_t cur_key) const { return m_working_keys.is_bits_set(cur_key, 1); } - std::random_device m_rd{}; - std::default_random_engine m_re{m_rd()}; - const uint32_t max_retries = 5; + bool is_existing(uint32_t cur_key) const { return m_existing_keys.is_bits_set(cur_key, 1); } }; }; // namespace homestore diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 6805f2b0e..14062cd1f 100644 --- 
a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -24,7 +24,6 @@ #include #include #include -#include "btree_test_kvs.hpp" #include #include #include @@ -32,6 +31,9 @@ #include "common/homestore_config.hpp" #include "common/resource_mgr.hpp" #include "test_common/homestore_test_common.hpp" +#include "test_common/range_scheduler.hpp" +#include "btree_helpers/btree_test_kvs.hpp" +#include "btree_helpers/btree_test_helper.hpp" using namespace homestore; @@ -82,8 +84,16 @@ struct VarObjSizeBtreeTest { static constexpr btree_node_type interior_node_type = btree_node_type::VAR_OBJECT; }; +struct PrefixIntervalBtreeTest { + using BtreeType = IndexTable< TestIntervalKey, TestIntervalValue >; + using KeyType = TestIntervalKey; + using ValueType = TestIntervalValue; + static constexpr btree_node_type leaf_node_type = btree_node_type::PREFIX; + static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; +}; + template < typename TestType > -struct BtreeTest : public testing::Test { +struct BtreeTest : public BtreeTestHelper< TestType > { using T = TestType; using K = typename TestType::KeyType; using V = typename TestType::ValueType; @@ -94,7 +104,7 @@ struct BtreeTest : public testing::Test { std::shared_ptr< IndexTableBase > on_index_table_found(const superblk< index_table_sb >& sb) override { LOGINFO("Index table recovered"); LOGINFO("Root bnode_id {} version {}", sb->root_node, sb->link_version); - m_test->m_bt = std::make_shared< typename T::BtreeType >(sb, *m_test->m_bt_cfg); + m_test->m_bt = std::make_shared< typename T::BtreeType >(sb, m_test->m_cfg); return m_test->m_bt; } @@ -102,10 +112,6 @@ struct BtreeTest : public testing::Test { BtreeTest* m_test; }; - std::shared_ptr< typename T::BtreeType > m_bt; - std::map< K, V > m_shadow_map; - std::unique_ptr< BtreeConfig > m_bt_cfg; - void SetUp() override { test_common::HSTestHelper::start_homestore( "test_index_btree", @@ -113,11 +119,7 @@ struct BtreeTest : public testing::Test { {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}); LOGINFO("Node size {} ", hs()->index_service().node_size()); - m_bt_cfg = std::make_unique< BtreeConfig >(hs()->index_service().node_size()); - m_bt_cfg->m_leaf_node_type = T::leaf_node_type; - m_bt_cfg->m_int_node_type = T::interior_node_type; - // TODO fix. SequentialRemove failing in case of VarObj test. - m_bt_cfg->m_merge_turned_on = false; + this->m_cfg = BtreeConfig(hs()->index_service().node_size()); auto uuid = boost::uuids::random_generator()(); auto parent_uuid = boost::uuids::random_generator()(); @@ -130,12 +132,14 @@ struct BtreeTest : public testing::Test { homestore::hs()->resource_mgr().reset_dirty_buf_qd(); // Create index table and attach to index service. 
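By this point this->m_cfg is expected to carry the traits' node types; the explicit assignments removed above are presumed to have moved into BtreeTestHelper::SetUp() (defined in btree_test_helper.hpp, not shown in this hunk). A condensed sketch of that consumption:

    template < typename TestType >
    void demo_apply_traits(BtreeConfig& cfg) { // illustrative helper, not part of the patch
        cfg.m_leaf_node_type = TestType::leaf_node_type;    // e.g. btree_node_type::PREFIX
        cfg.m_int_node_type = TestType::interior_node_type; // e.g. btree_node_type::FIXED
    }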
- m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, *m_bt_cfg); - hs()->index_service().add_index_table(m_bt); + BtreeTestHelper< TestType >::SetUp(); + this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + hs()->index_service().add_index_table(this->m_bt); LOGINFO("Added index table to index service"); } void TearDown() override { + BtreeTestHelper< TestType >::TearDown(); test_common::HSTestHelper::shutdown_homestore(); } @@ -146,202 +150,12 @@ struct BtreeTest : public testing::Test { nullptr, true /* restart */); } - void put(uint32_t k, btree_put_type put_type) { - auto existing_v = std::make_unique< V >(); - auto pk = std::make_unique< K >(k); - auto pv = std::make_unique< V >(V::generate_rand()); - auto sreq{BtreeSinglePutRequest{pk.get(), pv.get(), put_type, existing_v.get()}}; - sreq.enable_route_tracing(); - bool done = (m_bt->put(sreq) == btree_status_t::success); - - // auto& sreq = to_single_put_req(req); - bool expected_done{true}; - if (m_shadow_map.find(*sreq.m_k) != m_shadow_map.end()) { - expected_done = (put_type != btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); - } - ASSERT_EQ(done, expected_done) << "Expected put of key " << k << " of put_type " << enum_name(put_type) - << " to be " << expected_done; - if (expected_done) { - m_shadow_map.insert(std::make_pair((const K&)*sreq.m_k, (const V&)*sreq.m_v)); - } else { - const auto r = m_shadow_map.find(*sreq.m_k); - ASSERT_NE(r, m_shadow_map.end()) << "Testcase issue, expected inserted slots to be in shadow map"; - ASSERT_EQ((const V&)*sreq.m_existing_val, r->second) - << "Insert existing value doesn't return correct data for key " << r->first; - } - } - - void range_put(uint32_t max_count) { - const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - static std::uniform_int_distribution< uint32_t > s_randkey_start_generator{1, num_entries}; - auto val = std::make_unique< V >(V::generate_rand()); - - retry: - auto const start_it = m_shadow_map.lower_bound(K{s_randkey_start_generator(g_re)}); - auto end_it = start_it; - auto it = start_it; - uint32_t count = 0; - while ((it != m_shadow_map.end()) && (count++ < max_count)) { - it->second = *val; - end_it = it++; - } - if (count == 0) { goto retry; } - - auto mreq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_it->first, true, end_it->first, true}, - btree_put_type::REPLACE_ONLY_IF_EXISTS, val.get()}; - mreq.enable_route_tracing(); - ASSERT_EQ(m_bt->put(mreq), btree_status_t::success); - } - - void remove_one(uint32_t k) { - auto existing_v = std::make_unique< V >(); - auto pk = std::make_unique< K >(k); - - auto rreq = BtreeSingleRemoveRequest{pk.get(), existing_v.get()}; - rreq.enable_route_tracing(); - bool removed = (m_bt->remove(rreq) == btree_status_t::success); - - bool expected_removed = (m_shadow_map.find(rreq.key()) != m_shadow_map.end()); - ASSERT_EQ(removed, expected_removed) << "Expected remove of key " << k << " to be " << expected_removed; - - if (removed) { - validate_data(rreq.key(), (const V&)rreq.value()); - m_shadow_map.erase(rreq.key()); - } - } - - void query_all_validate() const { - query_validate(0u, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1, UINT32_MAX); - } - void query_all_paginate_validate(uint32_t batch_size) const { - query_validate(0u, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1, batch_size); - } - - void query_validate(uint32_t start_k, uint32_t end_k, uint32_t batch_size) const { - std::vector< std::pair< K, V > > out_vector; - uint32_t remaining = 
num_elems_in_range(start_k, end_k); - auto it = m_shadow_map.lower_bound(K{start_k}); - - BtreeQueryRequest< K > qreq{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, - BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, batch_size}; - qreq.enable_route_tracing(); - - do { - out_vector.clear(); - auto const ret = m_bt->query(qreq, out_vector); - auto const expected_count = std::min(remaining, batch_size); - - remaining -= expected_count; - if (remaining == 0) { - ASSERT_EQ(ret, btree_status_t::success) << "Expected success on query"; - } else { - ASSERT_EQ(ret, btree_status_t::has_more) << "Expected query to return has_more"; - } - ASSERT_EQ(out_vector.size(), expected_count) << "Received incorrect value on query pagination"; - - for (size_t idx{0}; idx < out_vector.size(); ++idx) { - ASSERT_EQ(out_vector[idx].second, it->second) - << "Range get doesn't return correct data for key=" << it->first << " idx=" << idx; - ++it; - } - } while (remaining > 0); -#if 0 - out_vector.clear(); - auto ret = m_bt->query(qreq, out_vector); - ASSERT_EQ(ret, btree_status_t::success) << "Expected success on query"; - ASSERT_EQ(out_vector.size(), 0) << "Received incorrect value on empty query pagination"; -#endif - } - - void get_all_validate() const { - for (const auto& [key, value] : m_shadow_map) { - auto copy_key = std::make_unique< K >(); - *copy_key = key; - auto out_v = std::make_unique< V >(); - auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; - req.enable_route_tracing(); - const auto ret = m_bt->get(req); - ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; - ASSERT_EQ((const V&)req.value(), value) - << "Found value in btree doesn't return correct data for key=" << key; - } - } - - void get_specific_validate(uint32_t k) const { - auto pk = std::make_unique< K >(k); - auto out_v = std::make_unique< V >(); - auto req = BtreeSingleGetRequest{pk.get(), out_v.get()}; - - const auto status = m_bt->get(req); - if (status == btree_status_t::success) { - validate_data(req.key(), (const V&)req.value()); - } else { - ASSERT_EQ((m_shadow_map.find(req.key()) == m_shadow_map.end()), true) - << "Node key " << k << " is missing in the btree"; - } - } - - void get_any_validate(uint32_t start_k, uint32_t end_k) const { - auto out_k = std::make_unique< K >(); - auto out_v = std::make_unique< V >(); - auto req = - BtreeGetAnyRequest< K >{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, out_k.get(), out_v.get()}; - const auto status = m_bt->get(req); - if (status == btree_status_t::success) { - ASSERT_EQ(found_in_range(*(K*)req.m_outkey, start_k, end_k), true) - << "Get Any returned key=" << *(K*)req.m_outkey << " which is not in range " << start_k << "-" << end_k - << "according to shadow map"; - validate_data(*(K*)req.m_outkey, *(V*)req.m_outval); - } else { - ASSERT_EQ(found_in_range(*(K*)req.m_outkey, start_k, end_k), false) - << "Get Any couldn't find key in the range " << start_k << "-" << end_k - << " but it present in shadow map"; - } - } - - void print(const std::string& file = "") const { m_bt->print_tree(file); } - void destroy_btree() { auto cpg = hs()->cp_mgr().cp_guard(); auto op_context = (void*)cpg.context(cp_consumer_t::INDEX_SVC); - const auto [ret, free_node_cnt] = m_bt->destroy_btree(op_context); + const auto [ret, free_node_cnt] = this->m_bt->destroy_btree(op_context); ASSERT_EQ(ret, btree_status_t::success) << "btree destroy failed"; - m_bt.reset(); - } - - void compare_files(const std::string& before, const 
std::string& after) { - std::ifstream b(before); - std::ifstream a(after); - std::ostringstream ss_before, ss_after; - ss_before << b.rdbuf(); - ss_after << a.rdbuf(); - std::string s1 = ss_before.str(); - std::string s2 = ss_after.str(); - ASSERT_EQ(s1, s2) << "Mismatch in btree structure"; - } - -private: - void validate_data(const K& key, const V& btree_val) const { - const auto r = m_shadow_map.find(key); - ASSERT_NE(r, m_shadow_map.end()) << "Node key is not present in shadow map"; - ASSERT_EQ(btree_val, r->second) << "Found value in btree doesn't return correct data for key=" << r->first; - } - - bool found_in_range(const K& key, uint32_t start_k, uint32_t end_k) const { - const auto itlower = m_shadow_map.lower_bound(K{start_k}); - const auto itupper = m_shadow_map.upper_bound(K{end_k}); - auto it = itlower; - while (it != itupper) { - if (it->first == key) { return true; } - ++it; - } - return false; - } - - uint32_t num_elems_in_range(uint32_t start_k, uint32_t end_k) const { - const auto itlower = m_shadow_map.lower_bound(K{start_k}); - const auto itupper = m_shadow_map.upper_bound(K{end_k}); - return std::distance(itlower, itupper); + this->m_bt.reset(); } }; @@ -361,36 +175,36 @@ TYPED_TEST(BtreeTest, SequentialInsert) { const auto entries_iter1 = num_entries / 2; LOGINFO("Step 1: Do Forward sequential insert for {} entries", entries_iter1); for (uint32_t i{0}; i < entries_iter1; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); // this->print(); } LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", entries_iter1); - this->query_validate(0, entries_iter1 - 1, 75); + this->do_query(0, entries_iter1 - 1, 75); // Reverse sequential insert const auto entries_iter2 = num_entries - entries_iter1; LOGINFO("Step 3: Do Reverse sequential insert of remaining {} entries", entries_iter2); for (uint32_t i{num_entries - 1}; i >= entries_iter1; --i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 4: Query {} entries and validate with pagination of 90 entries", entries_iter2); - this->query_validate(entries_iter1, num_entries - 1, 90); + this->do_query(entries_iter1, num_entries - 1, 90); // Do validate all of them LOGINFO("Step 5: Query all entries and validate with no pagination"); - this->query_all_validate(); + this->query_all(); LOGINFO("Step 6: Query all entries and validate with pagination of 80 entries"); - this->query_all_paginate_validate(80); + this->query_all_paginate(80); LOGINFO("Step 7: Get all entries 1-by-1 and validate them"); - this->get_all_validate(); - this->get_any_validate(num_entries - 3, num_entries + 1); + this->get_all(); + this->get_any(num_entries - 3, num_entries + 1); // Negative cases LOGINFO("Step 8: Do incorrect input and validate errors"); - this->query_validate(num_entries + 100, num_entries + 500, 5); - this->get_any_validate(num_entries + 1, num_entries + 2); + this->do_query(num_entries + 100, num_entries + 500, 5); + this->get_any(num_entries + 1, num_entries + 2); // this->print(); LOGINFO("SequentialInsert test end"); @@ -406,9 +220,9 @@ TYPED_TEST(BtreeTest, RandomInsert) { std::random_shuffle(vec.begin(), vec.end()); LOGINFO("Step 1: Do forward random insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(vec[i], btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(vec[i], btree_put_type::INSERT); } - this->get_all_validate(); + this->get_all(); } #if 0 @@ 
-418,10 +232,10 @@ TYPED_TEST(BtreeTest, SequentialRemove) { const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); LOGINFO("Step 1: Do Forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", num_entries); - this->query_validate(0, num_entries - 1, 75); + this->do_query(0, num_entries - 1, 75); const auto entries_iter1 = num_entries / 2; LOGINFO("Step 3: Do Forward sequential remove for {} entries", entries_iter1); @@ -429,8 +243,8 @@ TYPED_TEST(BtreeTest, SequentialRemove) { this->remove_one(i); } LOGINFO("Step 4: Query {} entries and validate with pagination of 75 entries", entries_iter1); - this->query_validate(0, entries_iter1 - 1, 75); - this->query_validate(entries_iter1, num_entries - 1, 75); + this->do_query(0, entries_iter1 - 1, 75); + this->do_query(entries_iter1, num_entries - 1, 75); const auto entries_iter2 = num_entries - entries_iter1; LOGINFO("Step 5: Do Reverse sequential remove of remaining {} entries", entries_iter2); @@ -439,9 +253,9 @@ TYPED_TEST(BtreeTest, SequentialRemove) { } LOGINFO("Step 6: Query the empty tree"); - this->query_validate(0, num_entries - 1, 75); - this->get_any_validate(0, 1); - this->get_specific_validate(0); + this->do_query(0, num_entries - 1, 75); + this->get_any(0, 1); + this->get_specific(0); LOGINFO("SequentialRemove test end"); } @@ -452,7 +266,7 @@ TYPED_TEST(BtreeTest, RandomRemove) { LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } std::vector< uint32_t > vec(num_entries); @@ -464,7 +278,7 @@ TYPED_TEST(BtreeTest, RandomRemove) { for (uint32_t i{0}; i < num_iters; ++i) { this->remove_one(vec[i]); } - this->get_all_validate(); + this->get_all(); } #endif @@ -474,17 +288,16 @@ TYPED_TEST(BtreeTest, RangeUpdate) { const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); LOGINFO("Step 1: Do Forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 2: Do Range Update of random intervals between [1-50] for 100 times with random key ranges"); - static std::uniform_int_distribution< uint32_t > s_rand_key_count_generator{1, 50}; for (uint32_t i{0}; i < 100; ++i) { - this->range_put(s_rand_key_count_generator(g_re)); + this->range_put_random(); } LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", num_entries); - this->query_validate(0, num_entries - 1, 75); + this->do_query(0, num_entries - 1, 75); LOGINFO("RangeUpdate test end"); } @@ -494,10 +307,10 @@ TYPED_TEST(BtreeTest, CpFlush) { const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); LOGINFO("Do Forward sequential insert for {} entries", num_entries / 2); for (uint32_t i = 0; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries / 2); - this->query_validate(0, num_entries / 2 - 1, 75); + this->do_query(0, num_entries / 2 - 1, 75); this->print(std::string("before.txt")); @@ -505,7 +318,7 @@ TYPED_TEST(BtreeTest, CpFlush) { 
test_common::HSTestHelper::trigger_cp(true /* wait */); LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries); - this->query_validate(0, num_entries - 1, 75); + this->do_query(0, num_entries - 1, 75); this->destroy_btree(); @@ -518,7 +331,7 @@ TYPED_TEST(BtreeTest, CpFlush) { this->print(std::string("after.txt")); LOGINFO("Query {} entries", num_entries); - this->query_validate(0, num_entries - 1, 1000); + this->do_query(0, num_entries - 1, 1000); this->compare_files("before.txt", "after.txt"); LOGINFO("CpFlush test end"); @@ -530,7 +343,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); LOGINFO("Do Forward sequential insert for {} entries", num_entries / 2); for (uint32_t i = 0; i < num_entries / 2; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); if (i % 500 == 0) { LOGINFO("Trigger checkpoint flush wait=false."); test_common::HSTestHelper::trigger_cp(false /* wait */); @@ -541,7 +354,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { test_common::HSTestHelper::trigger_cp(false /* wait */); for (uint32_t i = num_entries / 2; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Trigger checkpoint flush wait=false."); @@ -551,7 +364,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { test_common::HSTestHelper::trigger_cp(true /* wait */); LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries); - this->query_validate(0, num_entries - 1, 75); + this->do_query(0, num_entries - 1, 75); this->print(std::string("before.txt")); @@ -567,7 +380,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { this->compare_files("before.txt", "after.txt"); LOGINFO("Query {} entries and validate with pagination of 1000 entries", num_entries); - this->query_validate(0, num_entries - 1, 1000); + this->do_query(0, num_entries - 1, 1000); LOGINFO("MultipleCpFlush test end"); } @@ -579,7 +392,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { auto io_thread = std::thread([this, num_entries] { LOGINFO("Do Forward sequential insert for {} entries", num_entries); for (uint32_t i = 0; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } }); @@ -599,7 +412,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { test_common::HSTestHelper::trigger_cp(true /* wait */); LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries); - this->query_validate(0, num_entries - 1, 75); + this->do_query(0, num_entries - 1, 75); this->print(std::string("before.txt")); this->destroy_btree(); @@ -614,7 +427,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { this->compare_files("before.txt", "after.txt"); LOGINFO("Query {} entries and validate with pagination of 1000 entries", num_entries); - this->query_validate(0, num_entries - 1, 1000); + this->do_query(0, num_entries - 1, 1000); LOGINFO("ThreadedCpFlush test end"); } diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 979cdc498..ebf0a1801 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -1,7 +1,6 @@ /********************************************************************************* * Modifications Copyright 2017-2019 eBay Inc. * - * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at @@ -22,13 +21,15 @@ #include #include #include -#include "btree_test_kvs.hpp" + #include #include +#include #include #include "test_common/range_scheduler.hpp" +#include "btree_helpers/btree_test_kvs.hpp" +#include "btree_helpers/btree_test_helper.hpp" -static constexpr uint32_t g_node_size{4096}; using namespace homestore; SISL_LOGGING_INIT(btree, iomgr, io_wd, flip) @@ -81,262 +82,29 @@ struct VarObjSizeBtreeTest { static constexpr btree_node_type interior_node_type = btree_node_type::VAR_OBJECT; }; +struct PrefixIntervalBtreeTest { + using BtreeType = MemBtree< TestIntervalKey, TestIntervalValue >; + using KeyType = TestIntervalKey; + using ValueType = TestIntervalValue; + static constexpr btree_node_type leaf_node_type = btree_node_type::PREFIX; + static constexpr btree_node_type interior_node_type = btree_node_type::FIXED; +}; + template < typename TestType > -struct BtreeTest : public testing::Test { +struct BtreeTest : public BtreeTestHelper< TestType > { using T = TestType; using K = typename TestType::KeyType; using V = typename TestType::ValueType; - std::unique_ptr< typename T::BtreeType > m_bt; - std::map< K, V > m_shadow_map; - BtreeConfig m_cfg{g_node_size}; - void SetUp() override { - m_cfg.m_leaf_node_type = T::leaf_node_type; - m_cfg.m_int_node_type = T::interior_node_type; - if (SISL_OPTIONS.count("disable_merge")) m_cfg.m_merge_turned_on = false; - m_bt = std::make_unique< typename T::BtreeType >(m_cfg); - m_bt->init(nullptr); - } - - void put(uint32_t k, btree_put_type put_type) { - auto existing_v = std::make_unique< V >(); - auto pk = std::make_unique< K >(k); - auto pv = std::make_unique< V >(V::generate_rand()); - auto sreq{BtreeSinglePutRequest{pk.get(), pv.get(), put_type, existing_v.get()}}; - bool done = (m_bt->put(sreq) == btree_status_t::success); - - // auto& sreq = to_single_put_req(req); - bool expected_done{true}; - if (m_shadow_map.find(*sreq.m_k) != m_shadow_map.end()) { - expected_done = (put_type != btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); - } - ASSERT_EQ(done, expected_done) << "Expected put of key " << k << " of put_type " << enum_name(put_type) - << " to be " << expected_done; - if (expected_done) { - m_shadow_map.insert(std::make_pair((const K&)*sreq.m_k, (const V&)*sreq.m_v)); - } else { - const auto r = m_shadow_map.find(*sreq.m_k); - ASSERT_NE(r, m_shadow_map.end()) << "Testcase issue, expected inserted slots to be in shadow map"; - ASSERT_EQ((const V&)*sreq.m_existing_val, r->second) - << "Insert existing value doesn't return correct data for key " << r->first; - } - } - - void range_put(uint32_t start_entry, uint32_t end_entry, bool expected) { - auto val = std::make_unique< V >(V::generate_rand()); - auto mreq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_entry, true, end_entry, true}, - btree_put_type::REPLACE_ONLY_IF_EXISTS, val.get()}; - ASSERT_EQ(m_bt->put(mreq) == btree_status_t::success, expected); - } - - void range_put(uint32_t max_count) { - const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - static thread_local std::uniform_int_distribution< uint32_t > s_randkey_start_generator{1, num_entries}; - auto val = std::make_unique< V >(V::generate_rand()); - - retry: - auto const start_it = m_shadow_map.lower_bound(K{s_randkey_start_generator(g_re)}); - auto end_it = start_it; - auto it = start_it; - uint32_t count = 0; - while ((it != m_shadow_map.end()) && (count++ < max_count)) { - it->second = *val; - end_it = it++; - } - if (count == 0) { goto retry; } - - auto 
mreq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_it->first, true, end_it->first, true}, - btree_put_type::REPLACE_ONLY_IF_EXISTS, val.get()}; - ASSERT_EQ(m_bt->put(mreq), btree_status_t::success); - } - - void remove_one(uint32_t k) { - auto existing_v = std::make_unique< V >(); - auto pk = std::make_unique< K >(k); - - auto rreq = BtreeSingleRemoveRequest{pk.get(), existing_v.get()}; - bool removed = (m_bt->remove(rreq) == btree_status_t::success); - - bool expected_removed = (m_shadow_map.find(rreq.key()) != m_shadow_map.end()); - ASSERT_EQ(removed, expected_removed) << "Expected remove of key " << k << " to be " << expected_removed; - - if (removed) { - validate_data(rreq.key(), (const V&)rreq.value()); - m_shadow_map.erase(rreq.key()); - } - } - - void range_remove(uint32_t start_key, uint32_t end_key) { - - auto start_it = m_shadow_map.lower_bound(K{start_key}); - auto end_it = m_shadow_map.lower_bound(K{end_key}); - auto fount_it = m_shadow_map.find(K{end_key}); - bool expected = (start_it != m_shadow_map.end()) && (std::distance(start_it, end_it) >= 0); - if (start_it == end_it && fount_it == m_shadow_map.end()) { expected = false; } - auto range = BtreeKeyRange< K >{K{start_key}, true, K{end_key}, true}; - auto mreq = BtreeRangeRemoveRequest< K >{std::move(range)}; - - size_t original_ts = get_tree_size(); - size_t original_ms = m_shadow_map.size(); - - auto ret = m_bt->remove(mreq); - ASSERT_EQ(expected, ret == btree_status_t::success) - << " not a successful remove op for range " << range.to_string() - << "start_it!=m_shadow_map.end(): " << (start_it != m_shadow_map.end()) - << " and std::distance(start_it,end_it) >= 0 : " << (std::distance(start_it, end_it) >= 0); - - K out_key; - V out_value; - auto qret = get_num_elements_in_tree(start_key, end_key, out_key, out_value); - ASSERT_EQ(qret, btree_status_t::not_found) - << " At least one element found! [" << out_key << "] = " << out_value; - - if (expected) { m_shadow_map.erase(start_it, fount_it != m_shadow_map.end() ? 
++end_it : end_it); } - size_t ms = m_shadow_map.size(); - size_t ts = get_tree_size(); - ASSERT_EQ(original_ms - ms, original_ts - ts) << " number of removed from map is " << original_ms - ms - << " whereas number of existing keys is " << original_ts - ts; - - ASSERT_EQ(ts, ms) << " size of tree is " << ts << " vs number of existing keys are " << ms; - } - - void query_all_validate() const { - query_validate(0u, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1, UINT32_MAX); - } - - void query_all_paginate_validate(uint32_t batch_size) const { - query_validate(0u, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1, batch_size); - } - - void query_validate(uint32_t start_k, uint32_t end_k, uint32_t batch_size) const { - std::vector< std::pair< K, V > > out_vector; - uint32_t remaining = num_elems_in_range(start_k, end_k); - auto it = m_shadow_map.lower_bound(K{start_k}); - - BtreeQueryRequest< K > qreq{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, - BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, batch_size}; - while (remaining > 0) { - out_vector.clear(); - auto const ret = m_bt->query(qreq, out_vector); - auto const expected_count = std::min(remaining, batch_size); - - ASSERT_EQ(out_vector.size(), expected_count) << "Received incorrect value on query pagination"; - remaining -= expected_count; - - if (remaining == 0) { - ASSERT_EQ(ret, btree_status_t::success) << "Expected success on query"; - } else { - ASSERT_EQ(ret, btree_status_t::has_more) << "Expected query to return has_more"; - } - - for (size_t idx{0}; idx < out_vector.size(); ++idx) { - ASSERT_EQ(out_vector[idx].second, it->second) - << "Range get doesn't return correct data for key=" << it->first << " idx=" << idx; - ++it; - } - } - out_vector.clear(); - auto ret = m_bt->query(qreq, out_vector); - ASSERT_EQ(ret, btree_status_t::success) << "Expected success on query"; - ASSERT_EQ(out_vector.size(), 0) << "Received incorrect value on empty query pagination"; - } - - void get_all_validate() const { - for (const auto& [key, value] : m_shadow_map) { - auto copy_key = std::make_unique< K >(); - *copy_key = key; - auto out_v = std::make_unique< V >(); - auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; - - const auto ret = m_bt->get(req); - ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; - ASSERT_EQ((const V&)req.value(), value) - << "Found value in btree doesn't return correct data for key=" << key; - } - } - - void get_specific_validate(uint32_t k) const { - auto pk = std::make_unique< K >(k); - auto out_v = std::make_unique< V >(); - auto req = BtreeSingleGetRequest{pk.get(), out_v.get()}; - - const auto status = m_bt->get(req); - if (status == btree_status_t::success) { - validate_data(req.key(), (const V&)req.value()); - } else { - ASSERT_EQ((m_shadow_map.find(req.key()) == m_shadow_map.end()), true) - << "Node key " << k << " is missing in the btree"; - } - } - - void get_any_validate(uint32_t start_k, uint32_t end_k) const { - auto out_k = std::make_unique< K >(); - auto out_v = std::make_unique< V >(); - auto req = - BtreeGetAnyRequest< K >{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, out_k.get(), out_v.get()}; - const auto status = m_bt->get(req); - if (status == btree_status_t::success) { - ASSERT_EQ(found_in_range(*(K*)req.m_outkey, start_k, end_k), true) - << "Get Any returned key=" << *(K*)req.m_outkey << " which is not in range " << start_k << "-" << end_k - << "according to shadow map"; - validate_data(*(K*)req.m_outkey, 
*(V*)req.m_outval); - } else { - ASSERT_EQ(found_in_range(*(K*)req.m_outkey, start_k, end_k), false) - << "Get Any couldn't find key in the range " << start_k << "-" << end_k - << " but it present in shadow map"; - } - } - - void print() const { m_bt->print_tree(); } - - void print_keys() const { m_bt->print_tree_keys(); } - - size_t get_tree_size() { - BtreeQueryRequest< K > qreq{ - BtreeKeyRange< K >{K{0}, true, K{SISL_OPTIONS["num_entries"].as< uint32_t >()}, true}, - BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, UINT32_MAX}; - std::vector< std::pair< K, V > > out_vector; - auto const ret = m_bt->query(qreq, out_vector); - return out_vector.size(); - } - - btree_status_t get_num_elements_in_tree(uint32_t start_k, uint32_t end_k, K& out_key, V& out_value) const { - auto k = std::make_unique< K >(); - auto v = std::make_unique< V >(); - auto req = BtreeGetAnyRequest< K >{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, k.get(), v.get()}; - auto ret = m_bt->get(req); - out_key = *((K*)req.m_outkey); - out_value = *((V*)req.m_outval); - return ret; - } - -private: - void validate_data(const K& key, const V& btree_val) const { - const auto r = m_shadow_map.find(key); - ASSERT_NE(r, m_shadow_map.end()) << "Node key is not present in shadow map"; - ASSERT_EQ(btree_val, r->second) << "Found value in btree doesn't return correct data for key=" << r->first; - } - - bool found_in_range(const K& key, uint32_t start_k, uint32_t end_k) const { - const auto itlower = m_shadow_map.lower_bound(K{start_k}); - const auto itupper = m_shadow_map.upper_bound(K{end_k}); - auto it = itlower; - while (it != itupper) { - if (it->first == key) { return true; } - ++it; - } - return false; - } - - uint32_t num_elems_in_range(uint32_t start_k, uint32_t end_k) const { - const auto itlower = m_shadow_map.lower_bound(K{start_k}); - const auto itupper = m_shadow_map.upper_bound(K{end_k}); - return std::distance(itlower, itupper); + BtreeTestHelper< TestType >::SetUp(); + this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); + this->m_bt->init(nullptr); } }; -using BtreeTypes = testing::Types< FixedLenBtreeTest, VarKeySizeBtreeTest, VarValueSizeBtreeTest, VarObjSizeBtreeTest >; +using BtreeTypes = testing::Types< PrefixIntervalBtreeTest, FixedLenBtreeTest, VarKeySizeBtreeTest, + VarValueSizeBtreeTest, VarObjSizeBtreeTest >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); TYPED_TEST(BtreeTest, SequentialInsert) { @@ -345,35 +113,35 @@ TYPED_TEST(BtreeTest, SequentialInsert) { const auto entries_iter1 = num_entries / 2; LOGINFO("Step 1: Do forward sequential insert for {} entries", entries_iter1); for (uint32_t i{0}; i < entries_iter1; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", entries_iter1); - this->query_validate(0, entries_iter1 - 1, 75); + this->do_query(0, entries_iter1 - 1, 75); // Reverse sequential insert const auto entries_iter2 = num_entries - entries_iter1; LOGINFO("Step 3: Do reverse sequential insert of remaining {} entries", entries_iter2); for (uint32_t i{num_entries - 1}; i >= entries_iter1; --i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 4: Query {} entries and validate with pagination of 90 entries", entries_iter2); - this->query_validate(entries_iter1, num_entries - 1, 90); + this->do_query(entries_iter1, num_entries - 1, 90); // Do validate all of them LOGINFO("Step 
5: Query all entries and validate with no pagination"); - this->query_all_validate(); + this->query_all(); LOGINFO("Step 6: Query all entries and validate with pagination of 80 entries"); - this->query_all_paginate_validate(80); + this->query_all_paginate(80); LOGINFO("Step 7: Get all entries 1-by-1 and validate them"); - this->get_all_validate(); - this->get_any_validate(num_entries - 3, num_entries + 1); + this->get_all(); + this->get_any(num_entries - 3, num_entries + 1); // Negative cases LOGINFO("Step 8: Do incorrect input and validate errors"); - this->query_validate(num_entries + 100, num_entries + 500, 5); - this->get_any_validate(num_entries + 1, num_entries + 2); + this->do_query(num_entries + 100, num_entries + 500, 5); + this->get_any(num_entries + 1, num_entries + 2); } TYPED_TEST(BtreeTest, SequentialRemove) { @@ -381,10 +149,10 @@ TYPED_TEST(BtreeTest, SequentialRemove) { const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 2: Query {} entries and validate with pagination of 75 entries", num_entries); - this->query_validate(0, num_entries - 1, 75); + this->do_query(0, num_entries - 1, 75); const auto entries_iter1 = num_entries / 2; LOGINFO("Step 3: Do forward sequential remove for {} entries", entries_iter1); @@ -392,7 +160,7 @@ TYPED_TEST(BtreeTest, SequentialRemove) { this->remove_one(i); } LOGINFO("Step 4: Query {} entries and validate with pagination of 75 entries", entries_iter1); - this->query_validate(0, entries_iter1 - 1, 75); + this->do_query(0, entries_iter1 - 1, 75); const auto entries_iter2 = num_entries - entries_iter1; LOGINFO("Step 5: Do reverse sequential remove of remaining {} entries", entries_iter2); @@ -401,9 +169,9 @@ TYPED_TEST(BtreeTest, SequentialRemove) { } LOGINFO("Step 6: Query the empty tree"); - this->query_validate(0, num_entries, 75); - this->get_any_validate(0, 1); - this->get_specific_validate(0); + this->do_query(0, num_entries, 75); + this->get_any(0, 1); + this->get_specific(0); } TYPED_TEST(BtreeTest, RandomInsert) { @@ -416,9 +184,9 @@ TYPED_TEST(BtreeTest, RandomInsert) { std::random_shuffle(vec.begin(), vec.end()); LOGINFO("Step 1: Do forward random insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(vec[i], btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(vec[i], btree_put_type::INSERT); } - this->get_all_validate(); + this->get_all(); } TYPED_TEST(BtreeTest, RangeUpdate) { @@ -426,17 +194,16 @@ TYPED_TEST(BtreeTest, RangeUpdate) { const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 2: Do range update of random intervals between [1-50] for 100 times with random key ranges"); - static thread_local std::uniform_int_distribution< uint32_t > s_rand_key_count_generator{1, 50}; for (uint32_t i{0}; i < 100; ++i) { - this->range_put(s_rand_key_count_generator(g_re)); + this->range_put_random(); } LOGINFO("Step 3: Query {} entries and validate with pagination of 75 entries", num_entries); - this->query_validate(0, num_entries - 1, 75); + this->do_query(0, num_entries - 1, 75); } 
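// The RangeUpdate test above now delegates to range_put_random() from the new
// btree_helpers/btree_test_helper.hpp. Below is a minimal sketch of what that
// helper presumably does, modeled on the range_put(max_count) body this patch
// removes from test_mem_btree.cpp; the member names (m_bt, m_shadow_map), the
// [1, 50] interval width and the anchor-at-random-existing-key behavior are
// taken from that removed code, while the real helper (which wraps the shadow
// state in the new ShadowMap class) may differ in detail.
void range_put_random() {
    static std::uniform_int_distribution< uint32_t > s_rand_key_count_generator{1, 50};
    static thread_local std::uniform_int_distribution< uint32_t > s_randkey_start_generator{
        1, SISL_OPTIONS["num_entries"].as< uint32_t >()};
    auto const max_count = s_rand_key_count_generator(g_re);
    auto val = std::make_unique< V >(V::generate_rand());

    // Anchor at a random existing key and overwrite up to max_count consecutive
    // mapped keys, mirroring the update into the shadow map so a later do_query()
    // can validate the tree against it.
    auto const start_it = m_shadow_map.lower_bound(K{s_randkey_start_generator(g_re)});
    if (start_it == m_shadow_map.end()) { return; } // the removed code retried instead of returning
    auto end_it = start_it;
    uint32_t count = 0;
    for (auto it = start_it; (it != m_shadow_map.end()) && (count++ < max_count); ++it) {
        it->second = *val;
        end_it = it;
    }

    // REPLACE_ONLY_IF_EXISTS keeps this a pure range update: every key in the
    // inclusive range is already present, so the whole put must succeed.
    auto mreq = BtreeRangePutRequest< K >{BtreeKeyRange< K >{start_it->first, true, end_it->first, true},
                                          btree_put_type::REPLACE_ONLY_IF_EXISTS, val.get()};
    ASSERT_EQ(m_bt->put(mreq), btree_status_t::success);
}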
TYPED_TEST(BtreeTest, SimpleRemoveRange) { @@ -444,24 +211,24 @@ TYPED_TEST(BtreeTest, SimpleRemoveRange) { const auto num_entries = 20; LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } LOGINFO("Step 2: Do range remove for {} entries", num_entries); // this->print_keys(); // EXPECT size = 20 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 - this->range_remove(5, 10); + this->range_remove_any(5, 10); // this->print_keys(); // EXPECT size = 14 : 0 1 2 3 4 [5 6 7 8 9 10] 11 12 13 14 15 16 17 18 19 - this->range_remove(0, 2); + this->range_remove_any(0, 2); // this->print_keys(); // EXPECT size = 11 : [0 1 2] 3 4 11 12 13 14 15 16 17 18 19 - this->range_remove(18, 19); + this->range_remove_any(18, 19); // this->print_keys(); // EXPECT size = 9 : 3 4 11 12 13 14 15 16 17 [18 19] - this->range_remove(17, 17); + this->range_remove_any(17, 17); // this->print_keys(); // EXPECT size = 8 : 3 4 11 12 13 14 15 16 [17] - this->range_remove(1, 5); + this->range_remove_any(1, 5); // this->print_keys(); // EXPECT size = 6 : [3 4] 11 12 13 14 15 16 - this->range_remove(1, 20); + this->range_remove_any(1, 20); // this->print_keys(); // EXPECT size = 0 : [11 12 13 14 15 16] - this->query_all_validate(); + this->query_all(); // this->query_validate(0, num_entries , 75); } @@ -472,7 +239,7 @@ TYPED_TEST(BtreeTest, RandomRemove) { LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } std::vector< uint32_t > vec(num_entries); @@ -485,390 +252,66 @@ TYPED_TEST(BtreeTest, RandomRemove) { this->remove_one(vec[i]); } - this->get_all_validate(); + this->get_all(); } TYPED_TEST(BtreeTest, RandomRemoveRange) { - // Forward sequential insert const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); const auto num_iters = SISL_OPTIONS["num_iters"].as< uint32_t >(); LOGINFO("Step 1: Do forward sequential insert for {} entries", num_entries); for (uint32_t i{0}; i < num_entries; ++i) { - this->put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS); + this->put(i, btree_put_type::INSERT); } // generate keys including out of bound static thread_local std::uniform_int_distribution< uint32_t > s_rand_key_generator{0, 2 * num_entries}; // this->print_keys(); LOGINFO("Step 2: Do range remove for maximum of {} iterations", num_iters); - for (uint32_t i{0}; i < num_iters && this->m_shadow_map.size() > 0; ++i) { + for (uint32_t i{0}; (i < num_iters) && this->m_shadow_map.size(); ++i) { uint32_t key1 = s_rand_key_generator(g_re); uint32_t key2 = s_rand_key_generator(g_re); - uint32_t start_key = std::min(key1, key2); - uint32_t end_key = std::max(key1, key2); // LOGINFO("Step 2 - {}: Do Range Remove of maximum [{},{}] keys ", i, start_key, end_key); - this->range_remove(std::min(key1, key2), std::max(key1, key2)); + this->range_remove_any(std::min(key1, key2), std::max(key1, key2)); // this->print_keys(); } - this->query_all_validate(); + this->query_all(); } template < typename TestType > -class BtreeConcurrentTest : public testing::Test { - using op_func = void (BtreeConcurrentTest::*)(void); +struct BtreeConcurrentTest : public BtreeTestHelper< TestType > { using T = TestType; using K = typename TestType::KeyType; using V = typename TestType::ValueType; - using mutex = 
iomgr::FiberManagerLib::shared_mutex; - -public: - void SetUp() override { - m_cfg.m_leaf_node_type = T::leaf_node_type; - m_cfg.m_int_node_type = T::interior_node_type; - m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >(); - if (SISL_OPTIONS.count("disable_merge")) m_cfg.m_merge_turned_on = false; - m_fibers.clear(); - m_bt = std::make_unique< typename T::BtreeType >(m_cfg); - m_bt->init(nullptr); - } - - void TearDown() override { iomanager.stop(); } - - void print() const { m_bt->print_tree(); } - void print_keys() const { m_bt->print_tree_keys(); } - void execute(const std::vector< std::pair< std::string, int > >& op_list) { + BtreeConcurrentTest() { this->m_is_multi_threaded = true; } + void SetUp() override { LOGINFO("Starting iomgr with {} threads", SISL_OPTIONS["n_threads"].as< uint32_t >()); ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = SISL_OPTIONS["n_threads"].as< uint32_t >(), false, .num_fibers = 1 + SISL_OPTIONS["n_fibers"].as< uint32_t >(), 0, 0}); - std::mutex mtx; - iomanager.run_on_wait(iomgr::reactor_regex::all_io, [this, &mtx]() { - auto fv = iomanager.sync_io_capable_fibers(); - std::unique_lock lg(mtx); - m_fibers.insert(m_fibers.end(), fv.begin(), fv.end()); - }); - - preload(SISL_OPTIONS["preload_size"].as< uint32_t >()); - print_keys(); - runInParallel(op_list); - print_keys(); - } - -private: - void random_range_remove() { - static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5}; - std::random_device g_re{}; - uint32_t nkeys = s_rand_range_generator(g_re); - int key = m_range_scheduler.pick_random_existing_keys(nkeys, m_max_range_input); - if (key == -1) { return; } - range_remove(key, key + nkeys - 1); - m_range_scheduler.remove_keys(static_cast< uint32_t >(key), static_cast< uint32_t >(key + nkeys - 1)); - } - - void range_remove(uint32_t start_key, uint32_t end_key) { - auto range = BtreeKeyRange< K >{K{start_key}, true, K{end_key}, true}; - auto out_vector = query(start_key, end_key); - auto rreq = BtreeRangeRemoveRequest< K >{std::move(range)}; - rreq.enable_route_tracing(); - bool removed = (m_bt->remove(rreq) == btree_status_t::success); - bool expected_removed = m_shadow_map.range_remove(start_key, end_key - start_key + 1, out_vector); - ASSERT_EQ(removed, expected_removed) << "not a successful remove op for range " << range.to_string(); - } - - void random_query(uint32_t start_key, uint32_t end_key) { - auto range = BtreeKeyRange< K >{K{start_key}, true, K{end_key}, true}; - auto out_map = query(start_key, end_key); - bool expected = m_shadow_map.range_get(start_key, end_key - start_key + 1, out_map); - ASSERT_TRUE(expected) << "not a successful query op for range " << range.to_string(); - } - std::unordered_map< uint32_t, std::string > query(uint32_t start_k, uint32_t end_k) const { - std::unordered_map< uint32_t, std::string > result; - for (auto cur = start_k; cur <= end_k; cur++) { - auto key = std::make_unique< K >(cur); - auto value = std::make_unique< V >(); - auto req = BtreeSingleGetRequest{key.get(), value.get()}; - const auto status = m_bt->get(req); - if (status == btree_status_t::success) { result[cur] = ((const V&)req.value()).to_string(); } - } - return result; - } - - void random_get() { - static thread_local std::uniform_int_distribution< uint32_t > s_rand_range_generator{1, 100}; - std::random_device g_re{}; - uint32_t nkeys = s_rand_range_generator(g_re); - int key = -1; - key = m_range_scheduler.pick_random_non_working_keys(nkeys, m_max_range_input); - if (key == -1) { return; } - 
random_query(key, key + nkeys - 1); - m_range_scheduler.remove_keys_from_working(static_cast< uint32_t >(key), - static_cast< uint32_t >(key + nkeys - 1)); - } - - void random_put() { - int key = m_range_scheduler.pick_random_non_existing_keys(1, m_max_range_input); - if (key == -1) { return; } - auto value = V::generate_rand(); - put(key, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS, value); - m_range_scheduler.put_key(static_cast< uint32_t >(key)); - } - - void put(uint32_t key, btree_put_type put_type, V value) { - auto existing_v = std::make_unique< V >(); - auto v = std::make_unique< V >(value); - auto k = std::make_unique< K >(key); - auto sreq{BtreeSinglePutRequest{k.get(), v.get(), put_type, existing_v.get()}}; - sreq.enable_route_tracing(); - bool done = (m_bt->put(sreq) == btree_status_t::success); - auto expected_done = m_shadow_map.put(key, value.to_string()); - ASSERT_EQ(done, expected_done) << "Expected put of key " << key << " of put_type " << enum_name(put_type) - << " to be " << expected_done; + BtreeTestHelper< TestType >::SetUp(); + this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); + this->m_bt->init(nullptr); } - void remove(uint32_t key) { - auto existing_v = std::make_unique< V >(); - auto k = std::make_unique< K >(key); - auto rreq = BtreeSingleRemoveRequest{k.get(), existing_v.get()}; - rreq.enable_route_tracing(); - - bool removed = (m_bt->remove(rreq) == btree_status_t::success); - auto expected_done = m_shadow_map.remove(key, ((const V&)rreq.value()).to_string()); - ASSERT_EQ(removed, expected_done) << "Expected remove of key " << key << " to be " << expected_done; + void TearDown() override { + BtreeTestHelper< TestType >::TearDown(); + iomanager.stop(); } - - void random_remove() { - int key = m_range_scheduler.pick_random_existing_keys(1, m_max_range_input); - if (key == -1) { return; } - remove(key); - m_range_scheduler.remove_key(static_cast< uint32_t >(key)); - } - - void random_range_put() { random_range_put_update(false); } - - void random_range_update() { random_range_put_update(true); } - - void random_range_put_update(bool replace = false) { - static thread_local std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5}; - std::random_device g_re{}; - uint32_t nkeys = s_rand_range_generator(g_re); - int key = -1; - - if (replace) { - key = m_range_scheduler.pick_random_existing_keys(nkeys, m_max_range_input); - } else { - key = m_range_scheduler.pick_random_non_existing_keys(nkeys, m_max_range_input); - } - - if (key == -1) { return; } - auto value = V::generate_rand(); - range_put(key, key + nkeys - 1, value, replace); - if (replace) { - m_range_scheduler.remove_keys_from_working(static_cast< uint32_t >(key), - static_cast< uint32_t >(key + nkeys - 1)); - } else { - m_range_scheduler.put_keys(static_cast< uint32_t >(key), static_cast< uint32_t >(key + nkeys - 1)); - } - } - - void range_put(uint32_t start_key, uint32_t end_key, V value, bool update) { - auto val = std::make_unique< V >(value); - auto preq = BtreeRangePutRequest< K >{ - BtreeKeyRange< K >{start_key, true, end_key, true}, - update ? 
btree_put_type::REPLACE_ONLY_IF_EXISTS : btree_put_type::INSERT_ONLY_IF_NOT_EXISTS, val.get()}; - preq.enable_route_tracing(); - bool done = (m_bt->put(preq) == btree_status_t::success); - auto expected_done = m_shadow_map.range_put(start_key, end_key - start_key + 1, value.to_string(), update); - ASSERT_EQ(done, expected_done); - } - - void runInParallel(const std::vector< std::pair< std::string, int > >& op_list) { - auto test_count = m_fibers.size(); - for (auto it = m_fibers.begin(); it < m_fibers.end(); ++it) { - iomanager.run_on_forget(*it, [this, &test_count, op_list]() { - std::random_device g_rd{}; - std::default_random_engine re{g_rd()}; - const auto num_iters_per_thread = - sisl::round_up(SISL_OPTIONS["num_iters"].as< uint32_t >() / m_fibers.size(), m_fibers.size()); - std::vector< uint32_t > weights; - std::transform(op_list.begin(), op_list.end(), std::back_inserter(weights), - [](const auto& pair) { return pair.second; }); - - // Construct a weighted distribution based on the input frequencies - std::discrete_distribution< uint32_t > s_rand_op_generator(weights.begin(), weights.end()); - - for (uint32_t i = 0; i < num_iters_per_thread; i++) { - uint32_t op_idx = s_rand_op_generator(re); - (this->*m_operations[op_list[op_idx].first])(); - } - { - std::unique_lock lg(m_test_done_mtx); - if (--test_count == 0) { m_test_done_cv.notify_one(); } - } - }); - } - - { - std::unique_lock< std::mutex > lk(m_test_done_mtx); - m_test_done_cv.wait(lk, [&]() { return test_count == 0; }); - } - LOGINFO("ALL parallel jobs joined"); - } - void preload(uint32_t preload_size) { - const auto chunk_size = preload_size / m_fibers.size(); - const auto last_chunk_size = preload_size % chunk_size ?: chunk_size; - auto test_count = m_fibers.size(); - - for (std::size_t i = 0; i < m_fibers.size(); ++i) { - const auto start_range = i * chunk_size; - const auto end_range = start_range + ((i == m_fibers.size() - 1) ? 
last_chunk_size : chunk_size); - iomanager.run_on_forget(m_fibers[i], [this, start_range, end_range, &test_count]() { - for (uint32_t i = start_range; i < end_range; i++) { - auto value = V::generate_rand(); - put(i, btree_put_type::INSERT_ONLY_IF_NOT_EXISTS, value); - m_range_scheduler.put_key(i); - } - { - std::unique_lock lg(m_test_done_mtx); - if (--test_count == 0) { m_test_done_cv.notify_one(); } - } - }); - } - - { - std::unique_lock< std::mutex > lk(m_test_done_mtx); - m_test_done_cv.wait(lk, [&]() { return test_count == 0; }); - } - LOGINFO("Preload Done"); - } - -private: - std::unique_ptr< typename T::BtreeType > m_bt; - struct ShadowMap { - public: - bool put(uint32_t key, std::string value, bool update = false) { - std::unique_lock< mutex > lk(map_lock); - auto it = data.find(key); - if ((it == data.end() && update) || (it != data.end() && !update)) { return false; } - data[key] = value; - return true; - } - - bool range_put(uint32_t key, uint32_t nkeys, std::string value, bool update = false) { - std::unique_lock< mutex > lk(map_lock); - if (update) { - if (!all_existed(key, nkeys)) { return false; } - } else { - if (!none_of_them_existed(key, nkeys)) { return false; } - } - for (auto cur = key; cur < key + nkeys; cur++) { - data[cur] = value; - } - return true; - } - - bool remove(uint32_t key) { - std::unique_lock< mutex > lk(map_lock); - if (none_of_them_existed(key, 1)) { return false; } - auto it = data.find(key); - if (it == data.end()) { return false; } - data.erase(it); - return true; - } - - bool remove(uint32_t key, std::string value) { - std::unique_lock< mutex > lk(map_lock); - if (none_of_them_existed(key, 1)) { return false; } - auto it = data.find(key); - if (it == data.end()) { return false; } - if (it->second != value) { return false; } - data.erase(it); - return true; - } - - bool range_remove(uint32_t key, uint32_t nkeys) { - std::unique_lock< mutex > lk(map_lock); - if (none_of_them_existed(key, nkeys)) { return false; } - auto first_it = data.find(key); - auto last_it = data.upper_bound(key + nkeys - 1); - data.erase(first_it, last_it); - return true; - } - - bool range_get(uint32_t key, uint32_t nkeys, std::unordered_map< uint32_t, std::string > val_map) { - std::unique_lock< mutex > lk(map_lock); - if (none_of_them_existed(key, nkeys) && val_map.size()) { return false; } - if (none_of_them_existed(key, nkeys) && val_map.size() == 0) { return true; } - uint32_t count = 0; - for (auto cur = key; cur < key + nkeys; cur++) { - if (data.find(cur) != data.end()) { - if (val_map[cur] != data[cur]) { return false; } - if (val_map[cur] == data[cur]) { count++; } - } - } - if (count != val_map.size()) { return false; } - return true; - } - bool range_remove(uint32_t key, uint32_t nkeys, std::unordered_map< uint32_t, std::string > val_map) { - std::unique_lock< mutex > lk(map_lock); - if (none_of_them_existed(key, nkeys)) { return false; } - for (auto cur = key; cur < key + nkeys; cur++) - if (data.find(cur) != data.end() && val_map[cur] != data[cur]) { return false; } - auto first_it = data.find(key); - auto last_it = data.upper_bound(key + nkeys - 1); - data.erase(first_it, last_it); - return true; - } - std::string to_string(uint32_t key, uint32_t nkeys) { - std::unique_lock< mutex > lk(map_lock); - std::string x = ""; - for (auto cur = key; cur < key + nkeys; cur++) { - if (data.find(cur) != data.end()) x += fmt::format("[{}]={}\n", cur, data[cur]); - } - return x; - } - - private: - bool none_of_them_existed(uint32_t key, uint32_t nkeys) { - for (auto cur = 
key; cur < key + nkeys; cur++) - if (data.find(cur) != data.end()) return false; - return true; - } - - bool all_existed(uint32_t key, uint32_t nkeys) { - for (auto cur = key; cur < key + nkeys; cur++) - if (data.find(cur) == data.end()) return false; - return true; - } - - std::map< uint32_t, std::string > data; - mutex map_lock; - }; - ShadowMap m_shadow_map; - RangeScheduler m_range_scheduler; - uint32_t m_max_range_input{1000}; - BtreeConfig m_cfg{g_node_size}; - std::map< std::string, op_func > m_operations = {{"put", &BtreeConcurrentTest::random_put}, - {"remove", &BtreeConcurrentTest::random_remove}, - {"range_update", &BtreeConcurrentTest::random_range_update}, - {"range_remove", &BtreeConcurrentTest::random_range_remove}, - {"query", &BtreeConcurrentTest::random_get}}; - std::vector< iomgr::io_fiber_t > m_fibers; - std::mutex m_test_done_mtx; - std::condition_variable m_test_done_cv; }; + TYPED_TEST_SUITE(BtreeConcurrentTest, BtreeTypes); -TYPED_TEST(BtreeConcurrentTest, AllTree) { +TYPED_TEST(BtreeConcurrentTest, ConcurrentAllOps) { - std::vector< std::string > input_ops = {"put:20", "remove:20", "range_update:20", "range_remove:20", "query:20"}; + std::vector< std::string > input_ops = {"put:20", "remove:20", "range_put:20", "range_remove:20", "query:20"}; std::vector< std::pair< std::string, int > > ops; if (SISL_OPTIONS.count("operation_list")) { @@ -900,7 +343,7 @@ TYPED_TEST(BtreeConcurrentTest, AllTree) { return std::make_pair(std::string(), 0); }); - this->execute(ops); + this->multi_op_execute(ops); } int main(int argc, char* argv[]) {
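// ConcurrentAllOps feeds "operation:frequency" pairs into multi_op_execute().
// A minimal sketch of the weighted dispatch that helper presumably performs,
// modeled on the runInParallel() body this patch removes above; the
// m_operations table (operation name -> member-function pointer) is taken from
// that removed code, and the real helper fans the loop out across iomgr fibers
// rather than running single-threaded as shown here.
void multi_op_execute_sketch(const std::vector< std::pair< std::string, int > >& op_list) {
    // Each declared frequency becomes a weight, so an entry like "put:20" is
    // picked with probability 20 / (sum of all weights).
    std::vector< uint32_t > weights;
    std::transform(op_list.begin(), op_list.end(), std::back_inserter(weights),
                   [](const auto& pair) { return pair.second; });

    std::random_device rd{};
    std::default_random_engine re{rd()};
    std::discrete_distribution< uint32_t > s_rand_op_generator(weights.begin(), weights.end());

    const auto num_iters = SISL_OPTIONS["num_iters"].as< uint32_t >();
    for (uint32_t i{0}; i < num_iters; ++i) {
        uint32_t const op_idx = s_rand_op_generator(re);
        (this->*m_operations[op_list[op_idx].first])(); // e.g. random_put, random_range_remove
    }
}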