From 104e54e374c00c020ee70664f65f0c01b2c46a63 Mon Sep 17 00:00:00 2001 From: Harihara Kadayam Date: Fri, 2 Aug 2024 11:16:28 -0700 Subject: [PATCH] Index crash recovery test cases and fixes (#458) * Fixed the case where if cp is not flushed and homestore crashed, restart would not init certain key parameters. This is fixed by taking cp everytime we do formatting and then we persist this information to homestore first_blk. * Fixed the crash simulator issue where restart doesn't initialize existing HSTestHelper instances * Several fixes in IndexTable to make sure we can restart and reapply the logs * Add visualize, change diff and some print * Fixed non-release build duplication of down_buf_count * Removed key specific print into btree and create custom overrider for that --------- Co-authored-by: shosseinimotlagh --- .DS_Store | Bin 0 -> 8196 bytes conanfile.py | 2 +- src/include/homestore/btree/btree.hpp | 15 +- src/include/homestore/btree/btree.ipp | 48 ++- .../homestore/btree/detail/btree_common.ipp | 39 ++- .../homestore/btree/detail/btree_internal.hpp | 3 + .../btree/detail/btree_mutate_impl.ipp | 14 + .../homestore/btree/detail/btree_node.hpp | 62 +++- .../btree/detail/btree_remove_impl.ipp | 2 +- .../homestore/btree/detail/prefix_node.hpp | 2 +- .../homestore/btree/detail/simple_node.hpp | 125 +++++--- .../homestore/btree/detail/varlen_node.hpp | 60 +--- .../homestore/index/index_internal.hpp | 3 +- src/include/homestore/index/index_table.hpp | 105 +++++-- src/include/homestore/index/wb_cache_base.hpp | 1 + src/include/homestore/index_service.hpp | 5 +- src/lib/blkalloc/append_blk_allocator.cpp | 6 +- src/lib/blkalloc/append_blk_allocator.h | 2 +- src/lib/blkalloc/bitmap_blk_allocator.h | 1 + src/lib/blkalloc/blk_allocator.h | 1 + src/lib/blkalloc/fixed_blk_allocator.cpp | 62 ++-- src/lib/blkalloc/fixed_blk_allocator.h | 5 +- src/lib/common/crash_simulator.hpp | 9 +- src/lib/device/device.h | 1 + src/lib/device/device_manager.cpp | 22 ++ src/lib/device/hs_super_blk.h | 12 +- src/lib/device/virtual_dev.cpp | 7 + src/lib/device/virtual_dev.hpp | 2 + src/lib/homestore.cpp | 10 +- src/lib/index/index_cp.cpp | 122 +++++++- src/lib/index/index_cp.hpp | 48 ++- src/lib/index/index_service.cpp | 69 +++-- src/lib/index/wb_cache.cpp | 244 ++++++++++----- src/lib/index/wb_cache.hpp | 5 +- src/tests/CMakeLists.txt | 11 +- src/tests/btree_helpers/btree_test_helper.hpp | 27 +- src/tests/btree_helpers/shadow_map.hpp | 11 +- .../test_common/homestore_test_common.hpp | 5 +- src/tests/test_device_manager.cpp | 7 +- src/tests/test_index_btree.cpp | 12 +- src/tests/test_index_crash_recovery.cpp | 289 ++++++++++++++++++ src/tests/test_pdev.cpp | 7 +- 42 files changed, 1133 insertions(+), 350 deletions(-) create mode 100644 .DS_Store create mode 100644 src/tests/test_index_crash_recovery.cpp diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..04e3372dddc6554a4509762009625dc91bada321 GIT binary patch literal 8196 zcmeHM-D(p-6h4z|nhmW|rGxdQVO}( zx9|Z(yz)WRXAv*-o1d2KG{p-+3eJ?7Gdpv>vuD2f*_$_f@YgEd!Q; z|B?Z5K6uDNRYj^;KUT1n0_6c8X5f#X%0R^zsHZ(t;ujvnyzP9E-!y$xx7=D zD!N5?+I>~u{EH8TZR@aUfy`!}<>UZy~9v_$7`J1=z?=<$qUc}$Y z;StlP4_#8vr{#Op#s+^-kE4)BU9<|s3~kUukoA}zP>r_f3FH!JHArIDp_`?%CCT5X zPL@t4{2AB0f@>EsP0+-&olXMkZM2Bzu3_PaVPWCV7GN!gwH_8lYI>9tV;%SCGRFK! zd$f%ukpPom)G58CH0w+5tAU+?FC14$;u>jJE~gHVeLBqI@*4)u!Dux9i%ykX$xfD`l8p$0I!(n^eaOWMF@hLtz4Wvq<+C&P|I2e1**LDo;X z`9F+pe!%2%x8Hxsge@!s7n^~+a&JMz|2Ue?yZ^t~gRtIN1}p=A#Q<|Q>zh@WIQeyJ zS`)Fhhx{5@B%wD_QbJJ3a2!&G ANB{r; literal 0 HcmV?d00001 diff --git a/conanfile.py b/conanfile.py index 774a1ac0d..fd824630f 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.4.33" + version = "6.4.34" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index f7a82b14a..2ef1e1d44 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -34,6 +34,12 @@ namespace homestore { using BtreeNodePtr = boost::intrusive_ptr< BtreeNode >; using BtreeNodeList = folly::small_vector< BtreeNodePtr, 3 >; +struct BtreeVisualizeVariables { + uint64_t parent; + uint64_t midPoint; + uint64_t index; +}; + struct BtreeThreadVariables { std::vector< btree_locked_node_info > wr_locked_nodes; std::vector< btree_locked_node_info > rd_locked_nodes; @@ -114,8 +120,9 @@ class Btree { virtual std::pair< btree_status_t, uint64_t > destroy_btree(void* context); nlohmann::json get_status(int log_level) const; - void print_tree(const std::string& file = "") const; - void print_tree_keys() const; + void dump_tree_to_file(const std::string& file = "") const; + std::string to_custom_string(to_string_cb_t< K, V > const& cb) const; + std::string visualize_tree_keys(const std::string& file) const; uint64_t count_keys(bnodeid_t bnodeid) const; nlohmann::json get_metrics_in_json(bool updated = true); @@ -194,7 +201,9 @@ class Btree { uint64_t get_btree_node_cnt() const; uint64_t get_child_node_cnt(bnodeid_t bnodeid) const; void to_string(bnodeid_t bnodeid, std::string& buf) const; - void to_string_keys(bnodeid_t bnodeid, std::string& buf) const; + void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb) const; + void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map, + std::map< uint64_t, BtreeVisualizeVariables >& info_map) const; void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const; void print_node(const bnodeid_t& bnodeid) const; diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp index 73279d138..35c42ab97 100644 --- a/src/include/homestore/btree/btree.ipp +++ b/src/include/homestore/btree/btree.ipp @@ -308,7 +308,7 @@ nlohmann::json Btree< K, V >::get_status(int log_level) const { } template < typename K, typename V > -void Btree< K, V >::print_tree(const std::string& file) const { +void Btree< K, V >::dump_tree_to_file(const std::string& file) const { std::string buf; m_btree_lock.lock_shared(); to_string(m_root_node_info.bnode_id(), buf); @@ -323,13 +323,53 @@ void Btree< K, V >::print_tree(const std::string& file) const { } template < typename K, typename V > -void Btree< K, V >::print_tree_keys() const { +std::string Btree< K, V >::to_custom_string(to_string_cb_t< K, V > const& cb) const { std::string buf; m_btree_lock.lock_shared(); - to_string_keys(m_root_node_info.bnode_id(), buf); + to_custom_string_internal(m_root_node_info.bnode_id(), buf, cb); m_btree_lock.unlock_shared(); - LOGINFO("Pre order traversal of tree:\n<{}>", buf); + return buf; +} + +template < typename K, typename V > +std::string Btree< K, V >::visualize_tree_keys(const std::string& file) const { + std::map< uint32_t, std::vector< uint64_t > > level_map; + std::map< uint64_t, BtreeVisualizeVariables > info_map; + std::string buf = "digraph G\n" + "{ \n" + "ranksep = 3.0;\n" + R"(graph [splines="polyline"]; +)"; + + m_btree_lock.lock_shared(); + to_dot_keys(m_root_node_info.bnode_id(), buf, level_map, info_map); + m_btree_lock.unlock_shared(); + for (const auto& [child, info] : info_map) { + if (info.parent) { + buf += fmt::format(R"( + "{}":connector{} -> "{}":"key{}" [splines=false];)", + info.parent, info.index, child, info.midPoint); + } + } + + std::string result; + for (const auto& [key, values] : level_map) { + result += "{rank=same; "; + std::vector< std::string > quotedValues; + std::transform(values.begin(), values.end(), std::back_inserter(quotedValues), + [](uint64_t value) { return fmt::format("\"{}\"", value); }); + + result += fmt::to_string(fmt::join(quotedValues, " ")) + "}\n"; + } + + buf += "\n" + result + " }\n"; + if (!file.empty()) { + std::ofstream o(file); + o.write(buf.c_str(), buf.size()); + o.flush(); + } + return buf; } template < typename K, typename V > diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp index edd895d89..b21305497 100644 --- a/src/include/homestore/btree/detail/btree_common.ipp +++ b/src/include/homestore/btree/detail/btree_common.ipp @@ -147,23 +147,54 @@ void Btree< K, V >::to_string(bnodeid_t bnodeid, std::string& buf) const { } template < typename K, typename V > -void Btree< K, V >::to_string_keys(bnodeid_t bnodeid, std::string& buf) const { +void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, + to_string_cb_t< K, V > const& cb) const { BtreeNodePtr node; locktype_t acq_lock = locktype_t::READ; if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; } - fmt::format_to(std::back_inserter(buf), "{}\n", node->to_string_keys()); + fmt::format_to(std::back_inserter(buf), "{}\n", node->to_custom_string(cb)); if (!node->is_leaf()) { uint32_t i = 0; while (i < node->total_entries()) { BtreeLinkInfo p; node->get_nth_value(i, &p, false); - to_string_keys(p.bnode_id(), buf); + to_custom_string_internal(p.bnode_id(), buf, cb); ++i; } - if (node->has_valid_edge()) { to_string_keys(node->edge_id(), buf); } + if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb); } + } + unlock_node(node, acq_lock); +} + +template < typename K, typename V > +void Btree< K, V >::to_dot_keys(bnodeid_t bnodeid, std::string& buf, + std::map< uint32_t, std::vector< uint64_t > >& l_map, + std::map< uint64_t, BtreeVisualizeVariables >& info_map) const { + BtreeNodePtr node; + locktype_t acq_lock = locktype_t::READ; + + if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; } + fmt::format_to(std::back_inserter(buf), "{}\n", node->to_dot_keys()); + l_map[node->level()].push_back(node->node_id()); + info_map[node->node_id()].midPoint = node->is_leaf() ? 0 : node->total_entries() / 2; + if (!node->is_leaf()) { + uint32_t i = 0; + while (i < node->total_entries()) { + BtreeLinkInfo p; + node->get_nth_value(i, &p, false); + to_dot_keys(p.bnode_id(), buf, l_map, info_map); + info_map[p.bnode_id()].parent = node->node_id(); + info_map[p.bnode_id()].index = i; + ++i; + } + if (node->has_valid_edge()) { + to_dot_keys(node->edge_id(), buf, l_map, info_map); + info_map[node->edge_id()].parent = node->node_id(); + info_map[node->edge_id()].index = node->total_entries(); + } } unlock_node(node, acq_lock); } diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp index 352b06e04..a77bfe5ac 100644 --- a/src/include/homestore/btree/detail/btree_internal.hpp +++ b/src/include/homestore/btree/detail/btree_internal.hpp @@ -215,6 +215,9 @@ class BtreeNode; void intrusive_ptr_add_ref(BtreeNode* node); void intrusive_ptr_release(BtreeNode* node); +template < typename K, typename V > +using to_string_cb_t = std::function< std::string(std::vector< std::pair< K, V > > const&) >; + ENUM(btree_event_t, uint8_t, READ, MUTATE, REMOVE, SPLIT, REPAIR, MERGE); struct trace_route_entry { diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 209b35558..0c5313eb2 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -299,6 +299,20 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const return ret; } +// template < typename K, typename V > +// template < typename ReqT > +// bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const { +// if (!node->is_leaf()) { // if internal node, size is atmost one additional entry, size of K/V +// return node->total_entries() >= 3; +// } else if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) { +// return node->total_entries() >= 3; +// } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) { +// return node->total_entries() >= 3;; +// } else { +// return false; +// } +// } + template < typename K, typename V > template < typename ReqT > bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const { diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 73e8be5a8..84f70aa05 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -69,17 +69,24 @@ struct persistent_hdr_t { persistent_hdr_t() : nentries{0}, leaf{0}, node_deleted{0} {} std::string to_string() const { - return fmt::format("magic={} version={} csum={} node_id={} next_node={} nentries={} node_type={} is_leaf={} " - "node_deleted={} node_gen={} modified_cp_id={} link_version={} edge_nodeid={}, " - "edge_link_version={} level={} ", - magic, version, checksum, node_id, next_node, nentries, node_type, leaf, node_deleted, - node_gen, modified_cp_id, link_version, edge_info.m_bnodeid, edge_info.m_link_version, - level); + auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node); + auto sedge = (edge_info.m_bnodeid == empty_bnodeid) + ? "" + : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); + return fmt::format("magic={} version={} csum={} node_id={}{} nentries={} node_type={} is_leaf={} " + "node_deleted={} node_gen={} modified_cp_id={} link_version={}{} level={} ", + magic, version, checksum, node_id, snext, nentries, node_type, leaf, node_deleted, node_gen, + modified_cp_id, link_version, sedge, level); } std::string to_compact_string() const { - return fmt::format("{} id={} next={} nentries={} {} level={}", (void*)this, node_id, next_node, nentries, - (node_deleted == 0x1) ? "Deleted" : "", level); + auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node); + auto sedge = (edge_info.m_bnodeid == empty_bnodeid) + ? "" + : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version); + return fmt::format("id={}{}{} {} level={} nentries={}{} mod_cp={}", node_id, snext, sedge, + leaf ? "LEAF" : "INTERIOR", level, nentries, (node_deleted == 0x1) ? " Deleted" : "", + modified_cp_id); } }; #pragma pack() @@ -111,10 +118,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { // Identify if a node is a leaf node or not, from raw buffer, by just reading persistent_hdr_t static bool identify_leaf_node(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->leaf; } + static std::string to_string_buf(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->to_compact_string(); } static BtreeLinkInfo::bnode_link_info identify_edge_info(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->edge_info; } - static std::string to_string_buf(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->to_string(); } static bool is_valid_node(sisl::blob const& buf) { auto phdr = r_cast< persistent_hdr_t const* >(buf.cbytes()); @@ -347,6 +354,41 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { void lock_acknowledge() { m_trans_hdr.upgraders.decrement(1); } bool any_upgrade_waiters() const { return (!m_trans_hdr.upgraders.testz()); } + template < typename K, typename V > + std::string to_custom_string(to_string_cb_t< K, V > const& cb) const { + std::string snext = + (this->next_bnode() == empty_bnodeid) ? "" : fmt::format(" next_node={}", this->next_bnode()); + auto str = fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} ", this->node_id(), this->link_version(), + this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, + this->node_gen()); + if (this->has_valid_edge()) { + fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, + this->edge_info().m_link_version); + } + + if (this->total_entries() == 0) { + fmt::format_to(std::back_inserter(str), " [EMPTY] "); + return str; + } else if (this->is_leaf()) { + std::vector< std::pair< K, V > > entries; + for (uint32_t i{0}; i < this->total_entries(); ++i) { + V v; + get_nth_value(i, &v, false); + entries.emplace_back(std::make_pair(get_nth_key< K >(i, false), v)); + } + fmt::format_to(std::back_inserter(str), " Keys=[{}]", cb(entries)); + return str; + } else { + fmt::format_to(std::back_inserter(str), " Keys=["); + for (uint32_t i{0}; i < this->total_entries(); ++i) { + fmt::format_to(std::back_inserter(str), "{}{}", get_nth_key< K >(i, false).to_string(), + (i == this->total_entries() - 1) ? "" : ", "); + } + fmt::format_to(std::back_inserter(str), "]"); + } + return str; + } + public: // Public method which needs to be implemented by variants virtual btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0; @@ -384,7 +426,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { } virtual std::string to_string(bool print_friendly = false) const = 0; - virtual std::string to_string_keys(bool print_friendly = false) const = 0; + virtual std::string to_dot_keys() const = 0; protected: node_find_result_t bsearch_node(const BtreeKey& key) const { diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp index 6b20473d3..82213dcc6 100644 --- a/src/include/homestore/btree/detail/btree_remove_impl.ipp +++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp @@ -119,7 +119,7 @@ retry: goto retry; } else if (ret == btree_status_t::merge_not_required) { BT_NODE_LOG(DEBUG, my_node, "merge is not required for child = {} keys: {}", curr_idx, - child_node->to_string_keys()); + child_node->to_string()); } } } diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 486dec6f1..072b4f654 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -644,7 +644,7 @@ class FixedPrefixNode : public VariantNode< K, V > { return str; } - std::string to_string_keys(bool print_friendly = false) const override { return "NOT Supported"; } + std::string to_dot_keys() const override { return "NOT Supported"; } private: uint16_t add_prefix(BtreeKey const& key, BtreeValue const& val) { diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index d909c202f..85318c9b9 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -201,18 +201,22 @@ class SimpleNode : public VariantNode< K, V > { } bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override { +#ifdef _PRERELEASE + // return (this->total_entries() <= 3); +#endif return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT)) ? (get_available_entries() > 0) : true; } std::string to_string(bool print_friendly = false) const override { - auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ", + auto snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode()); + auto str = fmt::format("{}id={} level={} nEntries={} {} {} ", (print_friendly ? "------------------------------------------------------------\n" : ""), this->node_id(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode()); - if (!this->is_leaf() && (this->has_valid_edge())) { - fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, + (this->is_leaf() ? "LEAF" : "INTERIOR"), snext); + if (this->has_valid_edge()) { + fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid, this->edge_info().m_link_version); } @@ -224,55 +228,75 @@ class SimpleNode : public VariantNode< K, V > { return str; } - std::string to_string_keys(bool print_friendly = false) const override { - // FIXME: Implement this, key may not be a unit32_t - return ""; -#if 0 - std::string delimiter = print_friendly ? "\n" : "\t"; - std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode()); - auto str = fmt::format("{}{}.{} level:{} nEntries={} {} {} node_gen={} ", - print_friendly ? "------------------------------------------------------------\n" : "", - this->node_id(), this->link_version(), this->level(), this->total_entries(), - (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, this->node_gen()); - if (!this->is_leaf() && (this->has_valid_edge())) { - fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, - this->edge_info().m_link_version); - } - if (this->total_entries() == 0) { - fmt::format_to(std::back_inserter(str), " [EMPTY] "); - return str; - } - if (!this->is_leaf()) { - fmt::format_to(std::back_inserter(str), " ["); - for (uint32_t i{0}; i < this->total_entries(); ++i) { - uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key(); - BtreeLinkInfo child_info; - get_nth_value(i, &child_info, false /* copy */); - fmt::format_to(std::back_inserter(str), "{}.{} {}", cur_key, child_info.link_version(), - i == this->total_entries() - 1 ? "" : ", "); - } - fmt::format_to(std::back_inserter(str), "]"); - return str; + std::string to_dot_keys() const override { + std::string str; + std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode()); + str += fmt::format(R"("{}" [ + shape = none, + labelloc="c", + fontsize=25, + label = < + )", + this->node_id()); + if (this->total_entries() == 0) { + return str + fmt::format(R"( + +
E
>])"); + } + + if (!this->is_leaf()) { + // str += " "; + for (uint32_t i{0}; i < this->total_entries(); ++i) { + uint32_t cur_key = get_nth_key< K >(i, false).key(); + BtreeLinkInfo child_info; + get_nth_value(i, &child_info, false /* copy */); + str += fmt::format(R"( + {}.{})", + i, i, cur_key, child_info.link_version()); } - uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key(); + std::string sedge = this->has_valid_edge() ? "edge:" + std::to_string(this->edge_info().m_bnodeid) + "." + + std::to_string(this->edge_info().m_link_version) + : ""; + str += fmt::format(R"( + + {}.{}
gen={}
{} {} >];)", + this->total_entries(), this->node_id(), this->link_version(), this->node_gen(), snext, + sedge); + + } else { + std::string keys_buf = ""; + uint32_t prev_key = get_nth_key< K >(0, false).key(); uint32_t cur_key = prev_key; - uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key(); + uint32_t last_key = get_nth_key< K >(this->total_entries() - 1, false).key(); if (last_key - prev_key == this->total_entries() - 1) { - if (this->total_entries() == 1) - fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key); - else - fmt::format_to(std::back_inserter(str), "{}[{}-{}]", delimiter, prev_key, last_key); - return str; + if (this->total_entries() == 1) { + keys_buf += fmt::format(R"( + {})", + 0, 0, cur_key); + } else { + keys_buf += fmt::format(R"( + {}-{})", + 0, 0, prev_key, last_key); + } + keys_buf += fmt::format(R"( + + {}.{}
gen={}
{} + >];)", + 1, this->node_id(), this->link_version(), this->node_gen(), snext); + return str + keys_buf; } - fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key); + + keys_buf += fmt::format(R"( + "{})", + 0, 0, prev_key); uint32_t start_interval_key = prev_key; for (uint32_t i{1}; i < this->total_entries(); ++i) { - cur_key = BtreeNode::get_nth_key< K >(i, false).key(); + cur_key = get_nth_key< K >(i, false).key(); if (cur_key != prev_key + 1) { if (start_interval_key == prev_key) { - fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key); + keys_buf += fmt::format(" {}", cur_key); } else { - fmt::format_to(std::back_inserter(str), "]{}{}- [{}", delimiter, i, cur_key); + keys_buf += fmt::format("-{} {}", prev_key, cur_key); } start_interval_key = cur_key; } @@ -280,12 +304,17 @@ class SimpleNode : public VariantNode< K, V > { } if (start_interval_key == prev_key) { - fmt::format_to(std::back_inserter(str), "]"); + keys_buf += fmt::format(""); } else { - fmt::format_to(std::back_inserter(str), "-{}]", cur_key); + keys_buf += fmt::format(" {}", cur_key); } - return str; -#endif + keys_buf += fmt::format(R"( + + {}.{}
gen={}
{}>];)", + 1, this->node_id(), this->link_version(), this->node_gen(), snext); + return str + keys_buf; + } + return str; } #ifndef NDEBUG diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp index e98b63a52..fcb7ff79c 100644 --- a/src/include/homestore/btree/detail/varlen_node.hpp +++ b/src/include/homestore/btree/detail/varlen_node.hpp @@ -508,65 +508,7 @@ class VariableNode : public VariantNode< K, V > { return str; } - std::string to_string_keys(bool print_friendly = false) const override { -#if 0 - std::string delimiter = print_friendly ? "\n" : "\t"; - auto str = fmt::format("{}{}.{} nEntries={} {} ", - print_friendly ? "------------------------------------------------------------\n" : "", - this->node_id(), this->link_version(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR")); - if (!this->is_leaf() && (this->has_valid_edge())) { - fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid, - this->edge_info().m_link_version); - } - if (this->total_entries() == 0) { - fmt::format_to(std::back_inserter(str), " [EMPTY] "); - return str; - } - if (!this->is_leaf()) { - fmt::format_to(std::back_inserter(str), " ["); - for (uint32_t i{0}; i < this->total_entries(); ++i) { - uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key(); - BtreeLinkInfo child_info; - get_nth_value(i, &child_info, false /* copy */); - fmt::format_to(std::back_inserter(str), "{}.{} {}", cur_key, child_info.link_version(), i == this->total_entries() - 1 ? "" : ", "); - } - fmt::format_to(std::back_inserter(str), "]"); - return str; - } - uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key(); - uint32_t cur_key = prev_key; - uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key(); - if (last_key - prev_key == this->total_entries() - 1) { - if (this->total_entries() == 1) - fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key); - else - fmt::format_to(std::back_inserter(str), "{}[{}-{}]", delimiter, prev_key, last_key); - return str; - } - fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key); - uint32_t start_interval_key = prev_key; - for (uint32_t i{1}; i < this->total_entries(); ++i) { - cur_key = BtreeNode::get_nth_key< K >(i, false).key(); - if (cur_key != prev_key + 1) { - if (start_interval_key == prev_key) { - fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key); - } else { - fmt::format_to(std::back_inserter(str), "]{}{}- [{}", delimiter, i, cur_key); - } - start_interval_key = cur_key; - } - prev_key = cur_key; - } - - if (start_interval_key == prev_key) { - fmt::format_to(std::back_inserter(str), "]"); - } else { - fmt::format_to(std::back_inserter(str), "-{}]", cur_key); - } - return str; -#endif - return {}; - } + std::string to_dot_keys() const override { return "NOT Supported"; } /*int compare_nth_key_range(const BtreeKeyRange& range, uint32_t ind) const { return get_nth_key(ind, false).compare_range(range); diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index d813fa4e2..85c2a304d 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -68,11 +68,11 @@ class IndexTableBase { public: virtual ~IndexTableBase() = default; virtual uuid_t uuid() const = 0; + virtual void recovery_completed() = 0; virtual uint32_t ordinal() const = 0; virtual uint64_t used_size() const = 0; virtual void destroy() = 0; virtual void repair_node(IndexBufferPtr const& buf) = 0; - virtual void repair_root(IndexBufferPtr const& buf) = 0; }; enum class index_buf_state_t : uint8_t { @@ -122,6 +122,7 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > { bool is_meta_buf() const { return m_is_meta_buf; } std::string to_string() const; + std::string to_string_dot() const; }; // This is a special buffer which is used to write to the meta block diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 3461fb357..2bec275e3 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -46,6 +46,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { m_sb->parent_uuid = parent_uuid; m_sb->user_sb_size = user_sb_size; m_sb.write(); + m_sb_buffer = std::make_shared< MetaIndexBuffer >(m_sb); // Create a root node which is a leaf node. @@ -56,7 +57,24 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { IndexTable(superblk< index_table_sb >&& sb, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{std::move(sb)} { m_sb_buffer = std::make_shared< MetaIndexBuffer >(m_sb); - this->set_root_node_info(BtreeLinkInfo{m_sb->root_node, m_sb->root_link_version}); + + // After recovery, we see that root node is empty, which means that after btree is created, we crashed. + // So create new root node, which is essential for btree to function. + if (m_sb->root_node != empty_bnodeid) { + this->set_root_node_info(BtreeLinkInfo{m_sb->root_node, m_sb->root_link_version}); + } + } + + void recovery_completed() override { + if (m_sb->root_node == empty_bnodeid) { + // After recovery, we see that root node is empty, which means that after btree is created, we crashed. + // So create new root node, which is essential for btree to function. + auto cp = hs()->cp_mgr().cp_guard(); + auto const status = this->create_root_node((void*)cp.context(cp_consumer_t::INDEX_SVC)); + if (status != btree_status_t::success) { + throw std::runtime_error(fmt::format("Unable to create root node")); + } + } } void destroy() override { @@ -96,19 +114,30 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } void repair_node(IndexBufferPtr const& idx_buf) override { - BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), true, + if (idx_buf->is_meta_buf()) { + // We cannot repair the meta buf on its own, we need to repair the root node which modifies the + // meta_buf. It is ok to ignore this call, because repair will be done from root before meta_buf is + // attempted to repair, which would have updated the meta_buf already. + return; + } + BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */, BtreeNode::identify_leaf_node(idx_buf->raw_buffer())); static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf); auto cpg = cp_mgr().cp_guard(); - repair_links(BtreeNodePtr{n}, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); - } - void repair_root(IndexBufferPtr const& root_buf) override { - BtreeNode* n = this->init_node(root_buf->raw_buffer(), root_buf->blkid().to_integer(), true, - BtreeNode::identify_leaf_node(root_buf->raw_buffer())); - static_cast< IndexBtreeNode* >(n)->attach_buf(root_buf); - auto cpg = cp_mgr().cp_guard(); - on_root_changed(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + // Set the cp_id to current cp buf, since repair doesn't call get_writable_buf (unlike regular IO path), so + // we need to set it here, so that other code path assumes correct cp + idx_buf->m_dirtied_cp_id = cpg->id(); + BtreeNodePtr bn = BtreeNodePtr{n}; + + LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string()); + repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + + if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) { + // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the + // meta_buf with new root as well + on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC)); + } } protected: @@ -125,21 +154,22 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cp_ctx = r_cast< CPContext* >(context); auto idx_node = static_cast< IndexBtreeNode* >(node.get()); + node->set_checksum(); auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY); if (prev_state == index_buf_state_t::CLEAN) { // It was clean before, dirtying it first time, add it to the wb_cache list to flush - BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), - "Writing a node which was not acquired by this cp"); + if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) { + BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(), + "Writing a node which was not acquired by this cp"); + } + node->set_modified_cp_id(cp_ctx->id()); wb_cache().write_buf(node, idx_node->m_idx_buf, cp_ctx); - LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), idx_node->m_idx_buf->to_string()); } else { BT_DBG_ASSERT_NE( (int)prev_state, (int)index_buf_state_t::FLUSHING, "Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}", - cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id) + cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id); } - node->set_checksum(); - node->set_modified_cp_id(cp_ctx->id()); return btree_status_t::success; } @@ -147,24 +177,25 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node, void* context) override { CPContext* cp_ctx = r_cast< CPContext* >(context); - auto& left_child_buf = static_cast< IndexBtreeNode* >(left_child_node.get())->m_idx_buf; - auto& parent_buf = static_cast< IndexBtreeNode* >(parent_node.get())->m_idx_buf; IndexBufferPtrList new_node_bufs; for (const auto& right_child_node : new_nodes) { write_node_impl(right_child_node, context); - new_node_bufs.push_back(static_cast< IndexBtreeNode* >(right_child_node.get())->m_idx_buf); + new_node_bufs.push_back(s_cast< IndexBtreeNode* >(right_child_node.get())->m_idx_buf); } write_node_impl(left_child_node, context); - write_node_impl(parent_node, context); + // during recovery it is possible that there is no parent_node + if (parent_node.get() != nullptr) { write_node_impl(parent_node, context); } IndexBufferPtrList freed_node_bufs; for (const auto& freed_node : freed_nodes) { - freed_node_bufs.push_back(static_cast< IndexBtreeNode* >(freed_node.get())->m_idx_buf); + freed_node_bufs.push_back(s_cast< IndexBtreeNode* >(freed_node.get())->m_idx_buf); this->free_node(freed_node, locktype_t::WRITE, context); } - wb_cache().transact_bufs(ordinal(), parent_buf, left_child_buf, new_node_bufs, freed_node_bufs, cp_ctx); + wb_cache().transact_bufs( + ordinal(), parent_node.get() ? s_cast< IndexBtreeNode* >(parent_node.get())->m_idx_buf : nullptr, + s_cast< IndexBtreeNode* >(left_child_node.get())->m_idx_buf, new_node_bufs, freed_node_bufs, cp_ctx); return btree_status_t::success; } @@ -206,7 +237,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { - BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->node_id()); + BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string()); // Get the last key in the node auto const last_parent_key = parent_node->get_last_key< K >(); @@ -216,6 +247,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { parent_node->node_id()); return btree_status_t::not_found; } + BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(), + last_parent_key.to_string()); // Get the first child node and its link info BtreeLinkInfo child_info; @@ -239,15 +272,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto cur_parent = parent_node; BtreeNodeList new_parent_nodes; do { - if (child_node->has_valid_edge()) { - BT_DBG_ASSERT(!is_parent_edge_node, + if (child_node->has_valid_edge() || + (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) { + BT_DBG_ASSERT(is_parent_edge_node, "Child node={} is an edge node but parent_node={} is not an edge node", - child_node->node_id(), parent_node->node_id()); + child_node->node_id(), cur_parent->node_id()); cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); break; } auto const child_last_key = child_node->get_last_key< K >(); + BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), + child_node->to_string(), child_last_key.to_string()); + if (child_last_key.compare(last_parent_key) > 0) { // We have reached the last key, we can stop now break; @@ -275,13 +312,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { cur_parent->insert(cur_parent->total_entries(), child_last_key, BtreeLinkInfo{child_node->node_id(), child_node->link_version()}); + BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string()); + // Move to the next child node + this->unlock_node(child_node, locktype_t::READ); auto const next_node_id = child_node->next_bnode(); if (next_node_id == empty_bnodeid) { - BT_LOG_ASSERT( - false, - "Child node={} next_node_id is empty, while its not a edge node, parent_node={} repair is partial", - child_node->node_id(), parent_node->node_id()); + BT_LOG_ASSERT(false, + "Child node={} next_node_id is empty, while its not a edge node, parent_node={} " + "repair is partial", + child_node->node_id(), parent_node->node_id()); ret = btree_status_t::not_found; break; } @@ -293,18 +333,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { break; } } while (true); + this->unlock_node(child_node, locktype_t::READ); if (ret == btree_status_t::success) { ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx); } if (ret != btree_status_t::success) { - BT_LOG(DEBUG, "An error occurred status={} during repair of parent_node={}, aborting the repair", + BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair", enum_name(ret), parent_node->node_id()); std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size()); } - delete tmp_buffer; + delete[] tmp_buffer; return ret; } }; diff --git a/src/include/homestore/index/wb_cache_base.hpp b/src/include/homestore/index/wb_cache_base.hpp index d576c2b6b..4624f9444 100644 --- a/src/include/homestore/index/wb_cache_base.hpp +++ b/src/include/homestore/index/wb_cache_base.hpp @@ -64,6 +64,7 @@ class IndexWBCacheBase { /// @param cur_buf /// @return // virtual IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* context) const = 0; + virtual void recover(sisl::byte_view sb) = 0; }; } // namespace homestore diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index cfd5bd5f7..fb776eb6b 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -47,6 +47,7 @@ class IndexService { std::shared_ptr< VirtualDev > m_vdev; std::pair< meta_blk*, sisl::byte_view > m_wbcache_sb{ std::pair< meta_blk*, sisl::byte_view >{nullptr, sisl::byte_view{}}}; + std::vector< std::pair< meta_blk*, sisl::byte_view > > m_itable_sbs; std::unique_ptr< sisl::IDReserver > m_ordinal_reserver; mutable std::mutex m_index_map_mtx; @@ -80,12 +81,8 @@ class IndexService { uint64_t used_size() const; uint32_t node_size() const; void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf); - void repair_index_root(uint32_t ordinal, IndexBufferPtr const& root_buf); IndexWBCacheBase& wb_cache() { return *m_wb_cache; } - -private: - void itable_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); }; extern IndexService& index_service(); diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index c7d18e0b1..4a4c7fd18 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -68,12 +68,12 @@ BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { return alloc(1 // BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) { if (available_blks() < nblks) { - //COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); + // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks()); return BlkAllocStatus::SPACE_FULL; } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. - //COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); + // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); return BlkAllocStatus::FAILED; } @@ -81,7 +81,7 @@ BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hint // Push 1 blk to the vector which has all the requested nblks; out_bid = BlkId{m_last_append_offset.fetch_add(nblks), nblks, m_chunk_id}; - //COUNTER_INCREMENT(m_metrics, num_alloc, 1); + // COUNTER_INCREMENT(m_metrics, num_alloc, 1); return BlkAllocStatus::SUCCESS; } diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h index ed256d7f1..384a4936b 100644 --- a/src/lib/blkalloc/append_blk_allocator.h +++ b/src/lib/blkalloc/append_blk_allocator.h @@ -109,7 +109,7 @@ class AppendBlkAllocator : public BlkAllocator { std::string to_string() const override; void cp_flush(CP* cp) override; - + void recovery_completed() override {} nlohmann::json get_status(int log_level) const override; private: diff --git a/src/lib/blkalloc/bitmap_blk_allocator.h b/src/lib/blkalloc/bitmap_blk_allocator.h index 03ba713d8..381767bef 100644 --- a/src/lib/blkalloc/bitmap_blk_allocator.h +++ b/src/lib/blkalloc/bitmap_blk_allocator.h @@ -76,6 +76,7 @@ class BitmapBlkAllocator : public BlkAllocator { bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const override; void cp_flush(CP* cp) override; + void recovery_completed() override {} blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } blk_num_t get_blks_per_portion() const { return m_blks_per_portion; } diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index c95935c77..3ba0ecf82 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -160,6 +160,7 @@ class BlkAllocator { virtual blk_num_t get_used_blks() const = 0; virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0; virtual bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const = 0; + virtual void recovery_completed() = 0; virtual std::string to_string() const = 0; virtual void cp_flush(CP* cp) = 0; diff --git a/src/lib/blkalloc/fixed_blk_allocator.cpp b/src/lib/blkalloc/fixed_blk_allocator.cpp index 1e2af3257..9df3a07ca 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.cpp +++ b/src/lib/blkalloc/fixed_blk_allocator.cpp @@ -54,33 +54,23 @@ blk_num_t FixedBlkAllocator::init_portion(BlkAllocPortion& portion, blk_num_t st bool FixedBlkAllocator::is_blk_alloced(BlkId const& b, bool use_lock) const { return true; } BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_alloc_hints const&, BlkId& out_blkid) { - if (m_state == state_t::RECOVERING) { - // Possibly first few attempts to allocate; under lock, remove all the blks which are marked to be removed from - // the free list - std::lock_guard lg(m_mark_blk_mtx); - if (!m_marked_blks.empty()) { - auto const count = available_blks(); - for (uint64_t i{0}; ((i < count) && !m_marked_blks.empty()); ++i) { - blk_num_t blk_num; - if (!m_free_blk_q.read(blk_num)) { break; } - - if (m_marked_blks.find(blk_num) != m_marked_blks.end()) { - m_marked_blks.erase(blk_num); // This blk needs to be skipped - } else { - m_free_blk_q.write(blk_num); // This blk is not marked, put it back at the end of queue - } - } - HS_DBG_ASSERT(m_marked_blks.empty(), "All marked blks should have been removed from free list"); - } - m_state = state_t::ACTIVE; - } - #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("fixed_blkalloc_no_blks")) { return BlkAllocStatus::SPACE_FULL; } #endif +retry: blk_num_t blk_num; if (!m_free_blk_q.read(blk_num)) { return BlkAllocStatus::SPACE_FULL; } + if (m_state != state_t::ACTIVE) { + // We are not in active state, means we must be recovering. During recovery state, if any of the blks which are + // committed, we shouldn't be allocating those blocks. So remove from free blk q and retry another blk + std::lock_guard lg(m_reserve_blk_mtx); + if ((m_state == state_t::RECOVERING) && (m_reserved_blks.find(blk_num) != m_reserved_blks.end())) { + m_reserved_blks.erase(blk_num); // This blk can be removed from reserved, because its no longer free (since + // we also removed from free_blk q) + goto retry; + } + } out_blkid = BlkId{blk_num, 1, m_chunk_id}; return BlkAllocStatus::SUCCESS; } @@ -88,18 +78,42 @@ BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_ BlkAllocStatus FixedBlkAllocator::alloc_contiguous(BlkId& out_blkid) { return alloc(1, {}, out_blkid); } BlkAllocStatus FixedBlkAllocator::reserve_on_cache(BlkId const& b) { - std::lock_guard lg(m_mark_blk_mtx); - HS_DBG_ASSERT(m_state == state_t::RECOVERING, "reserve_on_cache called on non-recovery path"); - m_marked_blks.insert(b.blk_num()); + std::lock_guard lg(m_reserve_blk_mtx); + if (m_state == state_t::RECOVERING) { m_reserved_blks.insert(b.blk_num()); } return BlkAllocStatus::SUCCESS; } +void FixedBlkAllocator::recovery_completed() { + std::lock_guard lg(m_reserve_blk_mtx); + if (!m_reserved_blks.empty()) { + auto const count = available_blks(); + for (uint64_t i{0}; ((i < count) && !m_reserved_blks.empty()); ++i) { + blk_num_t blk_num; + if (!m_free_blk_q.read(blk_num)) { break; } + + if (m_reserved_blks.find(blk_num) == m_reserved_blks.end()) { + m_free_blk_q.write(blk_num); // This blk is not marked, put it back at the end of queue + } + } + // We no longer need any reserved blks, since all of them are now marked as allocated + m_reserved_blks.clear(); + } + m_state = state_t::ACTIVE; +} + void FixedBlkAllocator::free(BlkId const& b) { HS_DBG_ASSERT_EQ(b.blk_count(), 1, "Multiple blk free for FixedBlkAllocator? allocated by different allocator?"); const auto pushed = m_free_blk_q.write(b.blk_num()); HS_DBG_ASSERT_EQ(pushed, true, "Expected to be able to push the blk on fixed capacity Q"); + if (m_state != state_t::ACTIVE) { + std::lock_guard lg(m_reserve_blk_mtx); + if (m_state == state_t::RECOVERING) { + // If we are in recovering state and freeing blk would removed it from being reserved as well. + m_reserved_blks.erase(b.blk_num()); + } + } if (is_persistent()) { free_on_disk(b); } } diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h index adfe7b7b4..fa28681f2 100644 --- a/src/lib/blkalloc/fixed_blk_allocator.h +++ b/src/lib/blkalloc/fixed_blk_allocator.h @@ -36,6 +36,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator { BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; BlkAllocStatus reserve_on_cache(BlkId const& b) override; void free(BlkId const& b) override; + void recovery_completed() override; blk_num_t available_blks() const override; blk_num_t get_used_blks() const override; @@ -50,8 +51,8 @@ class FixedBlkAllocator : public BitmapBlkAllocator { enum class state_t : uint8_t { RECOVERING, ACTIVE }; state_t m_state{state_t::RECOVERING}; - std::unordered_set< blk_num_t > m_marked_blks; // Keep track of all blks which are marked as allocated - std::mutex m_mark_blk_mtx; // Mutex used while removing marked_blks from blk_q + std::unordered_set< blk_num_t > m_reserved_blks; // Keep track of all blks which are reserved as allocated + std::mutex m_reserve_blk_mtx; // Mutex used while removing marked_blks from blk_q folly::MPMCQueue< blk_num_t > m_free_blk_q; }; } // namespace homestore diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp index f1c0b3024..98c22fe17 100644 --- a/src/lib/common/crash_simulator.hpp +++ b/src/lib/common/crash_simulator.hpp @@ -9,7 +9,7 @@ namespace homestore { class CrashSimulator { public: - CrashSimulator(std::function< void(void) > cb = nullptr) : m_restart_cb{cb} {} + CrashSimulator(std::function< void(void) > cb = nullptr) : m_restart_cb{std::move(cb)} {} ~CrashSimulator() = default; void crash() { @@ -17,14 +17,17 @@ class CrashSimulator { m_crashed.update([](auto* s) { *s = true; }); // We can restart on a new thread to allow other operations to continue - std::thread t([this]() { m_restart_cb(); }); + std::thread t([cb = std::move(m_restart_cb)]() { + // Restart could destroy this pointer, so we are storing in local variable and then calling. + cb(); + }); t.detach(); } else { raise(SIGKILL); } } - bool is_crashed() const { return ((m_restart_cb != nullptr) && *(m_crashed.access().get())); } + bool is_crashed() const { return *(m_crashed.access().get()); } bool crash_if_flip_set(const std::string& flip_name) { if (iomgr_flip::instance()->test_flip(flip_name)) { diff --git a/src/lib/device/device.h b/src/lib/device/device.h index 16442e82f..beefdfc7f 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -151,6 +151,7 @@ class DeviceManager { bool is_first_time_boot() const { return m_first_time_boot; } void format_devices(); + void commit_formatting(); void load_devices(); void close_devices(); bool is_boot_in_degraded_mode() const { return m_boot_in_degraded_mode; } diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index 5d16fc601..ea8461c67 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -116,6 +116,7 @@ void DeviceManager::format_devices() { first_block* fblk = r_cast< first_block* >(buf); fblk->magic = first_block::HOMESTORE_MAGIC; fblk->checksum = 0; // Computed while writing the first block + fblk->formatting_done = 0x0; // Formatting is not done yet, until homestore is completely started fblk->hdr = m_first_blk_hdr; // Entire header is copied as is auto pdev_id = populate_pdev_info(dinfo, attr, m_first_blk_hdr.system_uuid, fblk->this_pdev_hdr); fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); @@ -176,6 +177,27 @@ void DeviceManager::load_devices() { load_vdevs(); } +void DeviceManager::commit_formatting() { + auto buf = hs_utils::iobuf_alloc(hs_super_blk::first_block_size(), sisl::buftag::superblk, 512); + for (auto& pdev : m_all_pdevs) { + if (!pdev) { continue; } + + auto err = pdev->read_super_block(buf, hs_super_blk::first_block_size(), hs_super_blk::first_block_offset()); + if (err) { + LOGERROR("Failed to read first block from device={}, error={}", pdev->get_devname(), err.message()); + continue; + } + + first_block* fblk = r_cast< first_block* >(buf); + fblk->formatting_done = 0x1; + fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size); + + pdev->write_super_block(buf, hs_super_blk::first_block_size(), hs_super_blk::first_block_offset()); + } + hs_utils::iobuf_free(buf, sisl::buftag::superblk); + LOGINFO("HomeStore formatting is committed on all physical devices"); +} + void DeviceManager::close_devices() { for (auto& pdev : m_all_pdevs) { if (pdev) { pdev->close_device(); } diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h index 460894b35..9f909c5ea 100644 --- a/src/lib/device/hs_super_blk.h +++ b/src/lib/device/hs_super_blk.h @@ -131,8 +131,10 @@ struct first_block { static constexpr uint32_t HOMESTORE_MAGIC{0xCEEDDEEB}; // Magic written as first bytes on each device public: - uint64_t magic{0}; // Header magic expected to be at the top of block - uint32_t checksum{0}; // Checksum of the entire first block (excluding this field) + uint64_t magic{0}; // Header magic expected to be at the top of block + uint32_t checksum{0}; // Checksum of the entire first block (excluding this field) + uint32_t formatting_done : 1 {0}; // Has formatting completed yet + uint32_t reserved : 31 {0}; first_block_header hdr; // Information about the entire system pdev_info_header this_pdev_hdr; // Information about the current pdev @@ -141,7 +143,8 @@ struct first_block { bool is_valid() const { return ((magic == HOMESTORE_MAGIC) && - (std::string(hdr.product_name) == std::string(first_block_header::PRODUCT_NAME))); + (std::string(hdr.product_name) == std::string(first_block_header::PRODUCT_NAME) && + (formatting_done != 0x0))); } std::string to_string() const { @@ -203,8 +206,7 @@ class hs_super_blk { return (dinfo.dev_size - 1) / min_chunk_size + 1; } static uint32_t min_chunk_size(HSDevType dtype) { - uint64_t min_chunk_size = - (dtype == HSDevType::Fast) ? MIN_CHUNK_SIZE_FAST_DEVICE : MIN_CHUNK_SIZE_DATA_DEVICE; + uint64_t min_chunk_size = (dtype == HSDevType::Fast) ? MIN_CHUNK_SIZE_FAST_DEVICE : MIN_CHUNK_SIZE_DATA_DEVICE; #ifdef _PRERELEASE auto chunk_size = iomgr_flip::instance()->get_test_flip< long >("set_minimum_chunk_size"); if (chunk_size) { min_chunk_size = chunk_size.get(); } diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index ae4cf5d18..37b32804f 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -696,6 +696,13 @@ void VirtualDev::cp_flush(VDevCPContext* v_cp_ctx) { // sync-ops during cp_flush, so return 100; int VirtualDev::cp_progress_percent() { return 100; } +void VirtualDev::recovery_completed() { + if (m_allocator_type != blk_allocator_type_t::append) { + m_chunk_selector->foreach_chunks( + [this](cshared< Chunk >& chunk) { chunk->blk_allocator_mutable()->recovery_completed(); }); + } +} + ///////////////////////// VirtualDev Private Methods ///////////////////////////// uint64_t VirtualDev::to_dev_offset(BlkId const& b, Chunk** chunk) const { *chunk = m_dmgr.get_chunk_mutable(b.chunk_num()); diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index 6901be0f4..c004e3c03 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -275,6 +275,8 @@ class VirtualDev { std::unique_ptr< CPContext > create_cp_context(CP* cp); + void recovery_completed(); + ////////////////////////// Standard Getters /////////////////////////////// virtual uint64_t available_blks() const; virtual uint64_t size() const { return m_vdev_info.vdev_size; } diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 24211e24d..af2d521c5 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -264,9 +264,17 @@ void HomeStore::do_start() { } } - m_cp_mgr->start_timer(); + // If this is the first time boot, we need to commit the formatting so that it will not be considered as first time + // boot going forward on next reboot. + if (m_dev_mgr->is_first_time_boot()) { + // Take the first CP after we have initialized all subsystems and wait for it to complete. + m_cp_mgr->trigger_cp_flush(true /* force */).get(); + m_dev_mgr->commit_formatting(); + } + m_cp_mgr->start_timer(); m_resource_mgr->start(m_dev_mgr->total_capacity()); + m_init_done = true; } diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 6c8d06395..955bd523f 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -51,7 +51,9 @@ void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, const IndexBuffe rec->append(op_t::parent_inplace, parent_buf->blkid()); if (parent_buf->is_meta_buf()) { rec->is_parent_meta = 0x1; } } - if (left_child_buf) { rec->append(op_t::child_inplace, left_child_buf->blkid()); } + if (left_child_buf && (left_child_buf != parent_buf)) { + rec->append(op_t::child_inplace, left_child_buf->blkid()); + } for (auto const& buf : created_bufs) { rec->append(op_t::child_new, buf->blkid()); } @@ -79,7 +81,7 @@ std::optional< IndexBufferPtr > IndexCPContext::next_dirty() { } std::string IndexCPContext::to_string() { - std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}", m_cp->id(), + std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}\n", m_cp->id(), m_dirty_buf_count.get(), m_dirty_buf_list.size())}; // Mapping from a node to all its parents in the graph. @@ -106,6 +108,42 @@ std::string IndexCPContext::to_string() { return str; } +void IndexCPContext::to_string_dot(const std::string& filename) { + std::ofstream file(filename); + if (!file.is_open()) { throw std::runtime_error("Failed to open file: " + filename); } + + file << "digraph G {\n"; + + // Mapping from a node to all its parents in the graph. + std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; + + m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) { + // Add this buf to his children. + parents[buf->m_up_buffer.get()].emplace_back(buf.get()); + }); + m_dirty_buf_list.foreach_entry([&file, &parents, this](IndexBufferPtr buf) { + std::vector< std::string > colors = {"lightgreen", "lightcoral", "lightyellow"}; + auto sbuf = BtreeNode::to_string_buf(buf->raw_buffer()); + auto pos = sbuf.find("LEAF"); + if (pos != std::string::npos) { + sbuf.insert(pos + 4, "
"); + } else { + pos = sbuf.find("INTERIOR"); + if (pos != std::string::npos) { sbuf.insert(pos + 8, "
"); } + } + file << fmt::format( + "\"{}\" [shape={}, label=< {}
{} >, fillcolor=\"{}\", style=\"filled\", fontname=\"bold\"];\n", + r_cast< void* >(buf.get()), m_cp->id() == buf->m_created_cp_id ? "ellipse" : "box", buf->to_string_dot(), + sbuf, colors[s_cast< int >(buf->state())]); + for (const auto& p : parents[buf.get()]) { + file << fmt::format("\"{}\" -> \"{}\";\n", r_cast< void* >(p), r_cast< void* >(buf.get())); + } + }); + file << "}\n"; + + file.close(); +} + std::string IndexCPContext::to_string_with_dags() { struct DagNode { IndexBufferPtr buf; @@ -189,13 +227,12 @@ std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) { void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, IndexBufferPtr >& buf_map) { auto cpg = cp_mgr().cp_guard(); - auto const rec_to_buf = [&buf_map, &cpg](txn_record const* rec, uint32_t idx, + auto const rec_to_buf = [&buf_map, &cpg](txn_record const* rec, bool is_meta, BlkId const& bid, IndexBufferPtr const& up_buf) -> IndexBufferPtr { - BlkId const bid = rec->blk_id(idx); IndexBufferPtr buf; auto it = buf_map.find(bid); if (it == buf_map.end()) { - if (rec->is_parent_meta) { + if (is_meta) { superblk< index_table_sb > tmp_sb; buf = std::make_shared< MetaIndexBuffer >(tmp_sb); } else { @@ -213,27 +250,86 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, if (up_buf) { DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer"); - up_buf->m_wait_for_down_buffers.increment(1); - buf->m_up_buffer = up_buf; + auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf; + +#ifndef NDEBUG + // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;} + // Already linked with same buf or its not a sibling link to override + bool found{false}; + for (auto const& dbuf : real_up_buf->m_down_buffers) { + if (dbuf.lock() == buf) { + found = true; + break; + } + } + if (found) { return buf; } + real_up_buf->m_down_buffers.emplace_back(buf); +#endif + + if (buf->m_up_buffer != real_up_buf) { + real_up_buf->m_wait_for_down_buffers.increment(1); + buf->m_up_buffer = real_up_buf; + } } return buf; }; uint32_t cur_idx = 0; - IndexBufferPtr parent_buf; - if (rec->has_inplace_parent) { parent_buf = rec_to_buf(rec, cur_idx++, nullptr); } + IndexBufferPtr parent_buf{nullptr}; + if (rec->has_inplace_parent) { parent_buf = rec_to_buf(rec, rec->is_parent_meta, rec->blk_id(cur_idx++), nullptr); } - IndexBufferPtr inplace_child_buf; - if (rec->has_inplace_child) { inplace_child_buf = rec_to_buf(rec, cur_idx++, parent_buf); } + IndexBufferPtr inplace_child_buf{nullptr}; + if (rec->has_inplace_child) { + inplace_child_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), parent_buf); + } for (uint8_t idx{0}; idx < rec->num_new_ids; ++idx) { - auto new_buf = rec_to_buf(rec, cur_idx++, inplace_child_buf); + auto new_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), + inplace_child_buf ? inplace_child_buf : parent_buf); new_buf->m_created_cp_id = cpg->id(); } for (uint8_t idx{0}; idx < rec->num_freed_ids; ++idx) { - auto freed_buf = rec_to_buf(rec, cur_idx++, inplace_child_buf); + auto freed_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), + inplace_child_buf ? inplace_child_buf : parent_buf); freed_buf->m_node_freed = true; } } + +void IndexCPContext::txn_journal::log_records() const { LOGINFO("{}", to_string()); } + +std::string IndexCPContext::txn_journal::to_string() const { + std::string str = fmt::format("cp_id={}, num_txns={}, size={}", cp_id, num_txns, size); + uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(txn_journal); + for (uint32_t t{0}; t < num_txns; ++t) { + txn_record const* rec = r_cast< txn_record const* >(cur_ptr); + fmt::format_to(std::back_inserter(str), "\n {}: {}", t, rec->to_string()); + cur_ptr += rec->size(); + } + return str; +} + +std::string IndexCPContext::txn_record::to_string() const { + auto add_to_string = [this](std::string& str, uint8_t& idx, uint8_t id_count) { + if (id_count == 0) { + fmt::format_to(std::back_inserter(str), "empty]"); + } else { + for (uint8_t i{0}; i < id_count; ++i, ++idx) { + fmt::format_to(std::back_inserter(str), "[chunk={}, blk={}],", ids[idx].second, ids[idx].first); + } + fmt::format_to(std::back_inserter(str), "]"); + } + }; + + std::string str = fmt::format("ordinal={}, parent=[{}], in_place_child=[{}]", index_ordinal, parent_id_string(), + child_id_string(), num_new_ids, num_freed_ids); + + uint8_t idx = (has_inplace_parent == 0x1) ? 1 : 0 + (has_inplace_child == 0x1) ? 1 : 0; + fmt::format_to(std::back_inserter(str), ", new_ids=["); + add_to_string(str, idx, num_new_ids); + + fmt::format_to(std::back_inserter(str), ", freed_ids=["); + add_to_string(str, idx, num_freed_ids); + return str; +} } // namespace homestore diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index 044d935dc..1b8a2a2b0 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -34,52 +34,76 @@ struct IndexCPContext : public VDevCPContext { using compact_blkid_t = std::pair< blk_num_t, chunk_num_t >; enum class op_t : uint8_t { child_new, child_freed, parent_inplace, child_inplace }; struct txn_record { + uint8_t has_inplace_parent : 1; // Do we have parent_id in the list of ids. It will be first + uint8_t has_inplace_child : 1; // Do we have child_id in the list of ids. It will be second + uint8_t is_parent_meta : 1; // Is the parent buffer a meta buffer + uint8_t reserved1 : 5; uint8_t num_new_ids; uint8_t num_freed_ids; - uint8_t has_inplace_parent : 1; - uint8_t has_inplace_child : 1; - uint8_t is_parent_meta : 1; // Is the parent buffer a meta buffer - uint8_t reserved1 : 5; uint8_t reserved{0}; uint32_t index_ordinal; compact_blkid_t ids[1]; // C++ std probhits 0 size array txn_record(uint32_t ordinal) : - num_new_ids{0}, - num_freed_ids{0}, has_inplace_parent{0x0}, has_inplace_child{0x0}, is_parent_meta{0x0}, + num_new_ids{0}, + num_freed_ids{0}, index_ordinal{ordinal} {} uint32_t total_ids() const { - return (num_new_ids + num_freed_ids + has_inplace_parent ? 1 : 0 + has_inplace_child ? 1 : 0); + return (num_new_ids + num_freed_ids + ((has_inplace_parent == 0x1) ? 1 : 0) + + ((has_inplace_child == 0x1) ? 1 : 0)); } - uint32_t size() const { return sizeof(txn_record) - (total_ids() - 1) * sizeof(compact_blkid_t); } + uint32_t size() const { return sizeof(txn_record) + (total_ids() - 1) * sizeof(compact_blkid_t); } static uint32_t size_for_num_ids(uint8_t n) { return sizeof(txn_record) + (n - 1) * sizeof(compact_blkid_t); } + + uint32_t next_slot() const { + return ((has_inplace_parent == 0x1) ? 1 : 0) + ((has_inplace_child == 0x1) ? 1 : 0) + num_new_ids + + num_freed_ids; + } + void append(op_t op, BlkId const& blk) { + auto const compact_blk = std::make_pair(blk.blk_num(), blk.chunk_num()); + auto const slot = next_slot(); if (op == op_t::parent_inplace) { DEBUG_ASSERT(has_inplace_parent == 0x0, "Duplicate inplace parent in same txn record"); + DEBUG_ASSERT((has_inplace_child == 0x0) && (num_new_ids == 0) && (num_freed_ids == 0), + "Ordering of append is not correct"); has_inplace_parent = 0x1; } else if (op == op_t::child_inplace) { DEBUG_ASSERT(has_inplace_child == 0x0, "Duplicate inplace child in same txn record"); has_inplace_child = 0x1; } else if (op == op_t::child_new) { DEBUG_ASSERT_LT(num_new_ids, 0xff, "Too many new ids in txn record"); - ids[num_new_ids++] = std::make_pair(blk.blk_num(), blk.chunk_num()); + ++num_new_ids; } else if (op == op_t::child_freed) { DEBUG_ASSERT_LT(num_freed_ids, 0xff, "Too many freed ids in txn record"); - ids[num_freed_ids++] = std::make_pair(blk.blk_num(), blk.chunk_num()); + ++num_freed_ids; } else { DEBUG_ASSERT(false, "Invalid op type"); } + ids[slot] = compact_blk; } BlkId blk_id(uint8_t idx) const { DEBUG_ASSERT_LT(idx, total_ids(), "Index out of bounds"); return BlkId{ids[idx].first, (blk_count_t)1u, ids[idx].second}; } + + std::string parent_id_string() const { + return (has_inplace_parent == 0x1) ? fmt::format("chunk={}, blk={}", ids[0].second, ids[0].first) : "empty"; + } + + std::string child_id_string() const { + auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; + return (has_inplace_child == 0x1) ? fmt::format("chunk={}, blk={}", ids[idx].second, ids[idx].first) + : "empty"; + } + + std::string to_string() const; }; struct txn_journal { @@ -103,6 +127,9 @@ struct IndexCPContext : public VDevCPContext { ++num_txns; return append_guard(this, ordinal); } + + std::string to_string() const; + void log_records() const; }; #pragma pack() @@ -135,6 +162,7 @@ struct IndexCPContext : public VDevCPContext { std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); std::string to_string_with_dags(); + void to_string_dot(const std::string& filename); private: void check_cycle(); diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index 453b802be..cc199bbd5 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -33,7 +33,7 @@ IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs) : m_svc meta_service().register_handler( "index", [this](meta_blk* mblk, sisl::byte_view buf, size_t size) { - itable_meta_blk_found(std::move(buf), voidptr_cast(mblk)); + m_itable_sbs.emplace_back(std::pair{mblk, std::move(buf)}); }, nullptr); @@ -67,18 +67,26 @@ shared< VirtualDev > IndexService::open_vdev(const vdev_info& vinfo, bool load_e uint32_t IndexService::reserve_ordinal() { return m_ordinal_reserver->reserve(); } -void IndexService::itable_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie) { - // We have found an index table superblock. Notify the callback which should convert the superblock into actual - // IndexTable instance - superblk< index_table_sb > sb; - sb.load(buf, meta_cookie); - add_index_table(m_svc_cbs->on_index_table_found(std::move(sb))); -} - void IndexService::start() { // Start Writeback cache - m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, std::move(m_wbcache_sb), hs()->evictor(), + m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, m_wbcache_sb, hs()->evictor(), hs()->device_mgr()->atomic_page_size(HSDevType::Fast)); + + // Load any index tables which are to loaded from meta blk + for (auto const& [meta_cookie, buf] : m_itable_sbs) { + superblk< index_table_sb > sb; + sb.load(buf, meta_cookie); + add_index_table(m_svc_cbs->on_index_table_found(std::move(sb))); + } + + // Recover the writeback cache, which in-turns recovers any index table nodes + m_wb_cache->recover(m_wbcache_sb.second); + + // Notify each table that we have completed recovery + std::unique_lock lg(m_index_map_mtx); + for (const auto& [_, tbl] : m_index_map) { + tbl->recovery_completed(); + } } void IndexService::stop() { m_wb_cache.reset(); } @@ -109,12 +117,11 @@ std::shared_ptr< IndexTableBase > IndexService::get_index_table(uint32_t ordinal void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf) { auto tbl = get_index_table(ordinal); - if (tbl) { tbl->repair_node(node_buf); } -} - -void IndexService::repair_index_root(uint32_t ordinal, IndexBufferPtr const& root_buf) { - auto tbl = get_index_table(ordinal); - if (tbl) { tbl->repair_root(root_buf); } + if (tbl) { + tbl->repair_node(node_buf); + } else { + HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", ordinal); + } } uint32_t IndexService::node_size() const { return m_vdev->atomic_page_size(); } @@ -144,11 +151,35 @@ std::string IndexBuffer::to_string() const { voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()), m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed); } else { - return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={} {} node=[{}] ", + // store m_down_buffers in a string + std::string down_bufs = ""; +#ifndef NDEBUG + for (auto const& down_buf : m_down_buffers) { + if (auto ptr = down_buf.lock()) { + fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get())); + } + } +#endif + + return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} up={} node=[{}] down=[{}]", voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()), - m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed ? "Freed" : "", - r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string()); + m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), + m_node_freed ? " Freed" : "", voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())), + (m_bytes == nullptr) ? "not attached yet" + : r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(), + down_bufs); + } +} +std::string IndexBuffer::to_string_dot() const { + auto str = fmt::format("IndexBuffer {} ", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this))); + if (m_bytes == nullptr) { + fmt::format_to(std::back_inserter(str), " node_buf=nullptr "); + } else { + fmt::format_to(std::back_inserter(str), " node_buf={} {} created/dirtied={}/{} {} down_wait#={}", + static_cast< void* >(m_bytes), m_is_meta_buf ? "[META]" : "", m_created_cp_id, m_dirtied_cp_id, + m_node_freed ? "FREED" : "", m_wait_for_down_buffers.get()); } + return str; } MetaIndexBuffer::MetaIndexBuffer(superblk< index_table_sb >& sb) : IndexBuffer{nullptr, BlkId{}}, m_sb{sb} { diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 6bbb6d809..6c0c722ae 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -54,7 +54,6 @@ IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< // We need to register the consumer first before recovery, so that recovery can use the cp_ctx created to add/track // recovered new nodes. cp_mgr().register_consumer(cp_consumer_t::INDEX_SVC, std::move(std::make_unique< IndexCPCallbacks >(this))); - recover(std::move(sb.second)); } void IndexWBCache::start_flush_threads() { @@ -102,9 +101,11 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { idx_buf->m_dirtied_cp_id = cpg->id(); auto node = node_initializer(idx_buf); - // Add the node to the cache - bool done = m_cache.insert(node); - HS_REL_ASSERT_EQ(done, true, "Unable to add alloc'd node to cache, low memory or duplicate inserts?"); + if (!m_in_recovery) { + // Add the node to the cache. Skip if we are in recovery mode. + bool done = m_cache.insert(node); + HS_REL_ASSERT_EQ(done, true, "Unable to add alloc'd node to cache, low memory or duplicate inserts?"); + } // The entire index is updated in the commit path, so we alloc the blk and commit them right away auto alloc_status = m_vdev->commit_blk(blkid); @@ -115,9 +116,19 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) { // TODO upsert always returns false even if it succeeds. - if (node != nullptr) { m_cache.upsert(node); } - r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); - resource_mgr().inc_dirty_buf_size(m_node_size); + if (m_in_recovery) { + if (buf->is_meta_buf()) { + auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb; + meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk()); + } else { + m_vdev->sync_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid); + } + } else { + if (node != nullptr) { m_cache.upsert(node); } + LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string()); + r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); + resource_mgr().inc_dirty_buf_size(m_node_size); + } } void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) { @@ -125,7 +136,7 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t retry: // Check if the blkid is already in cache, if not load and put it into the cache - if (m_cache.get(blkid, node)) { return; } + if (!m_in_recovery && m_cache.get(blkid, node)) { return; } // Read the buffer from virtual device auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size()); @@ -135,10 +146,12 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t node = node_initializer(idx_buf); // Push the node into cache - bool done = m_cache.insert(node); - if (!done) { - // There is a race between 2 concurrent reads from vdev and other party won the race. Re-read from cache - goto retry; + if (!m_in_recovery) { + bool done = m_cache.insert(node); + if (!done) { + // There is a race between 2 concurrent reads from vdev and other party won the race. Re-read from cache + goto retry; + } } } @@ -184,6 +197,7 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte new_buf->m_dirtied_cp_id = cp_ctx->id(); write_buf(nullptr, new_buf, cp_ctx); meta_buf = new_buf; // Replace the meta_buf with new buf + LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id()); } return true; } @@ -193,13 +207,16 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) { // TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a // bunch of times. - if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) { + if (parent_buf && parent_buf->is_meta_buf()) { + // Split or merge happening on root + if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) { + parent_buf->set_crash_flag(); + } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) { + child_buf->set_crash_flag(); + } + } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) { // Its a split node situation - if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_meta")) { - if (parent_buf->is_meta_buf()) { parent_buf->set_crash_flag(); } - } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_root")) { - if (parent_buf->is_meta_buf()) { child_buf->set_crash_flag(); } - } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) { + if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) { parent_buf->set_crash_flag(); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_left_child")) { child_buf->set_crash_flag(); @@ -208,11 +225,7 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() != freed_node_bufs.size())) { // Its a merge nodes sitation - if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_meta")) { - if (parent_buf->is_meta_buf()) { parent_buf->set_crash_flag(); } - } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_root")) { - if (parent_buf->is_meta_buf()) { child_buf->set_crash_flag(); } - } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) { + if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) { parent_buf->set_crash_flag(); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_left_child")) { child_buf->set_crash_flag(); @@ -221,11 +234,7 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con } } else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) { // Its a rebalance node situation - if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_meta")) { - if (parent_buf->is_meta_buf()) { parent_buf->set_crash_flag(); } - } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_root")) { - if (parent_buf->is_meta_buf()) { child_buf->set_crash_flag(); } - } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) { + if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) { parent_buf->set_crash_flag(); } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_left_child")) { child_buf->set_crash_flag(); @@ -261,9 +270,28 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p } } - icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer, - new_node_bufs.size() ? new_node_bufs[0]->m_up_buffer : nullptr, new_node_bufs, - freed_node_bufs); + if (new_node_bufs.empty() && freed_node_bufs.empty()) { + // This is an update for meta, root transaction. + if (child_buf->m_created_cp_id != -1) { + DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(), + "Root buffer is not created by current cp (for split root), its not expected"); + } + icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {}); + } else { + icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal + child_buf->m_up_buffer, // real up buffer + new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer + : new_node_bufs[0]->m_up_buffer, // real in place child + new_node_bufs, // new node bufs + freed_node_bufs // free_node_bufs + ); + } +#if 0 + static int id = 0; + auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot"; + LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename); + icp_ctx->to_string_dot(filename); +#endif } void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& down_buf, bool is_sibling_link, @@ -346,8 +374,10 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { BtreeNodePtr node; - bool done = m_cache.remove(buf->m_blkid, node); - HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?"); + if (!m_in_recovery) { + bool done = m_cache.remove(buf->m_blkid, node); + HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?"); + } resource_mgr().inc_free_blk(m_node_size); m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx)); @@ -356,7 +386,12 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { //////////////////// Recovery Related section ///////////////////////////////// void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. - if ((sb.bytes() == nullptr) || (sb.size() == 0)) { return; } + if ((sb.bytes() == nullptr) || (sb.size() == 0)) { + m_vdev->recovery_completed(); + return; + } + + m_in_recovery = true; // For entirity of this call, we should mark it as being recovered. // Recover the CP Context with the buf_map of all the buffers that were dirtied in the last cp with its // relationship (up/down buf links) as it was by the cp that was flushing the buffers prior to unclean shutdown. @@ -364,58 +399,84 @@ void IndexWBCache::recover(sisl::byte_view sb) { auto icp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC)); std::map< BlkId, IndexBufferPtr > bufs = icp_ctx->recover(std::move(sb)); - // With all the buffers recovered, we first make the decision of which blk to keep and which blk to free. This - // is needed so that subsequent repair can do blk allocation and shouldn't incorrectly allocate the blks which - // are going to be committed later. - std::vector< IndexBufferPtr > new_bufs; + LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... ", icp_ctx->id(), + bufs.size()); + + // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one + // addition of all freed buffers also put in the DAG structure. + // + // We do repair/recovery as 2 passes. A quick glance would look like we don't need 2 passes of the walking through + // all the buffers, but it is essential. + // + // In the first pass, we look for any new bufs and any freed bufs and commit/free their corresponding node blkids. + // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate + // the same blkid which could clash with the blkid next in the buf list. + // + // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed. + std::vector< IndexBufferPtr > l0_bufs; for (auto const& [_, buf] : bufs) { - if (buf->m_node_freed) { - // If the node was freed according txn records, we need to check if up_buf node was also written - if (was_node_committed(buf->m_up_buffer)) { - // Up buffer was written, so this buffer can be freed and thus can free the blk. - m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); - } - buf->m_up_buffer->m_wait_for_down_buffers.decrement(); - } else if (buf->m_created_cp_id == icp_ctx->id()) { - if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) { - // The buffer was newly created on this cp, and its up_buffer is also written, we need to commit - // this blk again, so that it will not be reallocated for other node during repair. - m_vdev->commit_blk(buf->m_blkid); - new_bufs.push_back(buf); + if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) { + if (was_node_committed(buf)) { + if (was_node_committed(buf->m_up_buffer)) { + if (buf->m_node_freed) { + // Up buffer was written, so this buffer can be freed and thus can free the blk. + m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx)); + } else { + m_vdev->commit_blk(buf->m_blkid); + } + l0_bufs.push_back(buf); + } else { + buf->m_up_buffer->m_wait_for_down_buffers.decrement(); + } } } } + LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", + l0_bufs.size(), bufs.size(), icp_ctx->id()); + + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, + std::vector< IndexBufferPtr > const& l0_bufs) { + // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs + std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); + for (auto const& [_, buf] : bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + + // list of new_bufs + fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); + for (auto const& buf : l0_bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + return log; + }; + LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); + // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in // do_repair flag. - for (auto const& buf : new_bufs) { - process_up_buf(buf->m_up_buffer, false /* do_repair */); + for (auto const& buf : l0_bufs) { + recover_buf(buf->m_up_buffer); } + m_in_recovery = false; + m_vdev->recovery_completed(); } -void IndexWBCache::process_up_buf(IndexBufferPtr const& buf, bool do_repair) { - if (do_repair) { - // If the buffer needs to be repaired, reset the dirtied_cp_id so once all down buffers are done, it does - // repair - buf->m_dirtied_cp_id = -1; - } - +void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; } - // One of the down buffer indicated that it has to repair our node, so we issue a repair - if (buf->m_dirtied_cp_id == -1) { index_service().repair_index_node(buf->m_index_ordinal, buf); } - - // If there is an up buffer on next level, we need to process them and ask them to repair in case they were not - // written as part of this CP. - if (buf->m_up_buffer) { - if (buf->m_up_buffer->is_meta_buf()) { - // If the up_buf is meta buffer, then we found the new root, repair the current node accordingly - index_service().repair_index_root(buf->m_index_ordinal, buf); - } else { - process_up_buf(buf->m_up_buffer, !was_node_committed(buf->m_up_buffer)); - } + // All down buffers are completed and given a nod saying that they are committed. If this buffer is not committed, + // then we need to repair this node/buffer. After that we will keep going to the next up level to repair them if + // needed + if (!was_node_committed(buf)) { + LOGDEBUGMOD(wbcache, "Index Recovery detected uncommitted up node [{}], repairing it", buf->to_string()); + index_service().repair_index_node(buf->m_index_ordinal, buf); + } else { + LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed no need to repair that", + buf->to_string()); } + + if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); } } bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { @@ -442,12 +503,25 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "cp_ctx {}", cp_ctx->to_string()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags()); if (!cp_ctx->any_dirty_buffers()) { - CP_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Btree does not have any dirty buffers to flush"); + if (cp_ctx->id() == 0) { + // For the first CP, we need to flush the journal buffer to the meta blk + LOGINFO("First time boot cp, we shall flush the vdev to ensure all cp information is created"); + m_vdev->cp_flush(cp_ctx); + } else { + CP_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Btree does not have any dirty buffers to flush"); + } return folly::makeFuture< bool >(true); // nothing to flush } +#ifdef _PRERELEASE + if (hs()->crash_simulator().is_crashed()) { + LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush"); + return folly::makeFuture< bool >(true); + } +#endif + // First thing is to flush the new_blks created as part of the CP. auto const& journal_buf = cp_ctx->journal_buf(); if (journal_buf.size() != 0) { @@ -475,16 +549,23 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { } void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { - LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); - buf->set_state(index_buf_state_t::FLUSHING); - #ifdef _PRERELEASE if (buf->m_crash_flag_on) { - LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); + std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; + LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); + cp_ctx->to_string_dot(filename); hs()->crash_simulator().crash(); + cp_ctx->complete(true); + return; + } else if (hs()->crash_simulator().is_crashed()) { + LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); + return; } #endif + LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string()); + buf->set_state(index_buf_state_t::FLUSHING); + if (buf->is_meta_buf()) { LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(), buf->to_string()); @@ -509,6 +590,13 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const } void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) { +#ifdef _PRERELEASE + if (hs()->crash_simulator().is_crashed()) { + LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); + return; + } +#endif + LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); resource_mgr().dec_dirty_buf_size(m_node_size); auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf); diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 8f08f73a1..209d3845e 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -40,6 +40,7 @@ class IndexWBCache : public IndexWBCacheBase { std::vector< iomgr::io_fiber_t > m_cp_flush_fibers; std::mutex m_flush_mtx; void* m_meta_blk; + bool m_in_recovery{false}; public: IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, @@ -59,7 +60,7 @@ class IndexWBCache : public IndexWBCacheBase { //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > async_cp_flush(IndexCPContext* context); IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* cp_ctx) const; - void recover(sisl::byte_view sb); + void recover(sisl::byte_view sb) override; private: void start_flush_threads(); @@ -75,7 +76,7 @@ class IndexWBCache : public IndexWBCacheBase { void get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtr const& prev_flushed_buf, IndexBufferPtrList& bufs); - void process_up_buf(IndexBufferPtr const& buf, bool do_repair); + void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index a20904b8f..eab66ece1 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -71,16 +71,15 @@ if (${io_tests}) set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp) add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES}) target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) - #TODO : Fix the test case and enable it add_test(NAME IndexBtree COMMAND test_index_btree --gtest_filter=*/0.*) set_property(TEST IndexBtree PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") set_tests_properties(IndexBtree PROPERTIES TIMEOUT 1200) - #set(TEST_RECOVERY_INDEX_SOURCE_FILES test_index_recovery.cpp) - #add_executable(test_index_recovery ${TEST_RECOVERY_INDEX_SOURCE_FILES}) - #target_link_libraries(test_index_recovery homestore ${COMMON_TEST_DEPS} GTest::gtest) - #add_test(NAME IndexRecovery COMMAND test_index_recovery) - # set_property(TEST IndexRecovery PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") + set(TEST_RECOVERY_INDEX_SOURCE_FILES test_index_crash_recovery.cpp) + add_executable(test_index_crash_recovery ${TEST_RECOVERY_INDEX_SOURCE_FILES}) + target_link_libraries(test_index_crash_recovery homestore ${COMMON_TEST_DEPS} GTest::gtest) + add_test(NAME IndexCrashRecovery COMMAND test_index_crash_recovery) + set_property(TEST IndexCrashRecovery PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") add_executable(test_data_service) target_sources(test_data_service PRIVATE test_data_service.cpp) diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 950884903..298235068 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -274,7 +274,7 @@ struct BtreeTestHelper { qreq.enable_route_tracing(); auto const ret = m_bt->query(qreq, out_vector); auto const expected_count = std::min(remaining, batch_size); - + // this->print_keys(); ASSERT_EQ(out_vector.size(), expected_count) << "Received incorrect value on query pagination"; if (remaining < batch_size) { @@ -374,8 +374,29 @@ struct BtreeTestHelper { run_in_parallel(op_list); } - void print(const std::string& file = "") const { m_bt->print_tree(file); } - void print_keys() const { m_bt->print_tree_keys(); } + void dump_to_file(const std::string& file = "") const { m_bt->dump_tree_to_file(file); } + void print_keys(const std::string& preamble = "") const { + auto print_key_range = [](std::vector< std::pair< K, V > > const& kvs) -> std::string { + uint32_t start = 0; + std::string str; + for (uint32_t i{1}; i <= kvs.size(); ++i) { + if ((i == kvs.size()) || (kvs[i].first.key() != kvs[i - 1].first.key() + 1)) { + if ((i - start) > 1) { + fmt::format_to(std::back_inserter(str), "{}-{}{}", kvs[start].first.key(), + kvs[i - 1].first.key(), (i == kvs.size()) ? "" : ", "); + } else { + fmt::format_to(std::back_inserter(str), "{}{}", kvs[start].first.key(), + (i == kvs.size()) ? "" : ", "); + } + start = i; + } + } + return str; + }; + + LOGINFO("{}{}", preamble.empty() ? "" : preamble + ":\n", m_bt->to_custom_string(print_key_range)); + } + void visualize_keys(const std::string& file) const { m_bt->visualize_tree_keys(file); } void compare_files(const std::string& before, const std::string& after) { std::ifstream b(before, std::ifstream::ate); diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index a91dd39b8..9818c8a45 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -3,6 +3,7 @@ #include "btree_test_kvs.hpp" + template < typename K, typename V > class ShadowMap { private: @@ -11,7 +12,11 @@ class ShadowMap { uint32_t m_max_keys; using mutex = iomgr::FiberManagerLib::shared_mutex; mutex m_mutex; - +//#define SHOWM(X) cout << #X " = " << (X) << endl +// void testPrint(std::map< uint32_t, std::string >& m_map, int i) { +// SHOWM(m[i]); +// SHOWM(m.find(i)->first); +// } public: ShadowMap(uint32_t num_keys) : m_range_scheduler(num_keys), m_max_keys{num_keys} {} @@ -142,7 +147,7 @@ class ShadowMap { auto it2 = other.m_map.begin(); std::vector< std::pair< K, bool > > ret_diff; - while ((it1 != m_map.end()) && (it2 != m_map.end())) { + while ((it1 != m_map.end()) && (it2 != other.m_map.end())) { auto const x = it1->first.compare(it2->first); if (x == 0) { ++it1; @@ -163,7 +168,7 @@ class ShadowMap { } while (it2 != other.m_map.end()) { - ret_diff.emplace_back(it1->first, false /* addition */); + ret_diff.emplace_back(it2->first, false /* addition */); ++it2; } return ret_diff; diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index b82ae2466..7a25c9b12 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -198,7 +198,10 @@ class HSTestHelper { test_params& params(uint32_t svc) { return m_token.svc_params_[svc]; } #ifdef _PRERELEASE - void wait_for_crash_recovery() { m_crash_recovered.getFuture().get(); } + void wait_for_crash_recovery() { + m_crash_recovered.getFuture().get(); + m_crash_recovered = folly::Promise< folly::Unit >(); + } #endif void set_min_chunk_size(uint64_t chunk_size) { diff --git a/src/tests/test_device_manager.cpp b/src/tests/test_device_manager.cpp index 6ddb8aec5..4077bc917 100644 --- a/src/tests/test_device_manager.cpp +++ b/src/tests/test_device_manager.cpp @@ -78,7 +78,12 @@ class DeviceMgrTest : public ::testing::Test { return std::make_shared< homestore::VirtualDev >(*m_dmgr, vinfo_tmp, nullptr /* event_cb */, false); }); - m_dmgr->is_first_time_boot() ? m_dmgr->format_devices() : m_dmgr->load_devices(); + if (m_dmgr->is_first_time_boot()) { + m_dmgr->format_devices(); + m_dmgr->commit_formatting(); + } else { + m_dmgr->load_devices(); + } m_pdevs = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Data); m_vdevs = m_dmgr->get_vdevs(); } diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 68e6bbee0..6b5ff27ab 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -288,7 +288,7 @@ TYPED_TEST(BtreeTest, CpFlush) { LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries); this->do_query(0, num_entries - 1, 75); - this->print(std::string("before.txt")); + this->dump_to_file(std::string("before.txt")); this->destroy_btree(); @@ -298,7 +298,7 @@ TYPED_TEST(BtreeTest, CpFlush) { std::this_thread::sleep_for(std::chrono::seconds{1}); LOGINFO("Restarted homestore with index recovered"); - this->print(std::string("after.txt")); + this->dump_to_file(std::string("after.txt")); LOGINFO("Query {} entries", num_entries); this->do_query(0, num_entries - 1, 1000); @@ -336,7 +336,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries); this->do_query(0, num_entries - 1, 75); - this->print(std::string("before.txt")); + this->dump_to_file(std::string("before.txt")); this->destroy_btree(); @@ -345,7 +345,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { std::this_thread::sleep_for(std::chrono::seconds{1}); LOGINFO(" Restarted homestore with index recovered"); - this->print(std::string("after.txt")); + this->dump_to_file(std::string("after.txt")); this->compare_files("before.txt", "after.txt"); @@ -400,7 +400,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries); this->do_query(0, num_entries - 1, 75); - this->print(std::string("before.txt")); + this->dump_to_file(std::string("before.txt")); this->destroy_btree(); // Restart homestore. m_bt is updated by the TestIndexServiceCallback. @@ -408,7 +408,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { std::this_thread::sleep_for(std::chrono::seconds{1}); LOGINFO(" Restarted homestore with index recovered"); - this->print(std::string("after.txt")); + this->dump_to_file(std::string("after.txt")); this->compare_files("before.txt", "after.txt"); diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp new file mode 100644 index 000000000..2cc932315 --- /dev/null +++ b/src/tests/test_index_crash_recovery.cpp @@ -0,0 +1,289 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include +#include + +#include +#include "common/homestore_config.hpp" +#include "common/resource_mgr.hpp" +#include "test_common/homestore_test_common.hpp" +#include "test_common/range_scheduler.hpp" +#include "btree_helpers/btree_test_helper.hpp" +#include "btree_helpers/btree_test_kvs.hpp" +#include "btree_helpers/btree_decls.h" + +using namespace homestore; + +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup) +SISL_LOGGING_DECL(test_index_crash_recovery) + +// TODO Add tests to do write,remove after recovery. +// TODO Test with var len key with io mgr page size is 512. + +SISL_OPTION_GROUP( + test_index_crash_recovery, + (num_iters, "", "num_iters", "number of iterations for rand ops", + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + (num_entries, "", "num_entries", "number of entries to test with", + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), + (operation_list, "", "operation_list", "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (preload_size, "", "preload_size", "number of entries to preload tree with", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), + (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", + ::cxxopts::value< bool >()->default_value("1"), ""), + (seed, "", "seed", "random engine seed, use random if not defined", + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + +void log_obj_life_counter() { + std::string str; + sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) { + fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive); + }); + LOGINFO("Object Life Counter\n:{}", str); +} + +#ifdef _PRERELEASE +template < typename TestType > +struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { + using T = TestType; + using K = typename TestType::KeyType; + using V = typename TestType::ValueType; + class TestIndexServiceCallbacks : public IndexServiceCallbacks { + public: + TestIndexServiceCallbacks(IndexCrashTest* test) : m_test(test) {} + + std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { + LOGINFO("Index table recovered, root bnode_id {} version {}", sb->root_node, sb->root_link_version); + m_test->m_cfg = BtreeConfig(hs()->index_service().node_size()); + m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; + m_test->m_cfg.m_int_node_type = T::interior_node_type; + m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg); + return m_test->m_bt; + } + + private: + IndexCrashTest* m_test; + }; + + IndexCrashTest() : testing::Test() { this->m_is_multi_threaded = true; } + + void SetUp() override { + // Set the cp_timer_us to very high value to avoid any automatic checkpointing. + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.generic.cache_max_throttle_cnt = 10000; + s.generic.cp_timer_us = 0x8000000000000000; + s.resource_limits.dirty_buf_percent = 100; + HS_SETTINGS_FACTORY().save(); + }); + + this->start_homestore( + "test_index_crash_recovery", + {{HS_SERVICE::META, {.size_pct = 10.0}}, + {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}, + nullptr, {}, SISL_OPTIONS["init_device"].as< bool >()); + + LOGINFO("Node size {} ", hs()->index_service().node_size()); + this->m_cfg = BtreeConfig(hs()->index_service().node_size()); + + auto uuid = boost::uuids::random_generator()(); + auto parent_uuid = boost::uuids::random_generator()(); + + homestore::hs()->resource_mgr().reset_dirty_buf_qd(); + + // Create index table and attach to index service. + BtreeTestHelper< TestType >::SetUp(); + if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { + this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + } else { + populate_shadow_map(); + } + + hs()->index_service().add_index_table(this->m_bt); + LOGINFO("Added index table to index service"); + } + + void populate_shadow_map() { + this->m_shadow_map.load(m_shadow_filename); + ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) + << "shadow map size and tree size mismatch"; + this->get_all(); + } + + void restart_homestore(uint32_t shutdown_delay_sec = 3) override { + this->params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this); + LOGINFO("\n\n\n\n\n\n shutdown homestore for index service Test\n\n\n\n\n"); + // this->m_shadow_map.save(this->m_shadow_filename); + test_common::HSTestHelper::restart_homestore(shutdown_delay_sec); + } + + void reapply_after_crash() { + ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; + snapshot_map.load(m_shadow_filename); + LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); + auto diff = this->m_shadow_map.diff(snapshot_map); + + // visualize tree after crash + // std::string recovered_tree_filename = "tree_after_crash_" + to_string(rand() % 100) + ".dot"; + // this->visualize_keys(recovered_tree_filename); + // LOGINFO(" tree after recovered stored in {}", recovered_tree_filename); + + std::string dif_str = "KEY \tADDITION\n"; + for (const auto& [k, addition] : diff) { + dif_str += fmt::format(" {} \t{}\n", k.key(), addition); + } + LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); + + for (const auto& [k, addition] : diff) { + // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); + // this->visualize_keys(recovered_tree_filename); + if (addition) { this->force_upsert(k.key()); } + } + test_common::HSTestHelper::trigger_cp(true); + this->m_shadow_map.save(m_shadow_filename); + } + + void TearDown() override { + bool cleanup = SISL_OPTIONS["cleanup_after_shutdown"].as< bool >(); + LOGINFO("cleanup the dump map and index data? {}", cleanup); + if (!cleanup) { + this->m_shadow_map.save(m_shadow_filename); + } else { + if (std::filesystem::remove(m_shadow_filename)) { + LOGINFO("File {} removed successfully", m_shadow_filename); + } else { + LOGINFO("Error: failed to remove {}", m_shadow_filename); + } + } + LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), + this->m_bt->count_keys(this->m_bt->root_node_id())); + BtreeTestHelper< TestType >::TearDown(); + this->shutdown_homestore(false); + } + void crash_and_recover(uint32_t s_key, uint32_t e_key) { + this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + test_common::HSTestHelper::trigger_cp(false); + this->wait_for_crash_recovery(); + // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot"); + + this->print_keys("Post crash and recovery, btree structure: "); + this->reapply_after_crash(); + + this->get_all(); + LOGINFO(" except to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); + ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) + << "shadow map size and tree size mismatch"; + } + uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); } + +protected: + const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt"; +}; + +// Crash recovery can test one simple btree, since focus is not on btree test itself, but index recovery +using BtreeTypes = testing::Types< FixedLenBtree >; +TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes); + +TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { + // Simulate the crash even before first cp + this->set_basic_flip("crash_flush_on_root"); + + auto ops = this->build_op_list({"put:100"}); + this->multi_op_execute(ops, true /* skip_preload */); + + // Trigger a cp, which should induce the crash and wait for hs to recover + test_common::HSTestHelper::trigger_cp(false); + this->wait_for_crash_recovery(); + + // Post crash, load the shadow_map into a new instance and compute the diff. Redo the operation + this->reapply_after_crash(); +} + +TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { + // Insert into 4 phases, first fill up the last part, since we need to test split on left edge + LOGINFO("Step 1: Fill up the last quarter of the tree"); + auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + for (auto k = num_entries * 3 / 4; k < num_entries; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + + // Trigger the cp to make sure middle part is successful + LOGINFO("Step 2: Flush all the entries so far"); + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + this->m_shadow_map.save(this->m_shadow_filename); + + // Now fill the entries from first and the leftmost child will always split, with crash flip set during flush phase + LOGINFO("Step 3: Fill the 3rd quarter of the tree, to make sure left child is split and we crash on flush of the " + "new child"); + this->set_basic_flip("crash_flush_on_split_at_right_child"); + for (auto k = num_entries / 2; k < num_entries * 3 / 4; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + LOGINFO("Step 4: Crash and reapply the missing entries to tree"); + this->crash_and_recover(num_entries / 2, num_entries); + + // TODO: Uncomment this once we do a fix for the inconsistent query results + LOGINFO("Step 5: Fill the 2nd quarter of the tree, to make sure left child is split and we crash on flush of the " + "left child"); + this->set_basic_flip("crash_flush_on_split_at_left_child"); + this->visualize_keys("tree_before_insert.dot"); + for (auto k = num_entries / 4; k < num_entries / 2; ++k) { + // LOGINFO("inserting key {}", k); + // this->visualize_keys("tree_before_" + to_string(k) + ".dot"); + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + this->visualize_keys("tree_before_crash.dot"); + this->dump_to_file("tree_before_crash.txt"); + LOGINFO("Step 6: Simulate crash and then recover, reapply keys to tree"); + this->crash_and_recover(num_entries / 4, num_entries); + + LOGINFO("Step 7: Fill the 1st quarter of the tree, to make sure left child is split and we crash on flush of the " + "parent node"); + this->set_basic_flip("crash_flush_on_split_at_parent"); + for (auto k = 0u; k <= num_entries / 4; ++k) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } + LOGINFO("Step 8: Post crash we reapply the missing entries to tree"); + this->crash_and_recover(0, num_entries); + LOGINFO("Step 9: Query all entries and validate with pagination of 80 entries"); + this->query_all_paginate(80); +} +#endif + +int main(int argc, char* argv[]) { + int parsed_argc{argc}; + ::testing::InitGoogleTest(&parsed_argc, argv); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_index_crash_recovery, iomgr, test_common_setup); + sisl::logging::SetLogger("test_index_crash_recovery"); + spdlog::set_pattern("[%D %T%z] [%^%L%$] [%t] %v"); + + if (SISL_OPTIONS.count("seed")) { + auto seed = SISL_OPTIONS["seed"].as< uint64_t >(); + LOGINFO("Using seed {} to sow the random generation", seed); + g_re.seed(seed); + } + +#ifdef _PRERELEASE + return RUN_ALL_TESTS(); +#else + return 0; +#endif +} diff --git a/src/tests/test_pdev.cpp b/src/tests/test_pdev.cpp index d5670abaf..e28c8a933 100644 --- a/src/tests/test_pdev.cpp +++ b/src/tests/test_pdev.cpp @@ -83,7 +83,12 @@ class PDevTest : public ::testing::Test { m_dev_infos, [this](const homestore::vdev_info&, bool load_existing) -> shared< homestore::VirtualDev > { return nullptr; }); - m_dmgr->is_first_time_boot() ? m_dmgr->format_devices() : m_dmgr->load_devices(); + if (m_dmgr->is_first_time_boot()) { + m_dmgr->format_devices(); + m_dmgr->commit_formatting(); + } else { + m_dmgr->load_devices(); + } m_first_data_pdev = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Data)[0]; m_first_fast_pdev = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Fast)[0]; }