diff --git a/conanfile.py b/conanfile.py index 0e7fa7d89..deee0421d 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,7 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.4.65" + version = "6.4.66" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp index 2ef1e1d44..0fec83ddd 100644 --- a/src/include/homestore/btree/btree.hpp +++ b/src/include/homestore/btree/btree.hpp @@ -123,7 +123,7 @@ class Btree { void dump_tree_to_file(const std::string& file = "") const; std::string to_custom_string(to_string_cb_t< K, V > const& cb) const; std::string visualize_tree_keys(const std::string& file) const; - uint64_t count_keys(bnodeid_t bnodeid) const; + uint64_t count_keys(bnodeid_t bnodeid = 0) const; nlohmann::json get_metrics_in_json(bool updated = true); bnodeid_t root_node_id() const; diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp index b21305497..ecda7e138 100644 --- a/src/include/homestore/btree/detail/btree_common.ipp +++ b/src/include/homestore/btree/detail/btree_common.ipp @@ -201,6 +201,7 @@ void Btree< K, V >::to_dot_keys(bnodeid_t bnodeid, std::string& buf, template < typename K, typename V > uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const { + if (bnodeid == 0) { bnodeid = this->root_node_id(); } BtreeNodePtr node; locktype_t acq_lock = locktype_t::READ; if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return 0; } diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp index 209b35558..3e90ccfd5 100644 --- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp +++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp @@ -283,6 +283,11 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const child_node1->inc_link_version(); // Update the existing parent node entry to point to second child ptr. + // Don't change the order. First update the parent node and then insert the new key. This is important for casee + // where the split key is the last key in the parent node. In this case, the split key should be inserted in the + // parent node. If we insert the split key first, then the split key will be inserted in the parent node and the + // last key in the parent node will be lost. This will lead to inconsistency in the tree. In case of empty parent + // (i.e., new root) or updating the edge, this order made sure that edge is updated. parent_node->update(parent_ind, child_node2->link_info()); parent_node->insert(parent_ind, *out_split_key, child_node1->link_info()); diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index 1f4c30e32..e85d1190c 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -204,7 +204,7 @@ class SimpleNode : public VariantNode< K, V > { bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override { #ifdef _PRERELEASE auto max_keys = max_keys_in_node(); - if(max_keys) {return (this->total_entries() < max_keys);} + if(max_keys) {return (this->total_entries() < max_keys);} #endif return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT)) ? (get_available_entries() > 0) @@ -230,7 +230,7 @@ class SimpleNode : public VariantNode< K, V > { return str; } std::string to_dot_keys() const override { - return to_dot_keys_impl(std::is_same{}); + return to_dot_keys_impl(std::is_same().key()), uint64_t>{}); } std::string to_dot_keys_impl(std::false_type) const { diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 2bec275e3..86f3a8c86 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -224,10 +224,13 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { } btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override { + // todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){ + // return btree_status_t::success;} m_sb->root_node = new_root->node_id(); m_sb->root_link_version = new_root->link_version(); if (!wb_cache().refresh_meta_buf(m_sb_buffer, r_cast< CPContext* >(context))) { + LOGTRACEMOD(wbcache, "CP mismatch error - discard transact for meta node"); return btree_status_t::cp_mismatch; } @@ -238,8 +241,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) { BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string()); - - // Get the last key in the node + // TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs + // to be handled. Get the last key in the node auto const last_parent_key = parent_node->get_last_key< K >(); auto const is_parent_edge_node = parent_node->has_valid_edge(); if ((parent_node->total_entries() == 0) && !is_parent_edge_node) { @@ -285,8 +288,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(), child_node->to_string(), child_last_key.to_string()); - if (child_last_key.compare(last_parent_key) > 0) { - // We have reached the last key, we can stop now + if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) { + // We have reached the last key, and the parent node doesn't have edge, so we can stop now break; } diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 0530c6846..c8801c9d2 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -70,6 +70,7 @@ class IndexService { void stop(); // Add/Remove Index Table to/from the index service + uint64_t num_tables(); void add_index_table(const std::shared_ptr< IndexTableBase >& tbl); void remove_index_table(const std::shared_ptr< IndexTableBase >& tbl); std::shared_ptr< IndexTableBase > get_index_table(uuid_t uuid) const; diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp index 955bd523f..99ba7dbba 100644 --- a/src/lib/index/index_cp.cpp +++ b/src/lib/index/index_cp.cpp @@ -142,6 +142,17 @@ void IndexCPContext::to_string_dot(const std::string& filename) { file << "}\n"; file.close(); + LOGINFO("cp dag is stored in file {}", filename); +} + +uint16_t IndexCPContext::num_dags() { + // count number of buffers whose up_buffers are nullptr + uint16_t count = 0; + std::unique_lock lg{m_flush_buffer_mtx}; + m_dirty_buf_list.foreach_entry([&count](IndexBufferPtr buf) { + if (buf->m_up_buffer == nullptr) { count++; } + }); + return count; } std::string IndexCPContext::to_string_with_dags() { @@ -219,6 +230,7 @@ std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) { process_txn_record(rec, buf_map); cur_ptr += rec->size(); + LOGTRACEMOD(wbcache, "Recovered txn record: {}: {}", t, rec->to_string()); } return buf_map; @@ -314,8 +326,8 @@ std::string IndexCPContext::txn_record::to_string() const { if (id_count == 0) { fmt::format_to(std::back_inserter(str), "empty]"); } else { - for (uint8_t i{0}; i < id_count; ++i, ++idx) { - fmt::format_to(std::back_inserter(str), "[chunk={}, blk={}],", ids[idx].second, ids[idx].first); + for (uint8_t i{0}; i < id_count; ++i) { + fmt::format_to(std::back_inserter(str), "[{}],", blk_id(idx++).to_integer()); } fmt::format_to(std::back_inserter(str), "]"); } @@ -324,12 +336,13 @@ std::string IndexCPContext::txn_record::to_string() const { std::string str = fmt::format("ordinal={}, parent=[{}], in_place_child=[{}]", index_ordinal, parent_id_string(), child_id_string(), num_new_ids, num_freed_ids); - uint8_t idx = (has_inplace_parent == 0x1) ? 1 : 0 + (has_inplace_child == 0x1) ? 1 : 0; + uint8_t idx = ((has_inplace_parent == 0x1) ? 1 : 0) + ((has_inplace_child == 0x1) ? 1 : 0); fmt::format_to(std::back_inserter(str), ", new_ids=["); add_to_string(str, idx, num_new_ids); fmt::format_to(std::back_inserter(str), ", freed_ids=["); add_to_string(str, idx, num_freed_ids); + fmt::format_to(std::back_inserter(str), "{}", (is_parent_meta ? ", parent is meta" : "")); return str; } } // namespace homestore diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index 1b8a2a2b0..d7bd124df 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -94,12 +94,12 @@ struct IndexCPContext : public VDevCPContext { } std::string parent_id_string() const { - return (has_inplace_parent == 0x1) ? fmt::format("chunk={}, blk={}", ids[0].second, ids[0].first) : "empty"; + return (has_inplace_parent == 0x1) ? fmt::format("{}", blk_id(0).to_integer()) : "empty"; } std::string child_id_string() const { auto const idx = (has_inplace_parent == 0x1) ? 1 : 0; - return (has_inplace_child == 0x1) ? fmt::format("chunk={}, blk={}", ids[idx].second, ids[idx].first) + return (has_inplace_child == 0x1) ? fmt::format("{}", blk_id(idx).to_integer()) : "empty"; } @@ -162,6 +162,7 @@ struct IndexCPContext : public VDevCPContext { std::optional< IndexBufferPtr > next_dirty(); std::string to_string(); std::string to_string_with_dags(); + uint16_t num_dags(); void to_string_dot(const std::string& filename); private: diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index cc199bbd5..4b3fb5d11 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -87,10 +87,18 @@ void IndexService::start() { for (const auto& [_, tbl] : m_index_map) { tbl->recovery_completed(); } + // Force taking cp after recovery done. This makes sure that the index table is in consistent state and dirty buffer + // after recovery can be added to dirty list for flushing in the new cp + hs()->cp_mgr().trigger_cp_flush(true /* force */); } void IndexService::stop() { m_wb_cache.reset(); } +uint64_t IndexService::num_tables() { + std::unique_lock lg(m_index_map_mtx); + return m_index_map.size(); +} + void IndexService::add_index_table(const std::shared_ptr< IndexTableBase >& tbl) { std::unique_lock lg(m_index_map_mtx); m_index_map.insert(std::make_pair(tbl->uuid(), tbl)); diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 1b7523363..ed5dd7e6d 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -196,14 +196,19 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte return false; // meta_buf modified by a newer CP, we shouldn't overwrite that } else if (meta_buf->m_dirtied_cp_id == cp_ctx->id()) { // Modified by the same cp, no need to create new index buffer, but we only copy the superblk to the buffer + LOGTRACEMOD(wbcache, "meta buf {} is already dirtied in cp {} now is in recovery {}", meta_buf->to_string(), + cp_ctx->id(), m_in_recovery); meta_buf->copy_sb_to_buf(); + // TODO: corner case , meta buffer is dirtied by the same cp but not added to dirty list due to previously + // recovery mode } else { // We always create a new meta index buffer on every meta buf update, which copies the superblk auto new_buf = std::make_shared< MetaIndexBuffer >(meta_buf); new_buf->m_dirtied_cp_id = cp_ctx->id(); write_buf(nullptr, new_buf, cp_ctx); meta_buf = new_buf; // Replace the meta_buf with new buf - LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id()); + LOGTRACEMOD(wbcache, "meta buf {} is created in cp {} in recovery = {}", meta_buf->to_string(), cp_ctx->id(), + m_in_recovery); } return true; } @@ -292,10 +297,49 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p freed_node_bufs // free_node_bufs ); } +#ifdef _PRERELEASE + // log new nodes and freed nodes and parent and child + static uint32_t txn_id = 0; + static int last_cp_id = -2; + static std::string txn = ""; + if (last_cp_id != icp_ctx->id()) { + last_cp_id = icp_ctx->id(); + txn_id = 0; + txn = ""; + } + + if (new_node_bufs.empty() && freed_node_bufs.empty()) { + fmt::format_to(std::back_inserter(txn), "\n{} - parent=[{}] child=[{}] new=[{}] freed=[{}]", txn_id, + (parent_buf && parent_buf->blkid().to_integer() != 0) + ? std::to_string(parent_buf->blkid().to_integer()) + : "empty", + child_buf->blkid().to_integer(), "empty", "empty"); + } else { + std::string new_nodes; + for (auto const& buf : new_node_bufs) { + new_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string freed_nodes; + for (auto const& buf : freed_node_bufs) { + freed_nodes += std::to_string(buf->blkid().to_integer()) + ", "; + } + std::string parent_str = (parent_buf && parent_buf->blkid().to_integer() != 0) + ? std::to_string(parent_buf->blkid().to_integer()) + : "empty"; + std::string child_str = (child_buf && child_buf->blkid().to_integer() != 0) + ? std::to_string(child_buf->blkid().to_integer()) + : "empty"; + + fmt::format_to(std::back_inserter(txn), "\n{} - parent={} child={} new=[{}] freed=[{}]", txn_id, parent_str, + child_str, new_nodes, freed_nodes); + } + LOGTRACEMOD(wbcache, "\ttranasction till now: cp: {} \n{}\n", icp_ctx->id(), txn); + txn_id++; +#endif #if 0 static int id = 0; - auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot"; - LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename); + auto filename = fmt::format("txn_buf_{}_{}.dot", icp_ctx->id(), id++); + LOGTRACEMOD(wbcache,"Writing txn to file: {}", filename); icp_ctx->to_string_dot(filename); #endif } @@ -390,6 +434,14 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { } //////////////////// Recovery Related section ///////////////////////////////// +void IndexWBCache::load_buf(IndexBufferPtr const& buf) { + if (buf->m_bytes == nullptr) { + buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); + m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); + buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); + } +} + void IndexWBCache::recover(sisl::byte_view sb) { // If sb is empty, its possible a first time boot. if ((sb.bytes() == nullptr) || (sb.size() == 0)) { @@ -408,6 +460,29 @@ void IndexWBCache::recover(sisl::byte_view sb) { LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... ", icp_ctx->id(), bufs.size()); +#ifdef _PRERELEASE + auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, + std::vector< IndexBufferPtr > const& l0_bufs) { + std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); + for (auto const& [_, buf] : bufs) { + load_buf(buf); + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + + // list of new_bufs + if (!l0_bufs.empty()) { + fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); + for (auto const& buf : l0_bufs) { + fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); + } + } + return log; + }; + + std::string log = fmt::format("Recovering bufs (#of bufs = {}) before processing them\n", bufs.size()); + LOGTRACEMOD(wbcache, "{}\n{}", log, detailed_log(bufs, {})); +#endif + // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one // addition of all freed buffers also put in the DAG structure. // @@ -433,30 +508,30 @@ void IndexWBCache::recover(sisl::byte_view sb) { l0_bufs.push_back(buf); } else { buf->m_up_buffer->m_wait_for_down_buffers.decrement(); +#ifndef NDEBUG + bool found{false}; + for (auto it = buf->m_up_buffer->m_down_buffers.begin(); + it != buf->m_up_buffer->m_down_buffers.end(); ++it) { + auto sp = it->lock(); + if (sp && sp == buf) { + found = true; + buf->m_up_buffer->m_down_buffers.erase(it); + break; + } + } + HS_DBG_ASSERT(found, + "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list"); +#endif } } } } +#ifdef _PRERELEASE LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}", l0_bufs.size(), bufs.size(), icp_ctx->id()); - - auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs, - std::vector< IndexBufferPtr > const& l0_bufs) { - // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs - std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size()); - for (auto const& [_, buf] : bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } - - // list of new_bufs - fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size()); - for (auto const& buf : l0_bufs) { - fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string()); - } - return log; - }; LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs)); +#endif // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in @@ -469,7 +544,10 @@ void IndexWBCache::recover(sisl::byte_view sb) { } void IndexWBCache::recover_buf(IndexBufferPtr const& buf) { - if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; } + if (!buf->m_wait_for_down_buffers.decrement_testz()) { + // TODO: remove the buf_>m_up_buffer from down_buffers list of buf->m_up_buffer + return; + } // All down buffers are completed and given a nod saying that they are committed. If this buffer is not committed, // then we need to repair this node/buffer. After that we will keep going to the next up level to repair them if @@ -495,21 +573,21 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) { } // All down_buf has indicated that they have seen this up buffer, now its time to repair them. - if (buf->m_bytes == nullptr) { - // Read the btree node and get its modified cp_id - buf->m_bytes = hs_utils::iobuf_alloc(m_node_size, sisl::buftag::btree_node, m_vdev->align_size()); - m_vdev->sync_read(r_cast< char* >(buf->m_bytes), m_node_size, buf->blkid()); - if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } - - buf->m_dirtied_cp_id = BtreeNode::get_modified_cp_id(buf->m_bytes); - } - auto cpg = cp_mgr().cp_guard(); - return (buf->m_dirtied_cp_id == cpg->id()); + load_buf(buf); + if (!BtreeNode::is_valid_node(sisl::blob{buf->m_bytes, m_node_size})) { return false; } + return (buf->m_dirtied_cp_id == cp_mgr().cp_guard()->id()); } //////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { - LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags()); + LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp \ndag={}\n\n cp context {}", cp_ctx->to_string_with_dags(), + cp_ctx->to_string()); + // #ifdef _PRERELEASE + // static int id = 0; + // auto filename = "cp_" + std::to_string(id++) + "_" + std::to_string(rand() % 100) + ".dot"; + // LOGTRACEMOD(wbcache, "Transact cp storing in file {}\n\n\n", filename); + // cp_ctx->to_string_dot(filename); + // #endif if (!cp_ctx->any_dirty_buffers()) { if (cp_ctx->id() == 0) { // For the first CP, we need to flush the journal buffer to the meta blk @@ -523,17 +601,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { #ifdef _PRERELEASE if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush"); + LOGINFO("crash simulation is ongoing, so skip the cp flush"); return folly::makeFuture< bool >(true); } #endif - // First thing is to flush the new_blks created as part of the CP. + // First thing is to flush the journal created as part of the CP. auto const& journal_buf = cp_ctx->journal_buf(); + auto txn = r_cast< IndexCPContext::txn_journal const* >(journal_buf.cbytes()); if (journal_buf.size() != 0) { if (m_meta_blk) { + LOGTRACEMOD(wbcache, " journal {} ", txn->to_string()); meta_service().update_sub_sb(journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } else { + LOGTRACEMOD(wbcache, " First time journal {} ", txn->to_string()); meta_service().add_sub_sb("wb_cache", journal_buf.cbytes(), journal_buf.size(), m_meta_blk); } } @@ -556,21 +637,20 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) { #ifdef _PRERELEASE + static std::once_flag flag; if (buf->m_crash_flag_on) { -// std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; -// LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); -// cp_ctx->to_string_dot(filename); - LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string()); + std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot"; + LOGINFO("Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename); + // cp_ctx->to_string_dot(filename); hs()->crash_simulator().crash(); cp_ctx->complete(true); return; } else if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing"); + std::call_once(flag, []() { LOGINFO("Crash simulation is ongoing; aid simulation by not flushing."); }); return; } #endif - LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string()); buf->set_state(index_buf_state_t::FLUSHING); if (buf->is_meta_buf()) { @@ -584,16 +664,13 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const buf->to_string()); process_write_completion(cp_ctx, buf); } else { - LOGTRACEMOD(wbcache, "flushing cp {} buf {} info: {}", cp_ctx->id(), buf->to_string(), - BtreeNode::to_string_buf(buf->raw_buffer())); + LOGTRACEMOD(wbcache, "flushing cp {} buf {}", cp_ctx->id(), buf->to_string()); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) .thenValue([buf, cp_ctx](auto) { try { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); pthis.process_write_completion(cp_ctx, buf); - } catch (const std::runtime_error& e) { - LOGERROR("Failed to access write-back cache: {}", e.what()); - } + } catch (const std::runtime_error& e) { LOGERROR("Failed to access write-back cache: {}", e.what()); } }); if (!part_of_batch) { m_vdev->submit_batch(); } @@ -602,8 +679,10 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) { #ifdef _PRERELEASE + static std::once_flag flag; if (hs()->crash_simulator().is_crashed()) { - LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); + std::call_once( + flag, []() { LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion"); }); return; } #endif diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 209d3845e..25a4c8201 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -41,7 +41,6 @@ class IndexWBCache : public IndexWBCacheBase { std::mutex m_flush_mtx; void* m_meta_blk; bool m_in_recovery{false}; - public: IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb, const std::shared_ptr< sisl::Evictor >& evictor, uint32_t node_size); @@ -78,5 +77,6 @@ class IndexWBCache : public IndexWBCacheBase { void recover_buf(IndexBufferPtr const& buf); bool was_node_committed(IndexBufferPtr const& buf); + void load_buf(IndexBufferPtr const& buf); }; } // namespace homestore diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 6c00975ea..a047fed23 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -402,7 +402,7 @@ struct BtreeTestHelper { LOGINFO("{}{}", preamble.empty() ? "" : preamble + ":\n", m_bt->to_custom_string(print_key_range)); } - void visualize_keys(const std::string& file) const { m_bt->visualize_tree_keys(file); } + void visualize_keys(const std::string& file) const { /*m_bt->visualize_tree_keys(file);*/ } void compare_files(const std::string& before, const std::string& after) { std::ifstream b(before, std::ifstream::ate); diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 9818c8a45..3e8c998ef 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -240,6 +240,7 @@ class ShadowMap { file << key << " " << value << '\n'; } file.close(); + LOGINFO("Saved shadow map to file: {}", filename); } void load(const std::string& filename) { diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 174039495..97ca410f7 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -434,7 +434,7 @@ class HSTestHelper { } #ifdef _PRERELEASE hsi->with_crash_simulator([this](void) mutable { - LOGINFO("CrashSimulator::crash() is called - restarting homestore"); + LOGWARN("CrashSimulator::crash() is called - restarting homestore"); this->restart_homestore(); m_crash_recovered.setValue(); }); diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp index 11235be6a..0f5963eff 100644 --- a/src/tests/test_index_crash_recovery.cpp +++ b/src/tests/test_index_crash_recovery.cpp @@ -34,25 +34,30 @@ SISL_LOGGING_DECL(test_index_crash_recovery) // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. -SISL_OPTION_GROUP(test_index_crash_recovery, - (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("500"), "number"), - (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), - (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), - "seconds"), - (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", - ::cxxopts::value< uint32_t >()->default_value("0"), ""), - (operation_list, "", "operation_list", - "operation list instead of default created following by percentage", - ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), - (preload_size, "", "preload_size", "number of entries to preload tree with", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), - (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), - (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", - ::cxxopts::value< bool >()->default_value("1"), ""), - (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) +SISL_OPTION_GROUP( + test_index_crash_recovery, + (num_iters, "", "num_iters", "number of iterations for rand ops", + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), + (num_entries, "", "num_entries", "number of entries to test with", + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + (num_rounds, "", "num_rounds", "number of rounds to test with", + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + (num_entries_per_rounds, "", "num_entries_per_rounds", "number of entries per rounds", + ::cxxopts::value< uint32_t >()->default_value("40"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", ::cxxopts::value< uint32_t >()->default_value("0"), + ""), + (operation_list, "", "operation_list", "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (preload_size, "", "preload_size", "number of entries to preload tree with", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""), + (load_from_file, "", "load_from_file", "load from file", ::cxxopts::value< bool >()->default_value("0"), ""), + (save_to_file, "", "save_to_file", "save to file", ::cxxopts::value< bool >()->default_value("0"), ""), + (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown", + ::cxxopts::value< bool >()->default_value("1"), ""), + (seed, "", "seed", "random engine seed, use random if not defined", + ::cxxopts::value< uint64_t >()->default_value("0"), "number")) void log_obj_life_counter() { std::string str; @@ -74,8 +79,6 @@ class SequenceGenerator { public: SequenceGenerator(int putFreq, int removeFreq, uint64_t start_range, uint64_t end_range) : putFreq_(putFreq), removeFreq_(removeFreq), start_range_(start_range), end_range_(end_range) { - std::random_device rd; - gen_ = std::mt19937(rd()); keyDist_ = std::uniform_int_distribution<>(start_range_, end_range_); updateOperationTypeDistribution(); } @@ -100,11 +103,11 @@ class SequenceGenerator { std::vector< Operation > operations; if (reset) { this->reset(); } for (size_t i = 0; i < numOperations; ++i) { - uint32_t key = keyDist_(gen_); + uint32_t key = keyDist_(g_re); auto [it, inserted] = keyStates.try_emplace(key, false); auto& inUse = it->second; - OperationType operation = static_cast< OperationType >(opTypeDist_(gen_)); + OperationType operation = static_cast< OperationType >(opTypeDist_(g_re)); if (operation == OperationType::Put && !inUse) { operations.emplace_back(key, OperationType::Put); @@ -131,15 +134,16 @@ class SequenceGenerator { } return occurrences; } - __attribute__((noinline)) std::string printOperations(const OperationList& operations) const { + __attribute__((noinline)) static std::string printOperations(const OperationList& operations) { std::ostringstream oss; + auto count = 1; for (const auto& [key, opType] : operations) { std::string opTypeStr = (opType == OperationType::Put) ? "Put" : "Remove"; - oss << "{" << key << ", " << opTypeStr << "}\n"; + oss << count++ << "- {" << key << ", " << opTypeStr << "}\n"; } return oss.str(); } - __attribute__((noinline)) std::string printKeysOccurrences(const OperationList& operations) const { + __attribute__((noinline)) static std::string printKeysOccurrences(const OperationList& operations) { std::set< uint64_t > keys = collectUniqueKeys(operations); std::ostringstream oss; for (auto key : keys) { @@ -152,16 +156,51 @@ class SequenceGenerator { } return oss.str(); } - __attribute__((noinline)) std::string printKeyOccurrences(const OperationList& operations, uint64_t key ) const { + __attribute__((noinline)) static std::string printKeyOccurrences(const OperationList& operations, uint64_t key) { std::ostringstream oss; auto keyOccurrences = inspect(operations, key); oss << "Occurrences of key " << key << ":\n"; for (const auto& [index, operation] : keyOccurrences) { std::string opTypeStr = (operation == OperationType::Put) ? "Put" : "Remove"; - oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; + oss << "Index: " << index << ", Operation: " << opTypeStr << "\n"; } return oss.str(); } + + static std::set< uint64_t > collectUniqueKeys(const OperationList& operations) { + std::set< uint64_t > keys; + for (const auto& [key, _] : operations) { + keys.insert(key); + } + return keys; + } + static void save_to_file(std::string filename, const OperationList& operations) { + std::ofstream file(filename); + if (file.is_open()) { + for (const auto& [key, opType] : operations) { + file << key << " " << static_cast< int >(opType) << "\n"; + } + file.close(); + } + } + + static OperationList load_from_file(std::string filename) { + std::ifstream file(filename); + OperationList operations; + if (file.is_open()) { + std::string line; + while (std::getline(file, line)) { + std::istringstream iss(line); + uint64_t key; + int opType; + iss >> key >> opType; + operations.emplace_back(key, static_cast< OperationType >(opType)); + } + file.close(); + } + return operations; + } + void reset() { keyStates.clear(); } private: @@ -169,7 +208,6 @@ class SequenceGenerator { int removeFreq_; uint64_t start_range_; uint64_t end_range_; - std::mt19937 gen_; std::uniform_int_distribution<> keyDist_; std::discrete_distribution<> opTypeDist_; std::map< uint64_t, bool > keyStates; @@ -178,15 +216,8 @@ class SequenceGenerator { opTypeDist_ = std::discrete_distribution<>({static_cast< double >(putFreq_), static_cast< double >(removeFreq_)}); } - - std::set< uint64_t > collectUniqueKeys(const OperationList& operations) const { - std::set< uint64_t > keys; - for (const auto& [key, _] : operations) { - keys.insert(key); - } - return keys; - } }; + #ifdef _PRERELEASE template < typename TestType > struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test { @@ -198,7 +229,9 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT TestIndexServiceCallbacks(IndexCrashTest* test) : m_test(test) {} std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { - LOGINFO("Index table recovered, root bnode_id {} version {}", sb->root_node, sb->root_link_version); + LOGINFO("Index table recovered, root bnode_id {} uuid {} ordinal {} version {}", + static_cast< uint64_t >(sb->root_node), boost::uuids::to_string(sb->uuid), sb->ordinal, + sb->root_link_version); m_test->m_cfg = BtreeConfig(hs()->index_service().node_size()); m_test->m_cfg.m_leaf_node_type = T::leaf_node_type; @@ -241,18 +274,29 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT BtreeTestHelper< TestType >::SetUp(); if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) { this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + // LOGINFO("Creating new index table with uuid {} - init_device:{:s} bt: {} root id {}, num of + // keys {}", boost::uuids::to_string(uuid), SISL_OPTIONS["init_device"].as< bool >(), + // this->m_bt, this->m_bt->root_node_id(), num_keys); + LOGINFO("Creating new index table with uuid {} - root id {}, num of keys {}", boost::uuids::to_string(uuid), + this->m_bt->root_node_id(), num_keys); + } else { populate_shadow_map(); } hs()->index_service().add_index_table(this->m_bt); - LOGINFO("Added index table to index service"); + LOGINFO("Added index table to index service with uuid {} - total tables in the system is currently {}", + boost::uuids::to_string(uuid), hs()->index_service().num_tables()); } void populate_shadow_map() { + LOGINFO("Populating shadow map"); this->m_shadow_map.load(m_shadow_filename); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + auto num_keys = this->m_bt->count_keys(this->m_bt->root_node_id()); + LOGINFO("Shadow map size {} - btree keys {} - root id {}", this->m_shadow_map.size(), num_keys, + this->m_bt->root_node_id()); + ASSERT_EQ(this->m_shadow_map.size(), num_keys) << "shadow map size and tree size mismatch"; this->get_all(); } @@ -263,6 +307,8 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); hs()->index_service().add_index_table(this->m_bt); this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(m_shadow_filename); + LOGINFO("Reset btree with uuid {} - erase shadow map {}", boost::uuids::to_string(uuid), m_shadow_filename); } void restart_homestore(uint32_t shutdown_delay_sec = 3) override { @@ -274,7 +320,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT void reapply_after_crash() { ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()}; snapshot_map.load(m_shadow_filename); - LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string()); + LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string()); auto diff = this->m_shadow_map.diff(snapshot_map); // visualize tree after crash @@ -286,7 +332,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT for (const auto& [k, addition] : diff) { dif_str += fmt::format(" {} \t{}\n", k.key(), addition); } - LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str); + LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str); for (const auto& [k, addition] : diff) { // this->print_keys(fmt::format("reapply: before inserting key {}", k.key())); @@ -324,8 +370,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT LOGINFO("Error: failed to remove {}", m_shadow_filename); } } - LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), - this->m_bt->count_keys(this->m_bt->root_node_id())); + LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(), this->tree_key_count()); BtreeTestHelper< TestType >::TearDown(); this->shutdown_homestore(false); } @@ -341,30 +386,67 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT this->get_all(); LOGINFO("Expect to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count()); - ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) - << "shadow map size and tree size mismatch"; + ASSERT_EQ(this->m_shadow_map.size(), this->tree_key_count()) << "shadow map size and tree size mismatch"; + } + + void sanity_check(OperationList& operations) const { + std::set< uint64_t > new_keys; + std::transform(operations.begin(), operations.end(), std::inserter(new_keys, new_keys.end()), + [](const Operation& operation) { return operation.first; }); + uint32_t count = 1; + this->m_shadow_map.foreach ([this, new_keys, &count](K key, V value) { + // discard the new keys to check + if (new_keys.find(key.key()) != new_keys.end()) { return; } + auto copy_key = std::make_unique< K >(); + *copy_key = key; + auto out_v = std::make_unique< V >(); + auto req = BtreeSingleGetRequest{copy_key.get(), out_v.get()}; + req.enable_route_tracing(); + const auto ret = this->m_bt->get(req); + ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; + LOGINFO("{} - Key {} passed sanity check!", count++, key.key()); + }); } void crash_and_recover(OperationList& operations, std::string filename = "") { - // this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + this->print_keys("Btree prior to CP and susbsequent simulated crash: "); + LOGINFO("Before Crash: {} keys in shadow map and it is actually {} keys in tree - operations size {}", + this->m_shadow_map.size(), tree_key_count(), operations.size()); + + if (!filename.empty()) { + std::string b_filename = filename + "_before_crash.dot"; + LOGINFO("Visualize the tree before crash file {}", b_filename); + this->visualize_keys(b_filename); + } + test_common::HSTestHelper::trigger_cp(false); + LOGINFO("waiting for crash to recover"); this->wait_for_crash_recovery(); - // this->print_keys("Post crash and recovery, btree structure:"); if (!filename.empty()) { - LOGINFO("Visualize the tree file {}", filename); - this->visualize_keys(filename); + std::string rec_filename = filename + "_after_recovery.dot"; + LOGINFO("Visualize the tree file after recovery : {}", rec_filename); + this->visualize_keys(rec_filename); + this->print_keys("Post crash and recovery, btree structure: "); } - + sanity_check(operations); + // Added to the index service right after recovery. Not needed here + // test_common::HSTestHelper::trigger_cp(true); + LOGINFO("Before Reapply: {} keys in shadow map and actually {} in trees operation size {}", + this->m_shadow_map.size(), tree_key_count(), operations.size()); this->reapply_after_crash(operations); - - // this->print_keys("\n\nafter reapply keys"); if (!filename.empty()) { - LOGINFO("Visualize the tree file after_reapply__{}", filename); - this->visualize_keys("after_reapply__" + filename); + std::string re_filename = filename + "_after_reapply.dot"; + LOGINFO("Visualize the tree after reapply {}", re_filename); + this->visualize_keys(re_filename); + this->print_keys("Post crash and recovery, btree structure: "); } this->get_all(); + LOGINFO("After reapply: {} keys in shadow map and actually {} in tress", this->m_shadow_map.size(), + tree_key_count()); + ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id())) + << "shadow map size and tree size mismatch"; } uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); } @@ -378,6 +460,8 @@ using BtreeTypes = testing::Types< FixedLenBtree >; TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes); TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Simulate the crash even before first cp this->set_basic_flip("crash_flush_on_root"); @@ -393,6 +477,8 @@ TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) { } TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) { + this->m_shadow_map.range_erase(0, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1); + this->m_shadow_map.save(this->m_shadow_filename); // Insert into 4 phases, first fill up the last part, since we need to test split on left edge LOGINFO("Step 1: Fill up the last quarter of the tree"); auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); @@ -525,11 +611,11 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", "crash_flush_on_split_at_right_child"}; OperationList operations; + bool renew_btree_after_crash = true; for (size_t i = 0; i < flips.size(); ++i) { - this->reset_btree(); LOGINFO("Step 1-{}: Set flag {}", i + 1, flips[i]); this->set_basic_flip(flips[i]); - operations = generator.generateOperations(num_entries -1 , true /* reset */); + operations = generator.generateOperations(num_entries - 1, renew_btree_after_crash /* reset */); // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1, // generator.printKeyOccurrences(operations)); @@ -538,49 +624,148 @@ TYPED_TEST(IndexCrashTest, SplitCrash1) { this->put(k, btree_put_type::INSERT, true /* expect_success */); } this->crash_and_recover(operations, fmt::format("recover_tree_crash_{}.dot", i + 1)); + if (renew_btree_after_crash) { this->reset_btree(); }; } } TYPED_TEST(IndexCrashTest, long_running_put_crash) { + // Define the lambda function auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); + auto const preload_size = SISL_OPTIONS["preload_size"].as< uint32_t >(); + auto const rounds = SISL_OPTIONS["num_rounds"].as< uint32_t >(); + auto const num_entries_per_rounds = SISL_OPTIONS["num_entries_per_rounds"].as< uint32_t >(); + bool load_mode = SISL_OPTIONS.count("load_from_file"); + bool save_mode = SISL_OPTIONS.count("save_to_file"); SequenceGenerator generator(100 /*putFreq*/, 0 /* removeFreq*/, 0 /*start_range*/, num_entries - 1 /*end_range*/); vector< std::string > flips = {"crash_flush_on_split_at_parent", "crash_flush_on_split_at_left_child", "crash_flush_on_split_at_right_child"}; + + std::string flip = ""; OperationList operations; auto m_start_time = Clock::now(); auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > this->m_run_time); }; double elapsed_time, progress_percent, last_progress_time = 0; - for (size_t i = 0; !time_to_stop(); ++i) { - bool print_time = false; - elapsed_time = get_elapsed_time_sec(m_start_time); + bool renew_btree_after_crash = false; + auto cur_flip_idx = 0; + std::uniform_int_distribution<> dis(1, 100); + int flip_percentage = 90; // Set the desired percentage here + bool normal_execution = true; + bool clean_shutdown = true; + // if it is safe then delete all previous save files + if (save_mode) { + std::filesystem::remove_all("/tmp/operations_*.txt"); + std::filesystem::remove_all("/tmp/flips_history.txt"); + } + // init tree + LOGINFO("Step 0: Fill up the tree with {} entries", preload_size); + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_0.txt")); + } else { + operations = generator.generateOperations(preload_size, true /* reset */); + if (save_mode) { SequenceGenerator::save_to_file(fmt::format("/tmp/operations_0.txt"), operations); } + } + auto opstr = SequenceGenerator::printOperations(operations); + LOGINFO("Lets before crash print operations\n{}", opstr); - this->reset_btree(); - auto flip = flips[i % flips.size()]; - LOGINFO("Step 1-{}: Set flag {}", i + 1, flip); + for (auto [k, _] : operations) { + this->put(k, btree_put_type::INSERT, true /* expect_success */); + } - this->set_basic_flip(flip, 1, 10); - operations = generator.generateOperations(num_entries -1, true /* reset */); - // operations = generator.generateOperations(num_entries/10, false /* reset */); - // LOGINFO("Batch {} Operations:\n {} \n ", i + 1, generator.printOperations(operations)); - // LOGINFO("Detailed Key Occurrences for Batch {}:\n {} \n ", i + 1, - // generator.printKeyOccurrences(operations)); + // Trigger the cp to make sure middle part is successful + LOGINFO("Step 0-1: Flush all the entries so far"); + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + this->m_shadow_map.save(this->m_shadow_filename); + this->print_keys("reapply: after preload"); + this->visualize_keys("tree_after_preload.dot"); + + for (uint32_t round = 1; + round <= rounds && !time_to_stop() && this->tree_key_count() < num_entries - num_entries_per_rounds; round++) { + LOGINFO("\n\n\n\n\n\nRound {} of {}\n\n\n\n\n\n", round, rounds); + bool print_time = false; + elapsed_time = get_elapsed_time_sec(m_start_time); + if (load_mode) { + std::ifstream file("/tmp/flips_history.txt"); + std::string line; + bool found = false; + for (uint32_t i = 0; i < round && std::getline(file, line); i++) { + if (i == round - 1) { + found = true; + break; + } + } + if (found && !line.empty()) { + if (line == "normal") { + normal_execution = true; + } else { + normal_execution = false; + flip = line; + LOGINFO("Step 1-{}: Set flag {}", round, flip); + this->set_basic_flip(flip, 1, 100); + } + } + file.close(); + } else { + if (dis(g_re) <= flip_percentage) { + flip = flips[cur_flip_idx++ % flips.size()]; + LOGINFO("Step 1-{}: Set flag {}", round, flip); + this->set_basic_flip(flip, 1, 100); + normal_execution = false; + } else { + normal_execution = true; + LOGINFO("Step 1-{}: No flip set", round); + } + if (save_mode) { + // save the filp name to a file for later use + std::ofstream file("/tmp/flips_history.txt", std::ios::app); + if (file.is_open()) { file << (normal_execution ? "normal" : flip) << "\n"; } + file.close(); + } + } + if (load_mode) { + operations = SequenceGenerator::load_from_file(fmt::format("/tmp/operations_{}.txt", round)); + } else { + operations = generator.generateOperations(num_entries_per_rounds, renew_btree_after_crash /* reset */); + if (save_mode) { + SequenceGenerator::save_to_file(fmt::format("/tmp/operations_{}.txt", round), operations); + } + } + LOGINFO("Lets before crash print operations\n{}", SequenceGenerator::printOperations(operations)); for (auto [k, _] : operations) { - // LOGINFO("\t\t\t\t\t\t\t\t\t\t\t\t\tupserting entry {}", k); this->put(k, btree_put_type::INSERT, true /* expect_success */); + if (!time_to_stop()) { + static bool print_alert = false; + if (print_alert) { + LOGINFO("It is time to stop but let's finish this round and then stop!"); + print_alert = false; + } + } + } + if (normal_execution) { + if (clean_shutdown) { + this->m_shadow_map.save(this->m_shadow_filename); + this->restart_homestore(); + } else { + test_common::HSTestHelper::trigger_cp(true); + this->get_all(); + } + } else { + this->crash_and_recover(operations, fmt::format("long_tree_{}", round)); } - this->crash_and_recover(operations/*, fmt::format("recover_tree_crash_{}.dot", i + 1)*/); if (elapsed_time - last_progress_time > 30) { last_progress_time = elapsed_time; print_time = true; } if (print_time) { - LOGINFO("\n\n\n\t\t\tProgress: {} iterations completed - Elapsed time: {:.0f} seconds of total " - "{} ({:.2f}%)\n\n\n", - i, elapsed_time, this->m_run_time, elapsed_time * 100.0 / this->m_run_time); + LOGINFO("\n\n\n\t\t\tProgress: {} rounds of total {} ({:.2f}%) completed - Elapsed time: {:.0f} seconds of " + "total {} ({:.2f}%) - {} keys of maximum {} keys ({:.2f}%) inserted\n\n\n", + round, rounds, round * 100.0 / rounds, elapsed_time, this->m_run_time, + elapsed_time * 100.0 / this->m_run_time, this->tree_key_count(), num_entries, + this->tree_key_count() * 100.0 / num_entries); } - this->print_keys(fmt::format("reapply: after iteration {}", i)); - + this->print_keys(fmt::format("reapply: after round {}", round)); + if (renew_btree_after_crash) { this->reset_btree(); }; } } #endif diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 1ef55302b..141fcf5e2 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -48,6 +48,8 @@ SISL_OPTION_GROUP( ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (max_keys_in_node, "", "max_keys_in_node", "max_keys_in_node", + ::cxxopts::value< uint32_t >()->default_value("0"), ""), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number"), (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) @@ -102,6 +104,9 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { void SetUp() override { BtreeTestHelper< TestType >::SetUp(); +#ifdef _PRERELEASE + this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); +#endif this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } }; @@ -300,6 +305,9 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin .hugepage_size_mb = 0}); BtreeTestHelper< TestType >::SetUp(); +#ifdef _PRERELEASE + this->m_cfg.m_max_keys_in_node = SISL_OPTIONS["max_keys_in_node"].as< uint32_t >(); +#endif this->m_bt = std::make_shared< typename T::BtreeType >(this->m_cfg); } diff --git a/src/tests/test_scripts/index_test.py b/src/tests/test_scripts/index_test.py index 4e4814ccb..02c3e4c2c 100755 --- a/src/tests/test_scripts/index_test.py +++ b/src/tests/test_scripts/index_test.py @@ -20,11 +20,13 @@ def run_test(options, type): raise TestFailedError(f"Test failed for type {type}") print("Test completed") + def run_crash_test(options): - cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --max_keys_in_node={options['max_keys_in_node']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} {options['dev_list']}" + cmd_opts = f"--gtest_filter=IndexCrashTest/0.long_running_put_crash --gtest_break_on_failure --log_mods=wbcache:trace --max_keys_in_node={options['max_keys_in_node']} --num_entries_per_rounds={options['num_entries_per_rounds']} --init_device={options['init_device']} {options['log_mods']} --run_time={options['run_time']} --num_entries={options['num_entries']} --num_rounds={options['num_rounds']} {options['dev_list']} " # print(f"Running test with options: {cmd_opts}") try: - subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, shell=True) + subprocess.check_call(f"{options['dirpath']}test_index_crash_recovery {cmd_opts}", stderr=subprocess.STDOUT, + shell=True) except subprocess.CalledProcessError as e: print(f"Test failed: {e}") raise TestFailedError(f"Test failed for type {type}") @@ -49,7 +51,9 @@ def parse_arguments(): parser.add_argument('--dev_list', help='Device list', default='') parser.add_argument('--cleanup_after_shutdown', help='Cleanup after shutdown', type=bool, default=False) parser.add_argument('--init_device', help='Initialize device', type=bool, default=True) - parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=20) + parser.add_argument('--max_keys_in_node', help='Maximum num of keys in btree nodes', type=int, default=5) + parser.add_argument('--num_rounds', help='number of rounds for crash test', type=int, default=10000) + parser.add_argument('--num_entries_per_rounds', help='number of rounds for crash test', type=int, default=60) # Parse the known arguments and ignore any unknown arguments args, unknown = parser.parse_known_args() @@ -73,7 +77,6 @@ def long_runnig_index(options, type=0): def long_running_clean_shutdown(options, type=0): print("Long running clean shutdown started") - options['run_time'] = int(options['run_time']) // 10 # 20 minutes try: run_test(options, type) @@ -87,14 +90,18 @@ def long_running_clean_shutdown(options, type=0): raise print("Long running clean shutdown completed") + def long_running_crash_put(options): print("Long running crash put started") - options['num_entries'] = 20480 # 20K + options['num_entries'] = 131072 # 128K options['init_device'] = True + options['run_time'] = 14400 # 4 hours + options['preload_size'] = 100 print(f"options: {options}") run_crash_test(options) print("Long running crash put completed") + def main(): options = parse_arguments() test_suite_name = options['test_suits']