diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 000000000..04e3372dd
Binary files /dev/null and b/.DS_Store differ
diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp
index f7a82b14a..4d65ad7e8 100644
--- a/src/include/homestore/btree/btree.hpp
+++ b/src/include/homestore/btree/btree.hpp
@@ -34,6 +34,12 @@ namespace homestore {
using BtreeNodePtr = boost::intrusive_ptr< BtreeNode >;
using BtreeNodeList = folly::small_vector< BtreeNodePtr, 3 >;
+struct BtreeVisualizeVariables{
+ uint64_t parent;
+ uint64_t midPoint;
+ uint64_t index;
+};
+
struct BtreeThreadVariables {
std::vector< btree_locked_node_info > wr_locked_nodes;
std::vector< btree_locked_node_info > rd_locked_nodes;
@@ -116,6 +122,7 @@ class Btree {
void print_tree(const std::string& file = "") const;
void print_tree_keys() const;
+ std::string visualize_tree_keys(const std::string& file) const;
uint64_t count_keys(bnodeid_t bnodeid) const;
nlohmann::json get_metrics_in_json(bool updated = true);
@@ -195,6 +202,9 @@ class Btree {
uint64_t get_child_node_cnt(bnodeid_t bnodeid) const;
void to_string(bnodeid_t bnodeid, std::string& buf) const;
void to_string_keys(bnodeid_t bnodeid, std::string& buf) const;
+ void to_dot_keys(bnodeid_t bnodeid, std::string& buf,
+ std::map< uint32_t, std::vector< uint64_t > >& l_map,
+ std::map< uint64_t, BtreeVisualizeVariables >& info_map) const;
void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
void print_node(const bnodeid_t& bnodeid) const;
diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp
index 73279d138..4877b6701 100644
--- a/src/include/homestore/btree/btree.ipp
+++ b/src/include/homestore/btree/btree.ipp
@@ -314,7 +314,7 @@ void Btree< K, V >::print_tree(const std::string& file) const {
to_string(m_root_node_info.bnode_id(), buf);
m_btree_lock.unlock_shared();
- BT_LOG(INFO, "Pre order traversal of tree:\n<{}>", buf);
+ LOGINFO( "Pre order traversal of tree:\n<{}>", buf);
if (!file.empty()) {
std::ofstream o(file);
o.write(buf.c_str(), buf.size());
@@ -332,6 +332,46 @@ void Btree< K, V >::print_tree_keys() const {
LOGINFO("Pre order traversal of tree:\n<{}>", buf);
}
+template < typename K, typename V >
+std::string Btree< K, V >::visualize_tree_keys(const std::string& file) const {
+ std::map< uint32_t, std::vector< uint64_t > > level_map;
+ std::map< uint64_t, BtreeVisualizeVariables > info_map;
+ std::string buf = "digraph G\n"
+ "{ \n"
+ "ranksep = 3.0;\n"
+ R"(graph [splines="polyline"];
+)";
+
+ m_btree_lock.lock_shared();
+ to_dot_keys(m_root_node_info.bnode_id(), buf, level_map, info_map);
+ m_btree_lock.unlock_shared();
+ for (const auto& [child, info] : info_map) {
+ if (info.parent) {
+ buf += fmt::format(R"(
+ "{}":connector{} -> "{}":"key{}" [splines=false];)",
+ info.parent, info.index, child, info.midPoint);
+ }
+ }
+
+ std::string result;
+ for (const auto& [key, values] : level_map) {
+ result += "{rank=same; ";
+ std::vector< std::string > quotedValues;
+ std::transform(values.begin(), values.end(), std::back_inserter(quotedValues),
+ [](uint64_t value) { return fmt::format("\"{}\"", value); });
+
+ result += fmt::to_string(fmt::join(quotedValues, " ")) + "}\n";
+ }
+
+ buf += "\n" + result + " }\n";
+ if (!file.empty()) {
+ std::ofstream o(file);
+ o.write(buf.c_str(), buf.size());
+ o.flush();
+ }
+ return buf;
+}
+
template < typename K, typename V >
nlohmann::json Btree< K, V >::get_metrics_in_json(bool updated) {
return m_metrics.get_result_in_json(updated);
diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp
index edd895d89..d497287a0 100644
--- a/src/include/homestore/btree/detail/btree_common.ipp
+++ b/src/include/homestore/btree/detail/btree_common.ipp
@@ -168,6 +168,36 @@ void Btree< K, V >::to_string_keys(bnodeid_t bnodeid, std::string& buf) const {
unlock_node(node, acq_lock);
}
+template < typename K, typename V >
+void Btree< K, V >::to_dot_keys(bnodeid_t bnodeid, std::string& buf,
+ std::map< uint32_t, std::vector< uint64_t > >& l_map,
+ std::map< uint64_t, BtreeVisualizeVariables >& info_map) const {
+ BtreeNodePtr node;
+ locktype_t acq_lock = locktype_t::READ;
+
+ if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; }
+ fmt::format_to(std::back_inserter(buf), "{}\n", node->to_dot_keys());
+ l_map[node->level()].push_back(node->node_id());
+ info_map[node->node_id()].midPoint = node->is_leaf() ? 0 : node->total_entries() / 2;
+ if (!node->is_leaf()) {
+ uint32_t i = 0;
+ while (i < node->total_entries()) {
+ BtreeLinkInfo p;
+ node->get_nth_value(i, &p, false);
+ to_dot_keys(p.bnode_id(), buf, l_map, info_map);
+ info_map[p.bnode_id()].parent = node->node_id();
+ info_map[p.bnode_id()].index = i;
+ ++i;
+ }
+ if (node->has_valid_edge()) {
+ to_dot_keys(node->edge_id(), buf, l_map, info_map);
+ info_map[node->edge_id()].parent = node->node_id();
+ info_map[node->edge_id()].index = node->total_entries();
+ }
+ }
+ unlock_node(node, acq_lock);
+}
+
template < typename K, typename V >
uint64_t Btree< K, V >::count_keys(bnodeid_t bnodeid) const {
BtreeNodePtr node;
diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
index 209b35558..58844ba5c 100644
--- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
@@ -303,13 +303,24 @@ template < typename K, typename V >
template < typename ReqT >
bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const {
if (!node->is_leaf()) { // if internal node, size is atmost one additional entry, size of K/V
- return !node->has_room_for_put(btree_put_type::UPSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size());
+ return node->total_entries() >= 3;
} else if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) {
- return !node->has_room_for_put(req.m_put_type, req.first_key_size(), req.m_newval->serialized_size());
+ return node->total_entries() >= 3;
} else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) {
- return !node->has_room_for_put(req.m_put_type, req.key().serialized_size(), req.value().serialized_size());
+ return node->total_entries() >= 3;;
} else {
return false;
}
}
+//bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const {
+// if (!node->is_leaf()) { // if internal node, size is atmost one additional entry, size of K/V
+// return !node->has_room_for_put(btree_put_type::UPSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size());
+// } else if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) {
+// return !node->has_room_for_put(req.m_put_type, req.first_key_size(), req.m_newval->serialized_size());
+// } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) {
+// return !node->has_room_for_put(req.m_put_type, req.key().serialized_size(), req.value().serialized_size());
+// } else {
+// return false;
+// }
+//}
} // namespace homestore
diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp
index 73e8be5a8..f1a95372f 100644
--- a/src/include/homestore/btree/detail/btree_node.hpp
+++ b/src/include/homestore/btree/detail/btree_node.hpp
@@ -69,17 +69,23 @@ struct persistent_hdr_t {
persistent_hdr_t() : nentries{0}, leaf{0}, node_deleted{0} {}
std::string to_string() const {
- return fmt::format("magic={} version={} csum={} node_id={} next_node={} nentries={} node_type={} is_leaf={} "
- "node_deleted={} node_gen={} modified_cp_id={} link_version={} edge_nodeid={}, "
- "edge_link_version={} level={} ",
- magic, version, checksum, node_id, next_node, nentries, node_type, leaf, node_deleted,
- node_gen, modified_cp_id, link_version, edge_info.m_bnodeid, edge_info.m_link_version,
+ auto snext = (next_node == empty_bnodeid) ? "" : "next_node="+ std::to_string(next_node);
+ auto sedge = (edge_info.m_bnodeid == empty_bnodeid) ? "" : "edge_nodeid="+ std::to_string(edge_info.m_bnodeid);
+ auto sedgelink = (edge_info.m_bnodeid == empty_bnodeid) ? "" : "edge_link_version="+ std::to_string(edge_info.m_link_version);
+ return fmt::format("magic={} version={} csum={} node_id={} {} nentries={} node_type={} is_leaf={} "
+ "node_deleted={} node_gen={} modified_cp_id={} link_version={} {}, "
+ "{} level={} ",
+ magic, version, checksum, node_id, snext, nentries, node_type, leaf, node_deleted,
+ node_gen, modified_cp_id, link_version, sedge, sedgelink,
level);
}
std::string to_compact_string() const {
- return fmt::format("{} id={} next={} nentries={} {} level={}", (void*)this, node_id, next_node, nentries,
- (node_deleted == 0x1) ? "Deleted" : "", level);
+ auto snext = (next_node == empty_bnodeid) ? "" : "next="+ std::to_string(next_node);
+ auto sedge = (edge_info.m_bnodeid == empty_bnodeid) ? "" : "edge_nodeid="+ std::to_string(edge_info.m_bnodeid);
+ auto sleaf = leaf?"LEAF": "INTERIOR";
+ return fmt::format(" id={} {}{} {} nentries={} {} level={} modified_cp_id={}", node_id, snext,sedge, sleaf, nentries,
+ (node_deleted == 0x1) ? "Deleted" : "", level, modified_cp_id);
}
};
#pragma pack()
@@ -111,10 +117,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
// Identify if a node is a leaf node or not, from raw buffer, by just reading persistent_hdr_t
static bool identify_leaf_node(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->leaf; }
+ static std::string to_string_buf(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->to_compact_string(); }
static BtreeLinkInfo::bnode_link_info identify_edge_info(uint8_t* buf) {
return (r_cast< persistent_hdr_t* >(buf))->edge_info;
}
- static std::string to_string_buf(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->to_string(); }
static bool is_valid_node(sisl::blob const& buf) {
auto phdr = r_cast< persistent_hdr_t const* >(buf.cbytes());
@@ -385,6 +391,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
virtual std::string to_string(bool print_friendly = false) const = 0;
virtual std::string to_string_keys(bool print_friendly = false) const = 0;
+ virtual std::string to_dot_keys() const = 0;
protected:
node_find_result_t bsearch_node(const BtreeKey& key) const {
diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp
index 486dec6f1..3bdd06ba7 100644
--- a/src/include/homestore/btree/detail/prefix_node.hpp
+++ b/src/include/homestore/btree/detail/prefix_node.hpp
@@ -645,6 +645,7 @@ class FixedPrefixNode : public VariantNode< K, V > {
}
std::string to_string_keys(bool print_friendly = false) const override { return "NOT Supported"; }
+ std::string to_dot_keys() const override { return "NOT Supported"; }
private:
uint16_t add_prefix(BtreeKey const& key, BtreeValue const& val) {
diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp
index d909c202f..cb4e02291 100644
--- a/src/include/homestore/btree/detail/simple_node.hpp
+++ b/src/include/homestore/btree/detail/simple_node.hpp
@@ -207,13 +207,16 @@ class SimpleNode : public VariantNode< K, V > {
}
std::string to_string(bool print_friendly = false) const override {
- auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ",
+ auto snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
+ auto str = fmt::format("{}id={} level={} nEntries={} {} {} ",
(print_friendly ? "------------------------------------------------------------\n" : ""),
this->node_id(), this->level(), this->total_entries(),
- (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode());
+ (this->is_leaf() ? "LEAF" : "INTERIOR"), snext);
if (!this->is_leaf() && (this->has_valid_edge())) {
- fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
- this->edge_info().m_link_version);
+ auto sedge = this->has_valid_edge() ? "edge:" + std::to_string(this->edge_info().m_bnodeid) + "." +
+ std::to_string(this->edge_info().m_link_version)
+ : "";
+ fmt::format_to(std::back_inserter(str), "{}", sedge);
}
for (uint32_t i{0}; i < this->total_entries(); ++i) {
@@ -226,8 +229,8 @@ class SimpleNode : public VariantNode< K, V > {
std::string to_string_keys(bool print_friendly = false) const override {
// FIXME: Implement this, key may not be a unit32_t
- return "";
-#if 0
+// return "";
+//#if 0
std::string delimiter = print_friendly ? "\n" : "\t";
std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
auto str = fmt::format("{}{}.{} level:{} nEntries={} {} {} node_gen={} ",
@@ -285,7 +288,93 @@ class SimpleNode : public VariantNode< K, V > {
fmt::format_to(std::back_inserter(str), "-{}]", cur_key);
}
return str;
-#endif
+//#endif
+ }
+
+ std::string to_dot_keys() const override {
+ std::string str;
+ std::string snext = this->next_bnode() == empty_bnodeid? "": fmt::format("next_node={}", this->next_bnode());
+ str += fmt::format(R"("{}" [
+ shape = none,
+ labelloc="c",
+ fontsize=25,
+ label = <
+ )",
+ this->node_id());
+ if (this->total_entries() == 0) {
+ return str + fmt::format(R"(
+ | E |
+ |
>])");
+ }
+
+ if (!this->is_leaf()) {
+ // str += " ";
+ for (uint32_t i{0}; i < this->total_entries(); ++i) {
+ uint32_t cur_key = get_nth_key< K >(i, false).key();
+ BtreeLinkInfo child_info;
+ get_nth_value(i, &child_info, false /* copy */);
+ str += fmt::format(R"(
+ | {}.{} | )",
+ i, i, cur_key, child_info.link_version());
+ }
+ std::string sedge = this->has_valid_edge() ? "edge:" + std::to_string( this->edge_info().m_bnodeid) + "." + std::to_string( this->edge_info().m_link_version):"";
+ str += fmt::format(R"(
+ |
+ {}.{} gen={} {} {} |
>];)",this->total_entries(),this->node_id(), this->link_version(), this->node_gen(), snext, sedge );
+
+
+ } else {
+ std::string keys_buf = "";
+ uint32_t prev_key = get_nth_key< K >(0, false).key();
+ uint32_t cur_key = prev_key;
+ uint32_t last_key = get_nth_key< K >(this->total_entries() - 1, false).key();
+ if (last_key - prev_key == this->total_entries() - 1) {
+ if (this->total_entries() == 1) {
+ keys_buf += fmt::format(R"(
+ | {} | )",
+ 0, 0, cur_key);
+ } else {
+ keys_buf += fmt::format(R"(
+ | {}-{} | )",
+ 0, 0, prev_key, last_key);
+ }
+ keys_buf += fmt::format(R"(
+ |
+ {}.{} gen={} {} |
+ >];)",
+ 1, this->node_id(), this->link_version(), this->node_gen(), snext);
+ return str + keys_buf;
+ }
+
+ keys_buf += fmt::format(R"(
+ " | {})",
+ 0, 0, prev_key);
+ uint32_t start_interval_key = prev_key;
+ for (uint32_t i{1}; i < this->total_entries(); ++i) {
+ cur_key = get_nth_key< K >(i, false).key();
+ if (cur_key != prev_key + 1) {
+ if (start_interval_key == prev_key) {
+ keys_buf += fmt::format(" {}", cur_key);
+ } else {
+ keys_buf += fmt::format("-{} {}", prev_key, cur_key);
+ }
+ start_interval_key = cur_key;
+ }
+ prev_key = cur_key;
+ }
+
+ if (start_interval_key == prev_key) {
+ keys_buf += fmt::format(" | ");
+ } else {
+ keys_buf += fmt::format(" {}", cur_key);
+ }
+ keys_buf += fmt::format(R"(
+ |
+ {}.{} gen={} {} | >];)",
+ 1, this->node_id(), this->link_version(), this->node_gen(), snext);
+ return str + keys_buf;
+ }
+ return str;
}
#ifndef NDEBUG
diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp
index e98b63a52..120838dce 100644
--- a/src/include/homestore/btree/detail/varlen_node.hpp
+++ b/src/include/homestore/btree/detail/varlen_node.hpp
@@ -567,6 +567,7 @@ class VariableNode : public VariantNode< K, V > {
#endif
return {};
}
+ std::string to_dot_keys() const override { return "NOT Supported"; }
/*int compare_nth_key_range(const BtreeKeyRange& range, uint32_t ind) const {
return get_nth_key(ind, false).compare_range(range);
diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp
index d813fa4e2..09e944f18 100644
--- a/src/include/homestore/index/index_internal.hpp
+++ b/src/include/homestore/index/index_internal.hpp
@@ -122,6 +122,7 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > {
bool is_meta_buf() const { return m_is_meta_buf; }
std::string to_string() const;
+ std::string to_string_dot() const;
};
// This is a special buffer which is used to write to the meta block
diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index ec6f7a7bc..50b69ac0d 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -108,18 +108,25 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}
void repair_node(IndexBufferPtr const& idx_buf) override {
- BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), true,
+ BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */,
BtreeNode::identify_leaf_node(idx_buf->raw_buffer()));
static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf);
auto cpg = cp_mgr().cp_guard();
+ //m_dirtied_cp_id was set in -1. It is time to make it dirty
+ idx_buf->m_dirtied_cp_id = cpg->id();
+ LOGTRACEMOD(wbcache, "repair_node cp {} {}", cpg->id(), idx_buf->to_string());
repair_links(BtreeNodePtr{n}, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
}
void repair_root(IndexBufferPtr const& root_buf) override {
- BtreeNode* n = this->init_node(root_buf->raw_buffer(), root_buf->blkid().to_integer(), true,
+ BtreeNode* n = this->init_node(root_buf->raw_buffer(), root_buf->blkid().to_integer(), false /* init_buf */,
BtreeNode::identify_leaf_node(root_buf->raw_buffer()));
static_cast< IndexBtreeNode* >(n)->attach_buf(root_buf);
auto cpg = cp_mgr().cp_guard();
+ //m_dirtied_cp_id was set in -1. It is time to make it dirty
+ root_buf->m_dirtied_cp_id = cpg->id();
+ LOGTRACEMOD(wbcache, "repair_root cp {} {}", cpg->id(), root_buf->to_string());
+
on_root_changed(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
}
@@ -140,18 +147,20 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY);
if (prev_state == index_buf_state_t::CLEAN) {
// It was clean before, dirtying it first time, add it to the wb_cache list to flush
- BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(),
- "Writing a node which was not acquired by this cp");
+ if(idx_node->m_idx_buf->m_dirtied_cp_id !=-1) {
+ BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(),
+ "Writing a node which was not acquired by this cp");
+ }
wb_cache().write_buf(node, idx_node->m_idx_buf, cp_ctx);
+ node->set_checksum();
+ node->set_modified_cp_id(cp_ctx->id());
LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), idx_node->m_idx_buf->to_string());
} else {
BT_DBG_ASSERT_NE(
(int)prev_state, (int)index_buf_state_t::FLUSHING,
"Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}",
- cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id)
+ cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id);
}
- node->set_checksum();
- node->set_modified_cp_id(cp_ctx->id());
return btree_status_t::success;
}
@@ -168,7 +177,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
new_node_bufs.push_back(static_cast< IndexBtreeNode* >(right_child_node.get())->m_idx_buf);
}
write_node_impl(left_child_node, context);
- write_node_impl(parent_node, context);
+ // during recovery it is possible that there is no parent_node
+ if (parent_node.get() != nullptr) {write_node_impl(parent_node, context); }
IndexBufferPtrList freed_node_bufs;
for (const auto& freed_node : freed_nodes) {
@@ -176,7 +186,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
this->free_node(freed_node, locktype_t::WRITE, context);
}
- wb_cache().transact_bufs(ordinal(), parent_buf, left_child_buf, new_node_bufs, freed_node_bufs, cp_ctx);
+ wb_cache().transact_bufs(ordinal(), parent_node.get() ? parent_buf : nullptr, left_child_buf, new_node_bufs, freed_node_bufs, cp_ctx);
return btree_status_t::success;
}
@@ -218,7 +228,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}
btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) {
- BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->node_id());
+ BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string());
// Get the last key in the node
auto const last_parent_key = parent_node->get_last_key< K >();
@@ -252,7 +262,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
BtreeNodeList new_parent_nodes;
do {
if (child_node->has_valid_edge()) {
- BT_DBG_ASSERT(!is_parent_edge_node,
+ BT_DBG_ASSERT(is_parent_edge_node,
"Child node={} is an edge node but parent_node={} is not an edge node",
child_node->node_id(), parent_node->node_id());
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
@@ -288,6 +298,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
// Move to the next child node
+ this->unlock_node(child_node, locktype_t::READ);
auto const next_node_id = child_node->next_bnode();
if (next_node_id == empty_bnodeid) {
BT_LOG_ASSERT(
@@ -305,6 +316,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
break;
}
} while (true);
+ this->unlock_node(child_node, locktype_t::READ);
if (ret == btree_status_t::success) {
ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx);
diff --git a/src/include/homestore/index/wb_cache_base.hpp b/src/include/homestore/index/wb_cache_base.hpp
index d576c2b6b..4624f9444 100644
--- a/src/include/homestore/index/wb_cache_base.hpp
+++ b/src/include/homestore/index/wb_cache_base.hpp
@@ -64,6 +64,7 @@ class IndexWBCacheBase {
/// @param cur_buf
/// @return
// virtual IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* context) const = 0;
+ virtual void recover(sisl::byte_view sb) = 0;
};
} // namespace homestore
diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp
index 730dc2f30..ce21fdee9 100644
--- a/src/lib/index/index_cp.cpp
+++ b/src/lib/index/index_cp.cpp
@@ -81,7 +81,7 @@ std::optional< IndexBufferPtr > IndexCPContext::next_dirty() {
}
std::string IndexCPContext::to_string() {
- std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}", m_cp->id(),
+ std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}\n", m_cp->id(),
m_dirty_buf_count.get(), m_dirty_buf_list.size())};
// Mapping from a node to all its parents in the graph.
@@ -108,6 +108,37 @@ std::string IndexCPContext::to_string() {
return str;
}
+void IndexCPContext::to_string_dot(const std::string& filename) {
+ std::ofstream file(filename);
+ if (!file.is_open()) {
+ throw std::runtime_error("Failed to open file: " + filename);
+ }
+
+ file << "digraph G {\n";
+
+ // Mapping from a node to all its parents in the graph.
+ std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents;
+
+ m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) {
+ // Add this buf to his children.
+ parents[buf->m_up_buffer.get()].emplace_back(buf.get());
+ });
+ m_dirty_buf_list.foreach_entry([&file, &parents, this](IndexBufferPtr buf) {
+ std::vector colors={"lightgreen", "lightcoral", "lightyellow"};
+ auto sbuf = BtreeNode::to_string_buf(buf->raw_buffer());
+ auto pos = sbuf.find("LEAF");
+ if (pos != std::string::npos) {sbuf.insert(pos+4, "
");}else {pos = sbuf.find("INTERIOR"); if (pos != std::string::npos) { sbuf.insert(pos + 8, "
"); }}
+ file << fmt::format("\"{}\" [shape={}, label=< {}
{} >, fillcolor=\"{}\", style=\"filled\", fontname=\"bold\"];\n", r_cast< void* >(buf.get()),
+ m_cp->id()==buf->m_created_cp_id?"ellipse":"box", buf->to_string_dot(),sbuf,colors[s_cast< int >(buf->state())]);
+ for (const auto& p : parents[buf.get()]) {
+ file << fmt::format("\"{}\" -> \"{}\";\n", r_cast< void* >(p), r_cast< void* >(buf.get()));
+ }
+ });
+ file << "}\n";
+
+ file.close();
+}
+
std::string IndexCPContext::to_string_with_dags() {
struct DagNode {
IndexBufferPtr buf;
@@ -215,6 +246,22 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId,
if (up_buf) {
DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer");
auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf;
+
+#ifndef NDEBUG
+ // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;}
+ // Already linked with same buf or its not a sibling link to override
+ bool found{false};
+ for (auto const& dbuf : real_up_buf->m_down_buffers) {
+ if (dbuf.lock() == buf) {
+ found = true;
+ break;
+ }
+ }
+ if (found){
+ return buf;
+ }
+ real_up_buf->m_down_buffers.emplace_back(buf);
+#endif
real_up_buf->m_wait_for_down_buffers.increment(1);
buf->m_up_buffer = real_up_buf;
}
diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp
index ab72a0b08..1b8a2a2b0 100644
--- a/src/lib/index/index_cp.hpp
+++ b/src/lib/index/index_cp.hpp
@@ -162,6 +162,7 @@ struct IndexCPContext : public VDevCPContext {
std::optional< IndexBufferPtr > next_dirty();
std::string to_string();
std::string to_string_with_dags();
+ void to_string_dot(const std::string& filename);
private:
void check_cycle();
diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp
index cf2e589a5..0d2712db5 100644
--- a/src/lib/index/index_service.cpp
+++ b/src/lib/index/index_service.cpp
@@ -72,17 +72,17 @@ void IndexService::itable_meta_blk_found(const sisl::byte_view& buf, void* meta_
// IndexTable instance. This callback will be called twice, first time when it boots before IndexService starts and
// once as part of IndexService start, will read all metablks and then call this. We process everything after
// IndexService has been started, because writeback cache would have recovered all nodes by now.
- if (m_wb_cache) {
- superblk< index_table_sb > sb;
- sb.load(buf, meta_cookie);
- add_index_table(m_svc_cbs->on_index_table_found(std::move(sb)));
- }
+// At this moment: m_wb_cache is nullptr but add_index_table to populate the table, ow, recovery path has no table to recover and repair_node does nothing
+ superblk< index_table_sb > sb;
+ sb.load(buf, meta_cookie);
+ add_index_table(m_svc_cbs->on_index_table_found(std::move(sb)));
}
void IndexService::start() {
// Start Writeback cache
- m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, std::move(m_wbcache_sb), hs()->evictor(),
+ m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, m_wbcache_sb, hs()->evictor(),
hs()->device_mgr()->atomic_page_size(HSDevType::Fast));
+ m_wb_cache->recover(m_wbcache_sb.second);
// Read the index metablks again
meta_service().read_sub_sb("index");
@@ -151,11 +151,35 @@ std::string IndexBuffer::to_string() const {
voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()),
m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed);
} else {
- return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={} {} node=[{}] ",
+ // store m_down_buffers in a string
+
+ std::string down_bufs="";
+#ifndef NDEBUG
+ for (auto const& down_buf : m_down_buffers) {
+ if (auto ptr = down_buf.lock()) {
+ fmt::format_to(std::back_inserter(down_bufs), " [{}]", voidptr_cast(ptr.get()));
+ }
+ }
+#endif
+
+ return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={} {} up_buffer={} node=[{}] down_buffers=[{}]",
voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()),
m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed ? "Freed" : "",
- r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string());
+ voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())), (m_bytes == nullptr) ? " not attached yet" :r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(), down_bufs);
+ }
+}
+std::string IndexBuffer::to_string_dot() const {
+ auto str = fmt::format("IndexBuffer {} ", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this)));
+ if (m_bytes == nullptr) {
+ fmt::format_to(std::back_inserter(str), " node_buf=nullptr ");
+ } else {
+// fmt::format_to(std::back_inserter(str), " node_buf={}", static_cast< void* >(m_node_buf->m_bytes));
+ fmt::format_to(std::back_inserter(str), " node_buf={} {} created/dirtied={}/{} {} down_wait#={}",
+ static_cast< void* >(m_bytes), m_is_meta_buf?"[META]":"", m_created_cp_id, m_dirtied_cp_id, m_node_freed?"FREED":"", m_wait_for_down_buffers.get());
}
+ // fmt::format_to(std::back_inserter(str), "
next_buffer={}",
+ // m_next_buffer.lock() ? reinterpret_cast< void* >(m_next_buffer.lock().get()) : 0);
+ return str;
}
MetaIndexBuffer::MetaIndexBuffer(superblk< index_table_sb >& sb) : IndexBuffer{nullptr, BlkId{}}, m_sb{sb} {
diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index dcece7ffc..b5e32e00c 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -54,7 +54,6 @@ IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair<
// We need to register the consumer first before recovery, so that recovery can use the cp_ctx created to add/track
// recovered new nodes.
cp_mgr().register_consumer(cp_consumer_t::INDEX_SVC, std::move(std::make_unique< IndexCPCallbacks >(this)));
- recover(std::move(sb.second));
}
void IndexWBCache::start_flush_threads() {
@@ -184,6 +183,7 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte
new_buf->m_dirtied_cp_id = cp_ctx->id();
write_buf(nullptr, new_buf, cp_ctx);
meta_buf = new_buf; // Replace the meta_buf with new buf
+ LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id());
}
return true;
}
@@ -193,7 +193,7 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con
IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) {
// TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a
// bunch of times.
- if (parent_buf->is_meta_buf()) {
+ if (parent_buf && parent_buf->is_meta_buf()) {
// Split or merge happening on root
if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) {
parent_buf->set_crash_flag();
@@ -258,8 +258,10 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p
if (new_node_bufs.empty() && freed_node_bufs.empty()) {
// This is an update for meta, root transaction.
- DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(),
- "Root buffer is not created by current cp (for split root), its not expected");
+ if(child_buf->m_created_cp_id != -1) {
+ DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(),
+ "Root buffer is not created by current cp (for split root), its not expected");
+ }
icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {});
} else {
icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal
@@ -270,6 +272,12 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p
freed_node_bufs // free_node_bufs
);
}
+#if 0
+ static int id = 0;
+ auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot";
+ LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename);
+ icp_ctx->to_string_dot(filename);
+#endif
}
void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& down_buf, bool is_sibling_link,
@@ -285,7 +293,7 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
if (up_buf->m_created_cp_id == icp_ctx->id()) {
real_up_buf = up_buf->m_up_buffer;
HS_DBG_ASSERT(real_up_buf,
- "Up buffer is newly created in this cp, but it doesn't have its own up_buffer, its not expected");
+ "Up buffer is newly created in this cp, but it doesn't have its own up_buffer, its not expected");
}
// Condition 2: If down_buf already has an up_buf, we can override it newly passed up_buf it only in case of
@@ -317,7 +325,7 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
// This link is acheived by unconditionally changing the link in case of is_sibling=true to passed up_buf, but
// conditionally do it in case of parent link where it already has a link don't override it.
if (down_buf->m_up_buffer != nullptr) {
- HS_DBG_ASSERT_LT(down_buf->m_up_buffer->m_created_cp_id, icp_ctx->id(),
+ HS_DBG_ASSERT_LT(down_buf->m_up_buffer->m_created_cp_id, icp_ctx->id(),
"down_buf=[{}] up_buffer=[{}] should never have been created on same cp",
down_buf->to_string(), down_buf->m_up_buffer->to_string());
@@ -326,6 +334,8 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
real_up_buf = down_buf->m_up_buffer;
HS_DBG_ASSERT(!real_up_buf->m_wait_for_down_buffers.testz(),
"Up buffer waiting count is zero, whereas down buf is already linked to up buf");
+ if(!(real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()))
+ { icp_ctx->to_string_dot ("crash5.dot"); }
HS_DBG_ASSERT((real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()),
"Up buffer is not modified by current cp, but down buffer is linked to it");
#ifndef NDEBUG
@@ -391,17 +401,38 @@ void IndexWBCache::recover(sisl::byte_view sb) {
// this blk again, so that it will not be reallocated for other node during repair.
m_vdev->commit_blk(buf->m_blkid);
new_bufs.push_back(buf);
+ }else{
+ // In th recovery path, an uncommited node has a set of commited down bufs and also a set of uncommited down bufs. In repair a node must wait ONLY for nodes that are the part of recovery path. So here, we corrected the down wait of up bufs for unreached path
+ if(was_node_committed(buf) && !was_node_committed(buf->m_up_buffer)){
+ LOGINFO(" \n\t\t Mehdi: buf {} was commited but up_buffer {} was not commited, hence discarded ", voidptr_cast(buf.get()), voidptr_cast(buf->m_up_buffer.get()));
+ buf->m_up_buffer->m_wait_for_down_buffers.decrement(1);
+ }
}
}
}
+#if 0
+ // I keep it here to see the down_waits are set correctly for up buffers
+ // list of all recovered bufs
+ std::string log = "\n\n\t\t\t\t\t recovered bufs (#of bufs = " + std::to_string(bufs.size()) +" ) \n";
+ for (auto const& [_, buf] : bufs) {
+ fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string());
+ }
+ LOGINFO("{}",log);
+ // list of new_bufs
+ std::string log2 = "\n\n\t\t\t\t\t new_bufs (#of new bufs " + std::to_string(new_bufs.size()) + " )\n";
+ for (auto const& buf : new_bufs) {
+ fmt::format_to(std::back_inserter(log2), "{}\n", buf->to_string());
+ }
+ LOGINFO("{}", log2);
+#endif
+
// Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be
// repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in
// do_repair flag.
for (auto const& buf : new_bufs) {
process_up_buf(buf->m_up_buffer, false /* do_repair */);
}
-
m_vdev->recovery_completed();
}
@@ -411,11 +442,19 @@ void IndexWBCache::process_up_buf(IndexBufferPtr const& buf, bool do_repair) {
// repair
buf->m_dirtied_cp_id = -1;
}
-
if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; }
// One of the down buffer indicated that it has to repair our node, so we issue a repair
- if (buf->m_dirtied_cp_id == -1) { index_service().repair_index_node(buf->m_index_ordinal, buf); }
+ if (buf->m_dirtied_cp_id == -1) {
+ if(buf->m_up_buffer->is_meta_buf()){
+ LOGTRACEMOD(wbcache, "repair_index_root for buf {}", buf->to_string());
+ index_service().repair_index_root(buf->m_index_ordinal, buf);
+ }else
+ {
+ LOGTRACEMOD(wbcache, "repair_index_node for buf {}", buf->to_string());
+ index_service().repair_index_node(buf->m_index_ordinal, buf);
+ }
+ }
// If there is an up buffer on next level, we need to process them and ask them to repair in case they were not
// written as part of this CP.
@@ -464,6 +503,10 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
}
return folly::makeFuture< bool >(true); // nothing to flush
}
+ // if is_crashed don't do anything
+ if (hs()->crash_simulator().is_crashed()) {
+ return folly::makeFuture< bool >(true); // nothing to flush
+ }
// First thing is to flush the new_blks created as part of the CP.
auto const& journal_buf = cp_ctx->journal_buf();
@@ -492,13 +535,24 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
}
void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) {
+#ifdef _PRERELEASE
+ if (hs()->crash_simulator().is_crashed()) {
+ LOGINFOMOD(wbcache, "crash simulation is ongoing");
+ return;
+ }
+#endif
+
LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string());
buf->set_state(index_buf_state_t::FLUSHING);
#ifdef _PRERELEASE
if (buf->m_crash_flag_on) {
- LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string());
+ std::string filename = "crash_buf_"+std::to_string(cp_ctx->id())+".dot";
+ LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename);
+ cp_ctx->to_string_dot(filename);
hs()->crash_simulator().crash();
+ cp_ctx->complete(true);
+ return;
}
#endif
@@ -526,6 +580,11 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const
}
void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) {
+ if (hs()->crash_simulator().is_crashed()) {
+ LOGINFOMOD(wbcache, "process_write_completion don't do anything for {}", buf->to_string());
+ cp_ctx->complete(true);
+ return;
+ }
LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string());
resource_mgr().dec_dirty_buf_size(m_node_size);
auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf);
diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp
index 8f08f73a1..d7b78800d 100644
--- a/src/lib/index/wb_cache.hpp
+++ b/src/lib/index/wb_cache.hpp
@@ -59,7 +59,7 @@ class IndexWBCache : public IndexWBCacheBase {
//////////////////// CP Related API section /////////////////////////////////
folly::Future< bool > async_cp_flush(IndexCPContext* context);
IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* cp_ctx) const;
- void recover(sisl::byte_view sb);
+ void recover(sisl::byte_view sb) override;
private:
void start_flush_threads();
diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp
index 950884903..cb4a20e88 100644
--- a/src/tests/btree_helpers/btree_test_helper.hpp
+++ b/src/tests/btree_helpers/btree_test_helper.hpp
@@ -274,7 +274,7 @@ struct BtreeTestHelper {
qreq.enable_route_tracing();
auto const ret = m_bt->query(qreq, out_vector);
auto const expected_count = std::min(remaining, batch_size);
-
+ this->print_keys();
ASSERT_EQ(out_vector.size(), expected_count) << "Received incorrect value on query pagination";
if (remaining < batch_size) {
@@ -376,6 +376,7 @@ struct BtreeTestHelper {
void print(const std::string& file = "") const { m_bt->print_tree(file); }
void print_keys() const { m_bt->print_tree_keys(); }
+ void visualize_keys(const std::string &file) const { m_bt->visualize_tree_keys( file ); }
void compare_files(const std::string& before, const std::string& after) {
std::ifstream b(before, std::ifstream::ate);
diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp
index 7d7589946..9818c8a45 100644
--- a/src/tests/btree_helpers/shadow_map.hpp
+++ b/src/tests/btree_helpers/shadow_map.hpp
@@ -3,6 +3,7 @@
#include "btree_test_kvs.hpp"
+
template < typename K, typename V >
class ShadowMap {
private:
@@ -11,7 +12,11 @@ class ShadowMap {
uint32_t m_max_keys;
using mutex = iomgr::FiberManagerLib::shared_mutex;
mutex m_mutex;
-
+//#define SHOWM(X) cout << #X " = " << (X) << endl
+// void testPrint(std::map< uint32_t, std::string >& m_map, int i) {
+// SHOWM(m[i]);
+// SHOWM(m.find(i)->first);
+// }
public:
ShadowMap(uint32_t num_keys) : m_range_scheduler(num_keys), m_max_keys{num_keys} {}
@@ -163,7 +168,7 @@ class ShadowMap {
}
while (it2 != other.m_map.end()) {
- ret_diff.emplace_back(it1->first, false /* addition */);
+ ret_diff.emplace_back(it2->first, false /* addition */);
++it2;
}
return ret_diff;
diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp
index ec0985130..c4c90fd73 100644
--- a/src/tests/test_index_crash_recovery.cpp
+++ b/src/tests/test_index_crash_recovery.cpp
@@ -125,18 +125,36 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
this->get_all();
}
- void restart_homestore(uint32_t shutdown_delay_sec = 5) override {
+ void restart_homestore(uint32_t shutdown_delay_sec = 3) override {
this->params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this);
+ LOGINFO("\n\n\n\n\n\n shutdown homestore for index service Test\n\n\n\n\n");
+// this->m_shadow_map.save(this->m_shadow_filename);
test_common::HSTestHelper::restart_homestore(shutdown_delay_sec);
+
}
void reapply_after_crash() {
ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()};
snapshot_map.load(m_shadow_filename);
+ LOGINFO("\t\t\t\tMehdi: snapshot before crash\n{}", snapshot_map.to_string());
auto diff = this->m_shadow_map.diff(snapshot_map);
+ std::string dif_str="KEY \tADDITION\n";
+ for (const auto& [k, addition] : diff) {
+ dif_str += fmt::format(" {} \t{}\n", k.key(), addition);
+ }
+ // visualize tree after crash
+ std::string recovered_tree_filename = "tree_after_crash_" + to_string(rand()%100)+".dot";
+ this->visualize_keys(recovered_tree_filename);
+ LOGINFO(" tree after recovered stored in {}", recovered_tree_filename );
+ test_common::HSTestHelper::trigger_cp(true);
+ LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str);
for (const auto& [k, addition] : diff) {
+ LOGINFO("\t\n\n reapply: before inserting key {}", k.key());
+ this->print_keys();
+ this->visualize_keys(recovered_tree_filename);
if (addition) { this->force_upsert(k.key()); }
}
+ test_common::HSTestHelper::trigger_cp(true);
this->m_shadow_map.save(m_shadow_filename);
}
@@ -157,6 +175,22 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
BtreeTestHelper< TestType >::TearDown();
this->shutdown_homestore(false);
}
+ void crash_and_recover(uint32_t s_key, uint32_t e_key){
+ this->print_keys();
+ test_common::HSTestHelper::trigger_cp(false);
+ this->wait_for_crash_recovery();
+ this->visualize_keys("tree_after_crash_"+std::to_string(s_key)+"_"+std::to_string(e_key)+".dot");
+ this->print_keys();
+ this->reapply_after_crash();
+
+ this->get_all();
+ LOGINFO(" except to have [{},{}) in tree and it is actually{} ", s_key, e_key,tree_key_count());
+ ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id()))
+ << "shadow map size and tree size mismatch";
+ }
+ uint32_t tree_key_count(){
+ return this->m_bt->count_keys(this->m_bt->root_node_id());
+ }
protected:
const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt";
@@ -192,6 +226,8 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) {
// Trigger the cp to make sure middle part is successful
LOGINFO("Step 2: Flush all the entries so far");
test_common::HSTestHelper::trigger_cp(true);
+ this->get_all();
+ this->m_shadow_map.save(this->m_shadow_filename);
// Now fill the entries from first and the leftmost child will always split, with crash flip set during flush phase
LOGINFO("Step 3: Fill the 3rd quarter of the tree, to make sure left child is split and we crash on flush of the "
@@ -200,40 +236,38 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) {
for (auto k = num_entries / 2; k < num_entries * 3 / 4; ++k) {
this->put(k, btree_put_type::INSERT, true /* expect_success */);
}
-
- // Post crash, load the shadow_map into a new instance and compute the diff. Redo the operation
LOGINFO("Step 4: Post crash we reapply the missing entries to tree");
- test_common::HSTestHelper::trigger_cp(false);
- this->wait_for_crash_recovery();
- this->reapply_after_crash();
+ this->crash_and_recover(num_entries / 2, num_entries);
+
-#if 0
// TODO: Uncomment this once we do a fix for the inconsistent query results
LOGINFO("Step 5: Fill the 2nd quarter of the tree, to make sure left child is split and we crash on flush of the "
"left child");
this->set_basic_flip("crash_flush_on_split_at_left_child");
+ this->visualize_keys("tree_before_insert.dot");
for (auto k = num_entries / 4; k < num_entries / 2; ++k) {
+ LOGINFO("inserting key {}", k);
+ this->visualize_keys("tree_before_"+to_string(k)+".dot");
this->put(k, btree_put_type::INSERT, true /* expect_success */);
}
+ this->visualize_keys("tree_before_crash.dot");
+ this->print_keys();
+ this->print();
LOGINFO("Step 6: Post crash we reapply the missing entries to tree");
- test_common::HSTestHelper::trigger_cp(false);
- this->wait_for_crash_recovery();
- this->reapply_after_crash();
+ this->crash_and_recover(num_entries / 4, num_entries);
- LOGINFO("Step 7: Fill the 1st quarter of the tree, to make sure left child is split and we crash on flush of the "
- "parent node");
- this->set_basic_flip("crash_flush_on_split_at_parent");
- for (auto k = 0u; k < num_entries / 4; ++k) {
- this->put(k, btree_put_type::INSERT, true /* expect_success */);
- }
- LOGINFO("Step 8: Post crash we reapply the missing entries to tree");
- test_common::HSTestHelper::trigger_cp(false);
- this->wait_for_crash_recovery();
- this->reapply_after_crash();
-#endif
+// LOGINFO("Step 7: Fill the 1st quarter of the tree, to make sure left child is split and we crash on flush of the "
+// "parent node");
+// this->set_basic_flip("crash_flush_on_split_at_parent");
+// for (auto k = 0u; k <= num_entries / 4; ++k) {
+// this->put(k, btree_put_type::INSERT, true /* expect_success */);
+// }
+// LOGINFO("Step 8: Post crash we reapply the missing entries to tree");
+// this->crash_and_recover(0, num_entries);
LOGINFO("Step 9: Query all entries and validate with pagination of 80 entries");
this->query_all_paginate(80);
+
}
int main(int argc, char* argv[]) {