diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp
index 4d65ad7e8..87e7eb351 100644
--- a/src/include/homestore/btree/btree.hpp
+++ b/src/include/homestore/btree/btree.hpp
@@ -34,7 +34,7 @@ namespace homestore {
using BtreeNodePtr = boost::intrusive_ptr< BtreeNode >;
using BtreeNodeList = folly::small_vector< BtreeNodePtr, 3 >;
-struct BtreeVisualizeVariables{
+struct BtreeVisualizeVariables {
uint64_t parent;
uint64_t midPoint;
uint64_t index;
@@ -120,8 +120,8 @@ class Btree {
virtual std::pair< btree_status_t, uint64_t > destroy_btree(void* context);
nlohmann::json get_status(int log_level) const;
- void print_tree(const std::string& file = "") const;
- void print_tree_keys() const;
+ void dump_tree_to_file(const std::string& file = "") const;
+ std::string to_string_keys() const;
std::string visualize_tree_keys(const std::string& file) const;
uint64_t count_keys(bnodeid_t bnodeid) const;
@@ -202,8 +202,7 @@ class Btree {
uint64_t get_child_node_cnt(bnodeid_t bnodeid) const;
void to_string(bnodeid_t bnodeid, std::string& buf) const;
void to_string_keys(bnodeid_t bnodeid, std::string& buf) const;
- void to_dot_keys(bnodeid_t bnodeid, std::string& buf,
- std::map< uint32_t, std::vector< uint64_t > >& l_map,
+ void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map,
std::map< uint64_t, BtreeVisualizeVariables >& info_map) const;
void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp
index 4877b6701..6b10dbdd9 100644
--- a/src/include/homestore/btree/btree.ipp
+++ b/src/include/homestore/btree/btree.ipp
@@ -308,13 +308,13 @@ nlohmann::json Btree< K, V >::get_status(int log_level) const {
}
template < typename K, typename V >
-void Btree< K, V >::print_tree(const std::string& file) const {
+void Btree< K, V >::dump_tree_to_file(const std::string& file) const {
std::string buf;
m_btree_lock.lock_shared();
to_string(m_root_node_info.bnode_id(), buf);
m_btree_lock.unlock_shared();
- LOGINFO( "Pre order traversal of tree:\n<{}>", buf);
+ BT_LOG(INFO, "Pre order traversal of tree:\n<{}>", buf);
if (!file.empty()) {
std::ofstream o(file);
o.write(buf.c_str(), buf.size());
@@ -323,13 +323,13 @@ void Btree< K, V >::print_tree(const std::string& file) const {
}
template < typename K, typename V >
-void Btree< K, V >::print_tree_keys() const {
+std::string Btree< K, V >::to_string_keys() const {
std::string buf;
m_btree_lock.lock_shared();
to_string_keys(m_root_node_info.bnode_id(), buf);
m_btree_lock.unlock_shared();
- LOGINFO("Pre order traversal of tree:\n<{}>", buf);
+ return buf;
}
template < typename K, typename V >
diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
index 58844ba5c..0c5313eb2 100644
--- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
@@ -299,28 +299,31 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const
return ret;
}
template < typename K, typename V >
template < typename ReqT >
bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const {
    if (!node->is_leaf()) { // if internal node, size is at most one additional entry of size K/V
- return node->total_entries() >= 3;
+ return !node->has_room_for_put(btree_put_type::UPSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size());
} else if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) {
- return node->total_entries() >= 3;
+ return !node->has_room_for_put(req.m_put_type, req.first_key_size(), req.m_newval->serialized_size());
} else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) {
- return node->total_entries() >= 3;;
+ return !node->has_room_for_put(req.m_put_type, req.key().serialized_size(), req.value().serialized_size());
} else {
return false;
}
}
-//bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const {
-// if (!node->is_leaf()) { // if internal node, size is atmost one additional entry, size of K/V
-// return !node->has_room_for_put(btree_put_type::UPSERT, K::get_max_size(), BtreeLinkInfo::get_fixed_size());
-// } else if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) {
-// return !node->has_room_for_put(req.m_put_type, req.first_key_size(), req.m_newval->serialized_size());
-// } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) {
-// return !node->has_room_for_put(req.m_put_type, req.key().serialized_size(), req.value().serialized_size());
-// } else {
-// return false;
-// }
-//}
} // namespace homestore
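
The rewritten predicate delegates the split decision to the node itself: a split is needed exactly when `has_room_for_put` reports that the incoming key/value will not fit (interior nodes are sized pessimistically as a max-size key plus a fixed-size child link). A minimal standalone sketch of that contract, using a hypothetical fixed-entry node rather than HomeStore's actual VariantNode:

```cpp
#include <cstdint>

// Hypothetical node with a fixed entry budget; real HomeStore nodes compute
// room from byte-level occupancy (see SimpleNode::has_room_for_put below).
struct ToyNode {
    uint32_t entries{0};
    uint32_t capacity{128};
    bool has_room_for_put(uint32_t /*key_size*/, uint32_t /*value_size*/) const {
        return entries < capacity;
    }
};

// Split is needed exactly when the node reports it has no room left.
bool is_split_needed(ToyNode const& node, uint32_t key_size, uint32_t value_size) {
    return !node.has_room_for_put(key_size, value_size);
}
```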
diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp
index f1a95372f..b1c22ba9a 100644
--- a/src/include/homestore/btree/detail/btree_node.hpp
+++ b/src/include/homestore/btree/detail/btree_node.hpp
@@ -69,23 +69,24 @@ struct persistent_hdr_t {
persistent_hdr_t() : nentries{0}, leaf{0}, node_deleted{0} {}
std::string to_string() const {
- auto snext = (next_node == empty_bnodeid) ? "" : "next_node="+ std::to_string(next_node);
- auto sedge = (edge_info.m_bnodeid == empty_bnodeid) ? "" : "edge_nodeid="+ std::to_string(edge_info.m_bnodeid);
- auto sedgelink = (edge_info.m_bnodeid == empty_bnodeid) ? "" : "edge_link_version="+ std::to_string(edge_info.m_link_version);
- return fmt::format("magic={} version={} csum={} node_id={} {} nentries={} node_type={} is_leaf={} "
- "node_deleted={} node_gen={} modified_cp_id={} link_version={} {}, "
- "{} level={} ",
- magic, version, checksum, node_id, snext, nentries, node_type, leaf, node_deleted,
- node_gen, modified_cp_id, link_version, sedge, sedgelink,
- level);
+ auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node);
+ auto sedge = (edge_info.m_bnodeid == empty_bnodeid)
+ ? ""
+ : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version);
+ return fmt::format("magic={} version={} csum={} node_id={}{} nentries={} node_type={} is_leaf={} "
+ "node_deleted={} node_gen={} modified_cp_id={} link_version={}{} level={} ",
+ magic, version, checksum, node_id, snext, nentries, node_type, leaf, node_deleted, node_gen,
+ modified_cp_id, link_version, sedge, level);
}
std::string to_compact_string() const {
- auto snext = (next_node == empty_bnodeid) ? "" : "next="+ std::to_string(next_node);
- auto sedge = (edge_info.m_bnodeid == empty_bnodeid) ? "" : "edge_nodeid="+ std::to_string(edge_info.m_bnodeid);
- auto sleaf = leaf?"LEAF": "INTERIOR";
- return fmt::format(" id={} {}{} {} nentries={} {} level={} modified_cp_id={}", node_id, snext,sedge, sleaf, nentries,
- (node_deleted == 0x1) ? "Deleted" : "", level, modified_cp_id);
+ auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node);
+ auto sedge = (edge_info.m_bnodeid == empty_bnodeid)
+ ? ""
+ : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version);
+ return fmt::format("id={}{}{} {} level={} nentries={}{} mod_cp={}", node_id, snext, sedge,
+ leaf ? "LEAF" : "INTERIOR", level, nentries, (node_deleted == 0x1) ? " Deleted" : "",
+ modified_cp_id);
}
};
#pragma pack()
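
For reference, the reworked `to_compact_string` emits one line per node header. A minimal fmt reproduction with invented field values (node ids, level, and cp id here are illustrative only):

```cpp
#include <fmt/format.h>

int main() {
    // Mirrors the compact layout: id, optional next/edge, leaf-ness, level, entries, deleted flag, cp.
    std::string snext = fmt::format(" next={}", 43);
    std::string sedge = fmt::format(" edge={}.{}", 57, 2);
    fmt::print("id={}{}{} {} level={} nentries={}{} mod_cp={}\n", 42, snext, sedge, "INTERIOR", 1, 3, "", 17);
    // prints: id=42 next=43 edge=57.2 INTERIOR level=1 nentries=3 mod_cp=17
}
```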
diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp
index cb4e02291..97ea6a393 100644
--- a/src/include/homestore/btree/detail/simple_node.hpp
+++ b/src/include/homestore/btree/detail/simple_node.hpp
@@ -201,6 +201,9 @@ class SimpleNode : public VariantNode< K, V > {
}
bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override {
return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT))
? (get_available_entries() > 0)
: true;
@@ -214,7 +217,7 @@ class SimpleNode : public VariantNode< K, V > {
(this->is_leaf() ? "LEAF" : "INTERIOR"), snext);
if (!this->is_leaf() && (this->has_valid_edge())) {
auto sedge = this->has_valid_edge() ? "edge:" + std::to_string(this->edge_info().m_bnodeid) + "." +
- std::to_string(this->edge_info().m_link_version)
+ std::to_string(this->edge_info().m_link_version)
: "";
fmt::format_to(std::back_inserter(str), "{}", sedge);
}
@@ -229,71 +232,71 @@ class SimpleNode : public VariantNode< K, V > {
std::string to_string_keys(bool print_friendly = false) const override {
        // FIXME: Implement this, key may not be a uint32_t
-// return "";
-//#if 0
- std::string delimiter = print_friendly ? "\n" : "\t";
- std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
- auto str = fmt::format("{}{}.{} level:{} nEntries={} {} {} node_gen={} ",
- print_friendly ? "------------------------------------------------------------\n" : "",
- this->node_id(), this->link_version(), this->level(), this->total_entries(),
- (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, this->node_gen());
- if (!this->is_leaf() && (this->has_valid_edge())) {
- fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
- this->edge_info().m_link_version);
- }
- if (this->total_entries() == 0) {
- fmt::format_to(std::back_inserter(str), " [EMPTY] ");
- return str;
- }
- if (!this->is_leaf()) {
- fmt::format_to(std::back_inserter(str), " [");
- for (uint32_t i{0}; i < this->total_entries(); ++i) {
- uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key();
- BtreeLinkInfo child_info;
- get_nth_value(i, &child_info, false /* copy */);
- fmt::format_to(std::back_inserter(str), "{}.{} {}", cur_key, child_info.link_version(),
- i == this->total_entries() - 1 ? "" : ", ");
- }
- fmt::format_to(std::back_inserter(str), "]");
- return str;
- }
- uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key();
- uint32_t cur_key = prev_key;
- uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key();
- if (last_key - prev_key == this->total_entries() - 1) {
- if (this->total_entries() == 1)
- fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key);
- else
- fmt::format_to(std::back_inserter(str), "{}[{}-{}]", delimiter, prev_key, last_key);
- return str;
+ std::string delimiter = print_friendly ? "\n" : "\t";
+ std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
+ auto str = fmt::format("{}{}.{} level:{} nEntries={} {} {} node_gen={} ",
+ print_friendly ? "------------------------------------------------------------\n" : "",
+ this->node_id(), this->link_version(), this->level(), this->total_entries(),
+ (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, this->node_gen());
+ if (!this->is_leaf() && (this->has_valid_edge())) {
+ fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
+ this->edge_info().m_link_version);
+ }
+ if (this->total_entries() == 0) {
+ fmt::format_to(std::back_inserter(str), " [EMPTY] ");
+ return str;
+ }
+ if (!this->is_leaf()) {
+ fmt::format_to(std::back_inserter(str), " [");
+ for (uint32_t i{0}; i < this->total_entries(); ++i) {
+ uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key();
+ BtreeLinkInfo child_info;
+ get_nth_value(i, &child_info, false /* copy */);
+ fmt::format_to(std::back_inserter(str), "{}.{} {}", cur_key, child_info.link_version(),
+ i == this->total_entries() - 1 ? "" : ", ");
}
- fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key);
- uint32_t start_interval_key = prev_key;
- for (uint32_t i{1}; i < this->total_entries(); ++i) {
- cur_key = BtreeNode::get_nth_key< K >(i, false).key();
- if (cur_key != prev_key + 1) {
- if (start_interval_key == prev_key) {
- fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key);
- } else {
- fmt::format_to(std::back_inserter(str), "]{}{}- [{}", delimiter, i, cur_key);
- }
- start_interval_key = cur_key;
+ fmt::format_to(std::back_inserter(str), "]");
+ return str;
+ }
+ uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key();
+ uint32_t cur_key = prev_key;
+ uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key();
+ if (last_key - prev_key == this->total_entries() - 1) {
+ if (this->total_entries() == 1)
+ fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key);
+ else
+ fmt::format_to(std::back_inserter(str), "{}[{}-{}]", delimiter, prev_key, last_key);
+ return str;
+ }
+ fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key);
+ uint32_t start_interval_key = prev_key;
+ for (uint32_t i{1}; i < this->total_entries(); ++i) {
+ cur_key = BtreeNode::get_nth_key< K >(i, false).key();
+ if (cur_key != prev_key + 1) {
+ if (start_interval_key == prev_key) {
+ fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key);
+ } else {
+ fmt::format_to(std::back_inserter(str), "]{}{}- [{}", delimiter, i, cur_key);
}
- prev_key = cur_key;
+ start_interval_key = cur_key;
}
+ prev_key = cur_key;
+ }
- if (start_interval_key == prev_key) {
- fmt::format_to(std::back_inserter(str), "]");
- } else {
- fmt::format_to(std::back_inserter(str), "-{}]", cur_key);
- }
- return str;
-//#endif
+ if (start_interval_key == prev_key) {
+ fmt::format_to(std::back_inserter(str), "]");
+ } else {
+ fmt::format_to(std::back_inserter(str), "-{}]", cur_key);
+ }
+ return str;
}
std::string to_dot_keys() const override {
std::string str;
- std::string snext = this->next_bnode() == empty_bnodeid? "": fmt::format("next_node={}", this->next_bnode());
+ std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
str += fmt::format(R"("{}" [
shape = none,
labelloc="c",
@@ -317,11 +320,14 @@ class SimpleNode : public VariantNode< K, V > {
| {}.{} | )",
i, i, cur_key, child_info.link_version());
}
- std::string sedge = this->has_valid_edge() ? "edge:" + std::to_string( this->edge_info().m_bnodeid) + "." + std::to_string( this->edge_info().m_link_version):"";
+ std::string sedge = this->has_valid_edge() ? "edge:" + std::to_string(this->edge_info().m_bnodeid) + "." +
+ std::to_string(this->edge_info().m_link_version)
+ : "";
str += fmt::format(R"(
|
- {}.{} gen={} {} {} | >];)",this->total_entries(),this->node_id(), this->link_version(), this->node_gen(), snext, sedge );
-
+ {}.{} gen={} {} {} | >];)",
+ this->total_entries(), this->node_id(), this->link_version(), this->node_gen(), snext,
+ sedge);
} else {
std::string keys_buf = "";
diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp
index 09e944f18..85c2a304d 100644
--- a/src/include/homestore/index/index_internal.hpp
+++ b/src/include/homestore/index/index_internal.hpp
@@ -68,11 +68,11 @@ class IndexTableBase {
public:
virtual ~IndexTableBase() = default;
virtual uuid_t uuid() const = 0;
+ virtual void recovery_completed() = 0;
virtual uint32_t ordinal() const = 0;
virtual uint64_t used_size() const = 0;
virtual void destroy() = 0;
virtual void repair_node(IndexBufferPtr const& buf) = 0;
- virtual void repair_root(IndexBufferPtr const& buf) = 0;
};
enum class index_buf_state_t : uint8_t {
diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index 50b69ac0d..2bec275e3 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -60,14 +60,20 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
-        // After recovery, we see that root node is empty, which means that after btree is created, we crashed.
-        // So create new root node, which is essential for btree to function.
+ if (m_sb->root_node != empty_bnodeid) {
+ this->set_root_node_info(BtreeLinkInfo{m_sb->root_node, m_sb->root_link_version});
+ }
+ }
+
+ void recovery_completed() override {
if (m_sb->root_node == empty_bnodeid) {
+ // After recovery, we see that root node is empty, which means that after btree is created, we crashed.
+ // So create new root node, which is essential for btree to function.
auto cp = hs()->cp_mgr().cp_guard();
auto const status = this->create_root_node((void*)cp.context(cp_consumer_t::INDEX_SVC));
if (status != btree_status_t::success) {
throw std::runtime_error(fmt::format("Unable to create root node"));
}
- } else {
- this->set_root_node_info(BtreeLinkInfo{m_sb->root_node, m_sb->root_link_version});
}
}
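
The constructor now only attaches a root that already exists in the superblock; creating a missing root is deferred to `recovery_completed()`, i.e. until after the write-back cache has replayed all nodes, so the new root's blk cannot clash with blks that recovery is still about to commit. A minimal sketch of this two-phase pattern, with hypothetical stand-in types:

```cpp
#include <cstdint>
#include <stdexcept>

using bnodeid_t = uint64_t;
constexpr bnodeid_t empty_bnodeid = UINT64_MAX;

class ToyIndexTable {
    bnodeid_t m_sb_root;             // root recorded in the superblock
    bnodeid_t m_root{empty_bnodeid}; // in-memory root

public:
    explicit ToyIndexTable(bnodeid_t sb_root) : m_sb_root{sb_root} {
        // Phase 1 (construction): attach an existing root eagerly.
        if (m_sb_root != empty_bnodeid) { m_root = m_sb_root; }
    }

    // Phase 2 (after wb-cache recovery): only now is it safe to allocate a root.
    void recovery_completed() {
        if (m_sb_root == empty_bnodeid) {
            m_root = create_root_node();
            if (m_root == empty_bnodeid) { throw std::runtime_error("Unable to create root node"); }
        }
    }

private:
    bnodeid_t create_root_node() { return 1; } // stand-in for the real allocation
};
```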
@@ -108,26 +114,30 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}
void repair_node(IndexBufferPtr const& idx_buf) override {
+ if (idx_buf->is_meta_buf()) {
+            // We cannot repair the meta buf on its own; we need to repair the root node, which in turn modifies
+            // the meta_buf. It is ok to ignore this call, because the root is repaired before any repair of the
+            // meta_buf is attempted, and that repair will already have updated the meta_buf.
+ return;
+ }
BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */,
BtreeNode::identify_leaf_node(idx_buf->raw_buffer()));
static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf);
auto cpg = cp_mgr().cp_guard();
- //m_dirtied_cp_id was set in -1. It is time to make it dirty
+
+        // Set the dirtied cp_id to the current cp. Repair doesn't go through get_writable_buf (unlike the regular
+        // IO path), so we set it here so that other code paths see the correct cp.
idx_buf->m_dirtied_cp_id = cpg->id();
- LOGTRACEMOD(wbcache, "repair_node cp {} {}", cpg->id(), idx_buf->to_string());
- repair_links(BtreeNodePtr{n}, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
- }
+ BtreeNodePtr bn = BtreeNodePtr{n};
- void repair_root(IndexBufferPtr const& root_buf) override {
- BtreeNode* n = this->init_node(root_buf->raw_buffer(), root_buf->blkid().to_integer(), false /* init_buf */,
- BtreeNode::identify_leaf_node(root_buf->raw_buffer()));
- static_cast< IndexBtreeNode* >(n)->attach_buf(root_buf);
- auto cpg = cp_mgr().cp_guard();
- //m_dirtied_cp_id was set in -1. It is time to make it dirty
- root_buf->m_dirtied_cp_id = cpg->id();
- LOGTRACEMOD(wbcache, "repair_root cp {} {}", cpg->id(), root_buf->to_string());
+ LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
+ repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
- on_root_changed(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
+ if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) {
+            // Our up buffer is a meta buffer, which means we are the new root node, so we need to update the
+            // meta_buf with the new root as well
+ on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
+ }
}
protected:
@@ -144,17 +154,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
auto cp_ctx = r_cast< CPContext* >(context);
auto idx_node = static_cast< IndexBtreeNode* >(node.get());
+ node->set_checksum();
auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY);
if (prev_state == index_buf_state_t::CLEAN) {
// It was clean before, dirtying it first time, add it to the wb_cache list to flush
- if(idx_node->m_idx_buf->m_dirtied_cp_id !=-1) {
+ if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) {
BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(),
"Writing a node which was not acquired by this cp");
}
- wb_cache().write_buf(node, idx_node->m_idx_buf, cp_ctx);
- node->set_checksum();
node->set_modified_cp_id(cp_ctx->id());
- LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), idx_node->m_idx_buf->to_string());
+ wb_cache().write_buf(node, idx_node->m_idx_buf, cp_ctx);
} else {
BT_DBG_ASSERT_NE(
(int)prev_state, (int)index_buf_state_t::FLUSHING,
@@ -168,25 +177,25 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node,
void* context) override {
CPContext* cp_ctx = r_cast< CPContext* >(context);
- auto& left_child_buf = static_cast< IndexBtreeNode* >(left_child_node.get())->m_idx_buf;
- auto& parent_buf = static_cast< IndexBtreeNode* >(parent_node.get())->m_idx_buf;
IndexBufferPtrList new_node_bufs;
for (const auto& right_child_node : new_nodes) {
write_node_impl(right_child_node, context);
- new_node_bufs.push_back(static_cast< IndexBtreeNode* >(right_child_node.get())->m_idx_buf);
+ new_node_bufs.push_back(s_cast< IndexBtreeNode* >(right_child_node.get())->m_idx_buf);
}
write_node_impl(left_child_node, context);
// during recovery it is possible that there is no parent_node
- if (parent_node.get() != nullptr) {write_node_impl(parent_node, context); }
+ if (parent_node.get() != nullptr) { write_node_impl(parent_node, context); }
IndexBufferPtrList freed_node_bufs;
for (const auto& freed_node : freed_nodes) {
- freed_node_bufs.push_back(static_cast< IndexBtreeNode* >(freed_node.get())->m_idx_buf);
+ freed_node_bufs.push_back(s_cast< IndexBtreeNode* >(freed_node.get())->m_idx_buf);
this->free_node(freed_node, locktype_t::WRITE, context);
}
- wb_cache().transact_bufs(ordinal(), parent_node.get() ? parent_buf : nullptr, left_child_buf, new_node_bufs, freed_node_bufs, cp_ctx);
+ wb_cache().transact_bufs(
+ ordinal(), parent_node.get() ? s_cast< IndexBtreeNode* >(parent_node.get())->m_idx_buf : nullptr,
+ s_cast< IndexBtreeNode* >(left_child_node.get())->m_idx_buf, new_node_bufs, freed_node_bufs, cp_ctx);
return btree_status_t::success;
}
@@ -238,6 +247,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
parent_node->node_id());
return btree_status_t::not_found;
}
+ BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(),
+ last_parent_key.to_string());
// Get the first child node and its link info
BtreeLinkInfo child_info;
@@ -261,15 +272,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
auto cur_parent = parent_node;
BtreeNodeList new_parent_nodes;
do {
- if (child_node->has_valid_edge()) {
+ if (child_node->has_valid_edge() ||
+ (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) {
BT_DBG_ASSERT(is_parent_edge_node,
"Child node={} is an edge node but parent_node={} is not an edge node",
- child_node->node_id(), parent_node->node_id());
+ child_node->node_id(), cur_parent->node_id());
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
break;
}
auto const child_last_key = child_node->get_last_key< K >();
+ BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(),
+ child_node->to_string(), child_last_key.to_string());
+
if (child_last_key.compare(last_parent_key) > 0) {
// We have reached the last key, we can stop now
break;
@@ -297,14 +312,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
cur_parent->insert(cur_parent->total_entries(), child_last_key,
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
+ BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string());
+
// Move to the next child node
this->unlock_node(child_node, locktype_t::READ);
auto const next_node_id = child_node->next_bnode();
if (next_node_id == empty_bnodeid) {
- BT_LOG_ASSERT(
- false,
- "Child node={} next_node_id is empty, while its not a edge node, parent_node={} repair is partial",
- child_node->node_id(), parent_node->node_id());
+ BT_LOG_ASSERT(false,
+ "Child node={} next_node_id is empty, while its not a edge node, parent_node={} "
+ "repair is partial",
+ child_node->node_id(), parent_node->node_id());
ret = btree_status_t::not_found;
break;
}
@@ -323,12 +340,12 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}
if (ret != btree_status_t::success) {
- BT_LOG(DEBUG, "An error occurred status={} during repair of parent_node={}, aborting the repair",
+ BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair",
enum_name(ret), parent_node->node_id());
std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size());
}
- delete tmp_buffer;
+ delete[] tmp_buffer;
return ret;
}
};
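
Conceptually, `repair_links` rebuilds a parent from its children's sibling chain: walk from the first child, append a `(child_last_key, child_link)` entry per child, and stop once an edge/last child is reached or the child's last key passes the parent's previous last key. A condensed standalone sketch of that loop, with hypothetical structures and none of the locking, logging, or CP plumbing:

```cpp
#include <cstdint>
#include <map>
#include <vector>

using bnodeid_t = uint64_t;
constexpr bnodeid_t empty_bnodeid = UINT64_MAX;

struct ToyChild {
    bnodeid_t id;
    bnodeid_t next{empty_bnodeid}; // sibling link
    uint64_t last_key{0};
    bool is_edge{false};
};

// Returns the (key -> child id) entries the repaired parent should contain.
std::map< uint64_t, bnodeid_t > repair_links_sketch(std::vector< ToyChild > const& chain,
                                                    uint64_t last_parent_key) {
    std::map< uint64_t, bnodeid_t > parent_entries;
    for (auto const& child : chain) {
        if (child.is_edge || child.next == empty_bnodeid) { break; } // becomes the parent's edge link
        if (child.last_key > last_parent_key) { break; }             // walked past the parent's old range
        parent_entries.emplace(child.last_key, child.id);
    }
    return parent_entries;
}
```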
diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp
index cfd5bd5f7..fb776eb6b 100644
--- a/src/include/homestore/index_service.hpp
+++ b/src/include/homestore/index_service.hpp
@@ -47,6 +47,7 @@ class IndexService {
std::shared_ptr< VirtualDev > m_vdev;
std::pair< meta_blk*, sisl::byte_view > m_wbcache_sb{
std::pair< meta_blk*, sisl::byte_view >{nullptr, sisl::byte_view{}}};
+ std::vector< std::pair< meta_blk*, sisl::byte_view > > m_itable_sbs;
std::unique_ptr< sisl::IDReserver > m_ordinal_reserver;
mutable std::mutex m_index_map_mtx;
@@ -80,12 +81,8 @@ class IndexService {
uint64_t used_size() const;
uint32_t node_size() const;
void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf);
- void repair_index_root(uint32_t ordinal, IndexBufferPtr const& root_buf);
IndexWBCacheBase& wb_cache() { return *m_wb_cache; }
-
-private:
- void itable_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie);
};
extern IndexService& index_service();
diff --git a/src/lib/blkalloc/fixed_blk_allocator.cpp b/src/lib/blkalloc/fixed_blk_allocator.cpp
index c609b24ee..9df3a07ca 100644
--- a/src/lib/blkalloc/fixed_blk_allocator.cpp
+++ b/src/lib/blkalloc/fixed_blk_allocator.cpp
@@ -57,9 +57,20 @@ BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_
#ifdef _PRERELEASE
if (iomgr_flip::instance()->test_flip("fixed_blkalloc_no_blks")) { return BlkAllocStatus::SPACE_FULL; }
#endif
+retry:
blk_num_t blk_num;
if (!m_free_blk_q.read(blk_num)) { return BlkAllocStatus::SPACE_FULL; }
+ if (m_state != state_t::ACTIVE) {
+        // We are not in active state, which means we must be recovering. During recovery we shouldn't allocate any
+        // blk that replay has already committed, so remove it from the free blk q and retry with another blk
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if ((m_state == state_t::RECOVERING) && (m_reserved_blks.find(blk_num) != m_reserved_blks.end())) {
+            m_reserved_blks.erase(blk_num); // This blk can be removed from reserved, because it's no longer free
+                                            // (since we also removed it from the free_blk q)
+ goto retry;
+ }
+ }
out_blkid = BlkId{blk_num, 1, m_chunk_id};
return BlkAllocStatus::SUCCESS;
}
@@ -67,26 +78,25 @@ BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_
BlkAllocStatus FixedBlkAllocator::alloc_contiguous(BlkId& out_blkid) { return alloc(1, {}, out_blkid); }
BlkAllocStatus FixedBlkAllocator::reserve_on_cache(BlkId const& b) {
- std::lock_guard lg(m_mark_blk_mtx);
- if (m_state == state_t::RECOVERING) { m_marked_blks.insert(b.blk_num()); }
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if (m_state == state_t::RECOVERING) { m_reserved_blks.insert(b.blk_num()); }
return BlkAllocStatus::SUCCESS;
}
void FixedBlkAllocator::recovery_completed() {
- std::lock_guard lg(m_mark_blk_mtx);
- if (!m_marked_blks.empty()) {
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if (!m_reserved_blks.empty()) {
auto const count = available_blks();
- for (uint64_t i{0}; ((i < count) && !m_marked_blks.empty()); ++i) {
+ for (uint64_t i{0}; ((i < count) && !m_reserved_blks.empty()); ++i) {
blk_num_t blk_num;
if (!m_free_blk_q.read(blk_num)) { break; }
- if (m_marked_blks.find(blk_num) != m_marked_blks.end()) {
- m_marked_blks.erase(blk_num); // This blk needs to be skipped
- } else {
+ if (m_reserved_blks.find(blk_num) == m_reserved_blks.end()) {
-                m_free_blk_q.write(blk_num); // This blk is not marked, put it back at the end of queue
+                m_free_blk_q.write(blk_num); // This blk is not reserved, put it back at the end of the queue
}
}
- HS_DBG_ASSERT(m_marked_blks.empty(), "All marked blks should have been removed from free list");
+ // We no longer need any reserved blks, since all of them are now marked as allocated
+ m_reserved_blks.clear();
}
m_state = state_t::ACTIVE;
}
@@ -97,6 +107,13 @@ void FixedBlkAllocator::free(BlkId const& b) {
const auto pushed = m_free_blk_q.write(b.blk_num());
HS_DBG_ASSERT_EQ(pushed, true, "Expected to be able to push the blk on fixed capacity Q");
+ if (m_state != state_t::ACTIVE) {
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if (m_state == state_t::RECOVERING) {
+            // If we are in recovering state, freeing a blk removes it from the reserved set as well.
+ m_reserved_blks.erase(b.blk_num());
+ }
+ }
if (is_persistent()) { free_on_disk(b); }
}
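
The allocator's recovery protocol in one picture: while RECOVERING, blks that replay proves committed are reserved so `alloc()` never hands them out; a replayed free un-reserves; `recovery_completed()` finally drops whatever is still reserved from the free queue and flips to ACTIVE. A single-threaded toy model of that protocol (hypothetical; the real allocator uses an MPMC queue plus `m_reserve_blk_mtx`):

```cpp
#include <cstdint>
#include <deque>
#include <unordered_set>

class ToyFixedAllocator {
    enum class state_t { RECOVERING, ACTIVE };
    state_t m_state{state_t::RECOVERING};
    std::deque< uint32_t > m_free_q;
    std::unordered_set< uint32_t > m_reserved;

public:
    explicit ToyFixedAllocator(uint32_t nblks) {
        for (uint32_t b{0}; b < nblks; ++b) { m_free_q.push_back(b); }
    }

    // Replay found this blk committed: alloc() must never hand it out.
    void reserve_on_cache(uint32_t blk) {
        if (m_state == state_t::RECOVERING) { m_reserved.insert(blk); }
    }

    bool alloc(uint32_t& out_blk) {
        while (!m_free_q.empty()) {
            uint32_t const blk = m_free_q.front();
            m_free_q.pop_front();
            if ((m_state == state_t::RECOVERING) && m_reserved.erase(blk)) {
                continue; // reserved during recovery: skip it and retry another blk
            }
            out_blk = blk;
            return true;
        }
        return false; // SPACE_FULL
    }

    void free(uint32_t blk) {
        m_free_q.push_back(blk);
        if (m_state == state_t::RECOVERING) { m_reserved.erase(blk); } // freed, so no longer reserved
    }

    void recovery_completed() {
        // Purge any still-reserved blks from the free queue, then go ACTIVE.
        std::deque< uint32_t > kept;
        for (uint32_t const blk : m_free_q) {
            if (!m_reserved.erase(blk)) { kept.push_back(blk); }
        }
        m_free_q.swap(kept);
        m_reserved.clear();
        m_state = state_t::ACTIVE;
    }
};
```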
diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h
index 0e7179f5d..fa28681f2 100644
--- a/src/lib/blkalloc/fixed_blk_allocator.h
+++ b/src/lib/blkalloc/fixed_blk_allocator.h
@@ -51,8 +51,8 @@ class FixedBlkAllocator : public BitmapBlkAllocator {
enum class state_t : uint8_t { RECOVERING, ACTIVE };
state_t m_state{state_t::RECOVERING};
- std::unordered_set< blk_num_t > m_marked_blks; // Keep track of all blks which are marked as allocated
- std::mutex m_mark_blk_mtx; // Mutex used while removing marked_blks from blk_q
+ std::unordered_set< blk_num_t > m_reserved_blks; // Keep track of all blks which are reserved as allocated
+    std::mutex m_reserve_blk_mtx;                    // Mutex used while removing reserved blks from blk_q
folly::MPMCQueue< blk_num_t > m_free_blk_q;
};
} // namespace homestore
diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp
index 0d2712db5..cc199bbd5 100644
--- a/src/lib/index/index_service.cpp
+++ b/src/lib/index/index_service.cpp
@@ -33,7 +33,7 @@ IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs) : m_svc
meta_service().register_handler(
"index",
[this](meta_blk* mblk, sisl::byte_view buf, size_t size) {
- itable_meta_blk_found(std::move(buf), voidptr_cast(mblk));
+ m_itable_sbs.emplace_back(std::pair{mblk, std::move(buf)});
},
nullptr);
@@ -67,25 +67,26 @@ shared< VirtualDev > IndexService::open_vdev(const vdev_info& vinfo, bool load_e
uint32_t IndexService::reserve_ordinal() { return m_ordinal_reserver->reserve(); }
-void IndexService::itable_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie) {
- // We have found an index table superblock. Notify the callback which should convert the superblock into actual
- // IndexTable instance. This callback will be called twice, first time when it boots before IndexService starts and
- // once as part of IndexService start, will read all metablks and then call this. We process everything after
- // IndexService has been started, because writeback cache would have recovered all nodes by now.
-// At this moment: m_wb_cache is nullptr but add_index_table to populate the table, ow, recovery path has no table to recover and repair_node does nothing
- superblk< index_table_sb > sb;
- sb.load(buf, meta_cookie);
- add_index_table(m_svc_cbs->on_index_table_found(std::move(sb)));
-}
-
void IndexService::start() {
// Start Writeback cache
m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, m_wbcache_sb, hs()->evictor(),
hs()->device_mgr()->atomic_page_size(HSDevType::Fast));
+
+    // Load any index tables which are to be loaded from the meta blk
+ for (auto const& [meta_cookie, buf] : m_itable_sbs) {
+ superblk< index_table_sb > sb;
+ sb.load(buf, meta_cookie);
+ add_index_table(m_svc_cbs->on_index_table_found(std::move(sb)));
+ }
+
+    // Recover the writeback cache, which in turn recovers any index table nodes
m_wb_cache->recover(m_wbcache_sb.second);
- // Read the index metablks again
- meta_service().read_sub_sb("index");
+ // Notify each table that we have completed recovery
+ std::unique_lock lg(m_index_map_mtx);
+ for (const auto& [_, tbl] : m_index_map) {
+ tbl->recovery_completed();
+ }
}
void IndexService::stop() { m_wb_cache.reset(); }
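
The meta-blk callback no longer constructs tables inline; it just queues the `(meta_blk*, byte_view)` pairs, and `start()` drains that queue once the write-back cache exists, then runs recovery and finally notifies each table. A minimal sketch of that deferral pattern, with hypothetical types in place of the meta service machinery:

```cpp
#include <functional>
#include <utility>
#include <vector>

struct ToyService {
    using Cookie = void*;
    using Buf = std::vector< unsigned char >;
    std::vector< std::pair< Cookie, Buf > > m_sbs; // superblocks found before start()

    // Registered with the meta service; may fire before dependencies are up.
    void on_meta_blk_found(Cookie cookie, Buf buf) {
        m_sbs.emplace_back(cookie, std::move(buf)); // defer, don't construct yet
    }

    void start(std::function< void(Cookie, Buf const&) > const& load_table) {
        // The wb-cache equivalent would be constructed here, so loading is now safe.
        for (auto const& [cookie, buf] : m_sbs) { load_table(cookie, buf); }
        m_sbs.clear();
        // ...then recover, then notify recovery_completed() on each table...
    }
};
```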
@@ -116,12 +117,11 @@ std::shared_ptr< IndexTableBase > IndexService::get_index_table(uint32_t ordinal
void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf) {
auto tbl = get_index_table(ordinal);
- if (tbl) { tbl->repair_node(node_buf); }
-}
-
-void IndexService::repair_index_root(uint32_t ordinal, IndexBufferPtr const& root_buf) {
- auto tbl = get_index_table(ordinal);
- if (tbl) { tbl->repair_root(root_buf); }
+ if (tbl) {
+ tbl->repair_node(node_buf);
+ } else {
+ HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", ordinal);
+ }
}
uint32_t IndexService::node_size() const { return m_vdev->atomic_page_size(); }
@@ -152,20 +152,22 @@ std::string IndexBuffer::to_string() const {
m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed);
} else {
// store m_down_buffers in a string
-
- std::string down_bufs="";
+ std::string down_bufs = "";
#ifndef NDEBUG
for (auto const& down_buf : m_down_buffers) {
if (auto ptr = down_buf.lock()) {
- fmt::format_to(std::back_inserter(down_bufs), " [{}]", voidptr_cast(ptr.get()));
+ fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get()));
}
}
#endif
- return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={} {} up_buffer={} node=[{}] down_buffers=[{}]",
+ return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} up={} node=[{}] down=[{}]",
voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()),
- m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed ? "Freed" : "",
- voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())), (m_bytes == nullptr) ? " not attached yet" :r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(), down_bufs);
+ m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(),
+ m_node_freed ? " Freed" : "", voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())),
+ (m_bytes == nullptr) ? "not attached yet"
+ : r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(),
+ down_bufs);
}
}
std::string IndexBuffer::to_string_dot() const {
@@ -173,12 +175,10 @@ std::string IndexBuffer::to_string_dot() const {
if (m_bytes == nullptr) {
fmt::format_to(std::back_inserter(str), " node_buf=nullptr ");
} else {
-// fmt::format_to(std::back_inserter(str), " node_buf={}", static_cast< void* >(m_node_buf->m_bytes));
fmt::format_to(std::back_inserter(str), " node_buf={} {} created/dirtied={}/{} {} down_wait#={}",
- static_cast< void* >(m_bytes), m_is_meta_buf?"[META]":"", m_created_cp_id, m_dirtied_cp_id, m_node_freed?"FREED":"", m_wait_for_down_buffers.get());
+ static_cast< void* >(m_bytes), m_is_meta_buf ? "[META]" : "", m_created_cp_id, m_dirtied_cp_id,
+ m_node_freed ? "FREED" : "", m_wait_for_down_buffers.get());
}
-    // fmt::format_to(std::back_inserter(str), "\n next_buffer={}",
- // m_next_buffer.lock() ? reinterpret_cast< void* >(m_next_buffer.lock().get()) : 0);
return str;
}
diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index b5e32e00c..6c0c722ae 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -101,9 +101,11 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) {
idx_buf->m_dirtied_cp_id = cpg->id();
auto node = node_initializer(idx_buf);
- // Add the node to the cache
- bool done = m_cache.insert(node);
- HS_REL_ASSERT_EQ(done, true, "Unable to add alloc'd node to cache, low memory or duplicate inserts?");
+ if (!m_in_recovery) {
+ // Add the node to the cache. Skip if we are in recovery mode.
+ bool done = m_cache.insert(node);
+ HS_REL_ASSERT_EQ(done, true, "Unable to add alloc'd node to cache, low memory or duplicate inserts?");
+ }
// The entire index is updated in the commit path, so we alloc the blk and commit them right away
auto alloc_status = m_vdev->commit_blk(blkid);
@@ -114,9 +116,19 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) {
void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) {
// TODO upsert always returns false even if it succeeds.
- if (node != nullptr) { m_cache.upsert(node); }
- r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf);
- resource_mgr().inc_dirty_buf_size(m_node_size);
+ if (m_in_recovery) {
+ if (buf->is_meta_buf()) {
+ auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb;
+ meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk());
+ } else {
+ m_vdev->sync_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid);
+ }
+ } else {
+ if (node != nullptr) { m_cache.upsert(node); }
+ LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string());
+ r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf);
+ resource_mgr().inc_dirty_buf_size(m_node_size);
+ }
}
void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) {
@@ -124,7 +136,7 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t
retry:
// Check if the blkid is already in cache, if not load and put it into the cache
- if (m_cache.get(blkid, node)) { return; }
+ if (!m_in_recovery && m_cache.get(blkid, node)) { return; }
// Read the buffer from virtual device
auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size());
@@ -134,10 +146,12 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t
node = node_initializer(idx_buf);
// Push the node into cache
- bool done = m_cache.insert(node);
- if (!done) {
- // There is a race between 2 concurrent reads from vdev and other party won the race. Re-read from cache
- goto retry;
+ if (!m_in_recovery) {
+ bool done = m_cache.insert(node);
+ if (!done) {
+ // There is a race between 2 concurrent reads from vdev and other party won the race. Re-read from cache
+ goto retry;
+ }
}
}
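
During recovery the node cache is bypassed on all three paths (alloc, read, free): reads always come from the vdev and nothing is inserted or removed, so repaired buffers can never alias a stale cached node. A tiny sketch of the read-path gating under that assumption (hypothetical cache/vdev interfaces):

```cpp
#include <cstdint>

// Hypothetical interfaces, mirroring the m_in_recovery checks above.
template < typename Cache, typename VDev, typename Node >
void read_node(Cache& cache, VDev& vdev, uint64_t blkid, Node& node, bool in_recovery) {
    if (!in_recovery && cache.get(blkid, node)) { return; } // normal path: cache hit
    node = vdev.read(blkid);                                // recovery: always re-read from disk
    if (!in_recovery) { cache.insert(node); }               // and never populate the cache
}
```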
@@ -258,7 +272,7 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p
if (new_node_bufs.empty() && freed_node_bufs.empty()) {
// This is an update for meta, root transaction.
- if(child_buf->m_created_cp_id != -1) {
+ if (child_buf->m_created_cp_id != -1) {
DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(),
"Root buffer is not created by current cp (for split root), its not expected");
}
@@ -293,7 +307,7 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
if (up_buf->m_created_cp_id == icp_ctx->id()) {
real_up_buf = up_buf->m_up_buffer;
HS_DBG_ASSERT(real_up_buf,
- "Up buffer is newly created in this cp, but it doesn't have its own up_buffer, its not expected");
+ "Up buffer is newly created in this cp, but it doesn't have its own up_buffer, its not expected");
}
// Condition 2: If down_buf already has an up_buf, we can override it newly passed up_buf it only in case of
@@ -325,7 +339,7 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
// This link is acheived by unconditionally changing the link in case of is_sibling=true to passed up_buf, but
// conditionally do it in case of parent link where it already has a link don't override it.
if (down_buf->m_up_buffer != nullptr) {
- HS_DBG_ASSERT_LT(down_buf->m_up_buffer->m_created_cp_id, icp_ctx->id(),
+ HS_DBG_ASSERT_LT(down_buf->m_up_buffer->m_created_cp_id, icp_ctx->id(),
"down_buf=[{}] up_buffer=[{}] should never have been created on same cp",
down_buf->to_string(), down_buf->m_up_buffer->to_string());
@@ -334,8 +348,6 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
real_up_buf = down_buf->m_up_buffer;
HS_DBG_ASSERT(!real_up_buf->m_wait_for_down_buffers.testz(),
"Up buffer waiting count is zero, whereas down buf is already linked to up buf");
- if(!(real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()))
- { icp_ctx->to_string_dot ("crash5.dot"); }
HS_DBG_ASSERT((real_up_buf->m_dirtied_cp_id == down_buf->m_dirtied_cp_id) || (real_up_buf->is_meta_buf()),
"Up buffer is not modified by current cp, but down buffer is linked to it");
#ifndef NDEBUG
@@ -362,8 +374,10 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) {
BtreeNodePtr node;
- bool done = m_cache.remove(buf->m_blkid, node);
- HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?");
+ if (!m_in_recovery) {
+ bool done = m_cache.remove(buf->m_blkid, node);
+ HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?");
+ }
resource_mgr().inc_free_blk(m_node_size);
m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx));
@@ -377,95 +391,92 @@ void IndexWBCache::recover(sisl::byte_view sb) {
return;
}
+    m_in_recovery = true; // For the entirety of this call, mark that we are in recovery.
+
// Recover the CP Context with the buf_map of all the buffers that were dirtied in the last cp with its
// relationship (up/down buf links) as it was by the cp that was flushing the buffers prior to unclean shutdown.
auto cpg = cp_mgr().cp_guard();
auto icp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC));
std::map< BlkId, IndexBufferPtr > bufs = icp_ctx->recover(std::move(sb));
- // With all the buffers recovered, we first make the decision of which blk to keep and which blk to free. This
- // is needed so that subsequent repair can do blk allocation and shouldn't incorrectly allocate the blks which
- // are going to be committed later.
- std::vector< IndexBufferPtr > new_bufs;
+ LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... ", icp_ctx->id(),
+ bufs.size());
+
+    // At this point, we have the DAG structure (up/down dependency graph) exactly as it was prior to the crash,
+    // with one addition: all freed buffers are also put in the DAG structure.
+    //
+    // We do repair/recovery in 2 passes. At first glance it may look like we don't need 2 passes over all the
+    // buffers, but it is essential.
+    //
+    // In the first pass, we look for any new bufs and any freed bufs and commit/free their corresponding node
+    // blkids. This has to be done before doing any repair, because repair can allocate blkids and we must not hand
+    // out a blkid that clashes with one committed later in the buf list.
+    //
+    // In the second pass, we take only the new nodes/bufs and then repair their up buffers, if needed.
+ std::vector< IndexBufferPtr > l0_bufs;
for (auto const& [_, buf] : bufs) {
- if (buf->m_node_freed) {
- // If the node was freed according txn records, we need to check if up_buf node was also written
- if (was_node_committed(buf->m_up_buffer)) {
- // Up buffer was written, so this buffer can be freed and thus can free the blk.
- m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx));
- }
- buf->m_up_buffer->m_wait_for_down_buffers.decrement();
- } else if (buf->m_created_cp_id == icp_ctx->id()) {
- if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) {
- // The buffer was newly created on this cp, and its up_buffer is also written, we need to commit
- // this blk again, so that it will not be reallocated for other node during repair.
- m_vdev->commit_blk(buf->m_blkid);
- new_bufs.push_back(buf);
- }else{
- // In th recovery path, an uncommited node has a set of commited down bufs and also a set of uncommited down bufs. In repair a node must wait ONLY for nodes that are the part of recovery path. So here, we corrected the down wait of up bufs for unreached path
- if(was_node_committed(buf) && !was_node_committed(buf->m_up_buffer)){
- LOGINFO(" \n\t\t Mehdi: buf {} was commited but up_buffer {} was not commited, hence discarded ", voidptr_cast(buf.get()), voidptr_cast(buf->m_up_buffer.get()));
- buf->m_up_buffer->m_wait_for_down_buffers.decrement(1);
+ if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) {
+ if (was_node_committed(buf)) {
+ if (was_node_committed(buf->m_up_buffer)) {
+ if (buf->m_node_freed) {
+ // Up buffer was written, so this buffer can be freed and thus can free the blk.
+ m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx));
+ } else {
+ m_vdev->commit_blk(buf->m_blkid);
+ }
+ l0_bufs.push_back(buf);
+ } else {
+ buf->m_up_buffer->m_wait_for_down_buffers.decrement();
}
}
}
}
-#if 0
- // I keep it here to see the down_waits are set correctly for up buffers
- // list of all recovered bufs
- std::string log = "\n\n\t\t\t\t\t recovered bufs (#of bufs = " + std::to_string(bufs.size()) +" ) \n";
+ LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}",
+ l0_bufs.size(), bufs.size(), icp_ctx->id());
+
+ auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs,
+ std::vector< IndexBufferPtr > const& l0_bufs) {
+        // Log to verify that down_waits are set correctly on up buffers; lists all recovered bufs
+ std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size());
for (auto const& [_, buf] : bufs) {
fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string());
}
- LOGINFO("{}",log);
- // list of new_bufs
- std::string log2 = "\n\n\t\t\t\t\t new_bufs (#of new bufs " + std::to_string(new_bufs.size()) + " )\n";
- for (auto const& buf : new_bufs) {
- fmt::format_to(std::back_inserter(log2), "{}\n", buf->to_string());
+
+ // list of new_bufs
+ fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size());
+ for (auto const& buf : l0_bufs) {
+ fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string());
}
- LOGINFO("{}", log2);
-#endif
+ return log;
+ };
+ LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs));
-    // Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be
-    // repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in
-    // do_repair flag.
+    // In the second pass, we start from the lowest-level buffers (l0_bufs) and walk up their up_buffer chains,
+    // repairing any up buffer that was not committed. The l0 buffers themselves are sibling nodes and never need
+    // repair, so the walk starts from each buffer's up_buffer.
- for (auto const& buf : new_bufs) {
- process_up_buf(buf->m_up_buffer, false /* do_repair */);
+ for (auto const& buf : l0_bufs) {
+ recover_buf(buf->m_up_buffer);
}
+ m_in_recovery = false;
m_vdev->recovery_completed();
}
-void IndexWBCache::process_up_buf(IndexBufferPtr const& buf, bool do_repair) {
- if (do_repair) {
- // If the buffer needs to be repaired, reset the dirtied_cp_id so once all down buffers are done, it does
- // repair
- buf->m_dirtied_cp_id = -1;
- }
+void IndexWBCache::recover_buf(IndexBufferPtr const& buf) {
if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; }
- // One of the down buffer indicated that it has to repair our node, so we issue a repair
- if (buf->m_dirtied_cp_id == -1) {
- if(buf->m_up_buffer->is_meta_buf()){
- LOGTRACEMOD(wbcache, "repair_index_root for buf {}", buf->to_string());
- index_service().repair_index_root(buf->m_index_ordinal, buf);
- }else
- {
- LOGTRACEMOD(wbcache, "repair_index_node for buf {}", buf->to_string());
- index_service().repair_index_node(buf->m_index_ordinal, buf);
- }
+    // All down buffers have completed and signaled that they are committed. If this buffer itself was not
+    // committed, we need to repair this node/buffer. After that we keep walking up to the next level, repairing
+    // as needed
+ if (!was_node_committed(buf)) {
+ LOGDEBUGMOD(wbcache, "Index Recovery detected uncommitted up node [{}], repairing it", buf->to_string());
+ index_service().repair_index_node(buf->m_index_ordinal, buf);
+ } else {
+        LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed, no need to repair it",
+ buf->to_string());
}
- // If there is an up buffer on next level, we need to process them and ask them to repair in case they were not
- // written as part of this CP.
- if (buf->m_up_buffer) {
- if (buf->m_up_buffer->is_meta_buf()) {
- // If the up_buf is meta buffer, then we found the new root, repair the current node accordingly
- index_service().repair_index_root(buf->m_index_ordinal, buf);
- } else {
- process_up_buf(buf->m_up_buffer, !was_node_committed(buf->m_up_buffer));
- }
- }
+ if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); }
}
bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) {
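
`recover_buf` is thus a reference-counted, bottom-up walk over the recovered DAG: each down buffer decrements its up buffer's wait count, and only the last arrival proceeds upward, repairing any ancestor that was never committed. A condensed standalone sketch with a hypothetical buffer type:

```cpp
#include <memory>

struct ToyBuf {
    int wait_for_down_buffers{0}; // pre-seeded with the number of down buffers
    bool committed{false};
    bool repaired{false};
    std::shared_ptr< ToyBuf > up;
};

void recover_buf(std::shared_ptr< ToyBuf > const& buf) {
    if (--buf->wait_for_down_buffers > 0) { return; } // wait for the remaining siblings
    if (!buf->committed) { buf->repaired = true; }    // stand-in for repair_index_node()
    if (buf->up) { recover_buf(buf->up); }            // keep walking up the DAG
}
```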
@@ -492,7 +503,7 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) {
//////////////////// CP Related API section /////////////////////////////////
folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
- LOGTRACEMOD(wbcache, "cp_ctx {}", cp_ctx->to_string());
+ LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags());
if (!cp_ctx->any_dirty_buffers()) {
if (cp_ctx->id() == 0) {
// For the first CP, we need to flush the journal buffer to the meta blk
@@ -503,10 +514,13 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
}
return folly::makeFuture< bool >(true); // nothing to flush
}
- // if is_crashed don't do anything
- if (hs()->crash_simulator().is_crashed()) {
- return folly::makeFuture< bool >(true); // nothing to flush
- }
+
+#ifdef _PRERELEASE
+ if (hs()->crash_simulator().is_crashed()) {
+ LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush");
+ return folly::makeFuture< bool >(true);
+ }
+#endif
// First thing is to flush the new_blks created as part of the CP.
auto const& journal_buf = cp_ctx->journal_buf();
@@ -535,27 +549,23 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
}
void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) {
-#ifdef _PRERELEASE
- if (hs()->crash_simulator().is_crashed()) {
- LOGINFOMOD(wbcache, "crash simulation is ongoing");
- return;
- }
-#endif
-
- LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string());
- buf->set_state(index_buf_state_t::FLUSHING);
-
#ifdef _PRERELEASE
if (buf->m_crash_flag_on) {
- std::string filename = "crash_buf_"+std::to_string(cp_ctx->id())+".dot";
+ std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot";
LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename);
cp_ctx->to_string_dot(filename);
hs()->crash_simulator().crash();
cp_ctx->complete(true);
return;
+ } else if (hs()->crash_simulator().is_crashed()) {
+ LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing");
+ return;
}
#endif
+ LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string());
+ buf->set_state(index_buf_state_t::FLUSHING);
+
if (buf->is_meta_buf()) {
LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(),
buf->to_string());
@@ -580,11 +590,13 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const
}
void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) {
+#ifdef _PRERELEASE
if (hs()->crash_simulator().is_crashed()) {
- LOGINFOMOD(wbcache, "process_write_completion don't do anything for {}", buf->to_string());
- cp_ctx->complete(true);
+ LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion");
return;
}
+#endif
+
LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string());
resource_mgr().dec_dirty_buf_size(m_node_size);
auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf);
diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp
index d7b78800d..209d3845e 100644
--- a/src/lib/index/wb_cache.hpp
+++ b/src/lib/index/wb_cache.hpp
@@ -40,6 +40,7 @@ class IndexWBCache : public IndexWBCacheBase {
std::vector< iomgr::io_fiber_t > m_cp_flush_fibers;
std::mutex m_flush_mtx;
void* m_meta_blk;
+ bool m_in_recovery{false};
public:
IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb,
@@ -75,7 +76,7 @@ class IndexWBCache : public IndexWBCacheBase {
void get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtr const& prev_flushed_buf,
IndexBufferPtrList& bufs);
- void process_up_buf(IndexBufferPtr const& buf, bool do_repair);
+ void recover_buf(IndexBufferPtr const& buf);
bool was_node_committed(IndexBufferPtr const& buf);
};
} // namespace homestore
diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp
index cb4a20e88..dd5508bea 100644
--- a/src/tests/btree_helpers/btree_test_helper.hpp
+++ b/src/tests/btree_helpers/btree_test_helper.hpp
@@ -274,7 +274,7 @@ struct BtreeTestHelper {
qreq.enable_route_tracing();
auto const ret = m_bt->query(qreq, out_vector);
auto const expected_count = std::min(remaining, batch_size);
- this->print_keys();
+ // this->print_keys();
ASSERT_EQ(out_vector.size(), expected_count) << "Received incorrect value on query pagination";
if (remaining < batch_size) {
@@ -374,9 +374,11 @@ struct BtreeTestHelper {
run_in_parallel(op_list);
}
- void print(const std::string& file = "") const { m_bt->print_tree(file); }
- void print_keys() const { m_bt->print_tree_keys(); }
- void visualize_keys(const std::string &file) const { m_bt->visualize_tree_keys( file ); }
+ void dump_to_file(const std::string& file = "") const { m_bt->dump_tree_to_file(file); }
+ void print_keys(const std::string& preamble = "") const {
+ LOGINFO("{}{}", preamble.empty() ? "" : preamble + ":\n", m_bt->to_string_keys());
+ }
+ void visualize_keys(const std::string& file) const { m_bt->visualize_tree_keys(file); }
void compare_files(const std::string& before, const std::string& after) {
std::ifstream b(before, std::ifstream::ate);
diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp
index 68e6bbee0..6b5ff27ab 100644
--- a/src/tests/test_index_btree.cpp
+++ b/src/tests/test_index_btree.cpp
@@ -288,7 +288,7 @@ TYPED_TEST(BtreeTest, CpFlush) {
LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries);
this->do_query(0, num_entries - 1, 75);
- this->print(std::string("before.txt"));
+ this->dump_to_file(std::string("before.txt"));
this->destroy_btree();
@@ -298,7 +298,7 @@ TYPED_TEST(BtreeTest, CpFlush) {
std::this_thread::sleep_for(std::chrono::seconds{1});
LOGINFO("Restarted homestore with index recovered");
- this->print(std::string("after.txt"));
+ this->dump_to_file(std::string("after.txt"));
LOGINFO("Query {} entries", num_entries);
this->do_query(0, num_entries - 1, 1000);
@@ -336,7 +336,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) {
LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries);
this->do_query(0, num_entries - 1, 75);
- this->print(std::string("before.txt"));
+ this->dump_to_file(std::string("before.txt"));
this->destroy_btree();
@@ -345,7 +345,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) {
std::this_thread::sleep_for(std::chrono::seconds{1});
LOGINFO(" Restarted homestore with index recovered");
- this->print(std::string("after.txt"));
+ this->dump_to_file(std::string("after.txt"));
this->compare_files("before.txt", "after.txt");
@@ -400,7 +400,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) {
LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries);
this->do_query(0, num_entries - 1, 75);
- this->print(std::string("before.txt"));
+ this->dump_to_file(std::string("before.txt"));
this->destroy_btree();
// Restart homestore. m_bt is updated by the TestIndexServiceCallback.
@@ -408,7 +408,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) {
std::this_thread::sleep_for(std::chrono::seconds{1});
LOGINFO(" Restarted homestore with index recovered");
- this->print(std::string("after.txt"));
+ this->dump_to_file(std::string("after.txt"));
this->compare_files("before.txt", "after.txt");
diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp
index c4c90fd73..0e34e6daf 100644
--- a/src/tests/test_index_crash_recovery.cpp
+++ b/src/tests/test_index_crash_recovery.cpp
@@ -89,6 +89,7 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
s.generic.cache_max_throttle_cnt = 10000;
s.generic.cp_timer_us = 0x8000000000000000;
+ s.resource_limits.dirty_buf_percent = 100;
HS_SETTINGS_FACTORY().save();
});
@@ -128,30 +129,28 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
void restart_homestore(uint32_t shutdown_delay_sec = 3) override {
this->params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this);
LOGINFO("\n\n\n\n\n\n shutdown homestore for index service Test\n\n\n\n\n");
-// this->m_shadow_map.save(this->m_shadow_filename);
+ // this->m_shadow_map.save(this->m_shadow_filename);
test_common::HSTestHelper::restart_homestore(shutdown_delay_sec);
-
}
void reapply_after_crash() {
ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()};
snapshot_map.load(m_shadow_filename);
- LOGINFO("\t\t\t\tMehdi: snapshot before crash\n{}", snapshot_map.to_string());
+ LOGINFO("\tSnapshot before crash\n{}", snapshot_map.to_string());
auto diff = this->m_shadow_map.diff(snapshot_map);
- std::string dif_str="KEY \tADDITION\n";
+ std::string dif_str = "KEY \tADDITION\n";
for (const auto& [k, addition] : diff) {
dif_str += fmt::format(" {} \t{}\n", k.key(), addition);
}
// visualize tree after crash
- std::string recovered_tree_filename = "tree_after_crash_" + to_string(rand()%100)+".dot";
- this->visualize_keys(recovered_tree_filename);
- LOGINFO(" tree after recovered stored in {}", recovered_tree_filename );
- test_common::HSTestHelper::trigger_cp(true);
+ std::string recovered_tree_filename = "tree_after_crash_" + to_string(rand() % 100) + ".dot";
+ // this->visualize_keys(recovered_tree_filename);
+ LOGINFO(" tree after recovered stored in {}", recovered_tree_filename);
+ // test_common::HSTestHelper::trigger_cp(true);
LOGINFO("Diff between shadow map and snapshot map\n{}\n", dif_str);
for (const auto& [k, addition] : diff) {
- LOGINFO("\t\n\n reapply: before inserting key {}", k.key());
- this->print_keys();
- this->visualize_keys(recovered_tree_filename);
+ // this->print_keys(fmt::format("reapply: before inserting key {}", k.key()));
+ // this->visualize_keys(recovered_tree_filename);
if (addition) { this->force_upsert(k.key()); }
}
test_common::HSTestHelper::trigger_cp(true);
@@ -175,22 +174,21 @@ struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestT
BtreeTestHelper< TestType >::TearDown();
this->shutdown_homestore(false);
}
- void crash_and_recover(uint32_t s_key, uint32_t e_key){
- this->print_keys();
+ void crash_and_recover(uint32_t s_key, uint32_t e_key) {
+ this->print_keys("Btree prior to CP and susbsequent simulated crash: ");
test_common::HSTestHelper::trigger_cp(false);
this->wait_for_crash_recovery();
- this->visualize_keys("tree_after_crash_"+std::to_string(s_key)+"_"+std::to_string(e_key)+".dot");
- this->print_keys();
+ // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot");
+
+ this->print_keys("Post crash and recovery, btree structure: ");
this->reapply_after_crash();
this->get_all();
- LOGINFO(" except to have [{},{}) in tree and it is actually{} ", s_key, e_key,tree_key_count());
+ LOGINFO(" except to have [{},{}) in tree and it is actually{} ", s_key, e_key, tree_key_count());
ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id()))
<< "shadow map size and tree size mismatch";
}
- uint32_t tree_key_count(){
- return this->m_bt->count_keys(this->m_bt->root_node_id());
- }
+ uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); }
protected:
const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt";
@@ -236,10 +234,9 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) {
for (auto k = num_entries / 2; k < num_entries * 3 / 4; ++k) {
this->put(k, btree_put_type::INSERT, true /* expect_success */);
}
- LOGINFO("Step 4: Post crash we reapply the missing entries to tree");
+ LOGINFO("Step 4: Crash and reapply the missing entries to tree");
this->crash_and_recover(num_entries / 2, num_entries);
-
// TODO: Uncomment this once we do a fix for the inconsistent query results
LOGINFO("Step 5: Fill the 2nd quarter of the tree, to make sure left child is split and we crash on flush of the "
"left child");
@@ -247,27 +244,24 @@ TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) {
this->visualize_keys("tree_before_insert.dot");
for (auto k = num_entries / 4; k < num_entries / 2; ++k) {
LOGINFO("inserting key {}", k);
- this->visualize_keys("tree_before_"+to_string(k)+".dot");
+ this->visualize_keys("tree_before_" + to_string(k) + ".dot");
this->put(k, btree_put_type::INSERT, true /* expect_success */);
}
this->visualize_keys("tree_before_crash.dot");
- this->print_keys();
- this->print();
- LOGINFO("Step 6: Post crash we reapply the missing entries to tree");
+ this->dump_to_file("tree_before_crash.txt");
+ LOGINFO("Step 6: Simulate crash and then recover, reapply keys to tree");
this->crash_and_recover(num_entries / 4, num_entries);
-
-// LOGINFO("Step 7: Fill the 1st quarter of the tree, to make sure left child is split and we crash on flush of the "
-// "parent node");
-// this->set_basic_flip("crash_flush_on_split_at_parent");
-// for (auto k = 0u; k <= num_entries / 4; ++k) {
-// this->put(k, btree_put_type::INSERT, true /* expect_success */);
-// }
-// LOGINFO("Step 8: Post crash we reapply the missing entries to tree");
-// this->crash_and_recover(0, num_entries);
+ LOGINFO("Step 7: Fill the 1st quarter of the tree, to make sure left child is split and we crash on flush of the "
+ "parent node");
+ this->set_basic_flip("crash_flush_on_split_at_parent");
+ for (auto k = 0u; k <= num_entries / 4; ++k) {
+ this->put(k, btree_put_type::INSERT, true /* expect_success */);
+ }
+ LOGINFO("Step 8: Post crash we reapply the missing entries to tree");
+ this->crash_and_recover(0, num_entries);
LOGINFO("Step 9: Query all entries and validate with pagination of 80 entries");
this->query_all_paginate(80);
-
}
int main(int argc, char* argv[]) {