diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 000000000..04e3372dd
Binary files /dev/null and b/.DS_Store differ
diff --git a/conanfile.py b/conanfile.py
index 774a1ac0d..fd824630f 100644
--- a/conanfile.py
+++ b/conanfile.py
@@ -9,7 +9,7 @@
class HomestoreConan(ConanFile):
name = "homestore"
- version = "6.4.33"
+ version = "6.4.34"
homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
diff --git a/src/include/homestore/btree/btree.hpp b/src/include/homestore/btree/btree.hpp
index f7a82b14a..2ef1e1d44 100644
--- a/src/include/homestore/btree/btree.hpp
+++ b/src/include/homestore/btree/btree.hpp
@@ -34,6 +34,12 @@ namespace homestore {
using BtreeNodePtr = boost::intrusive_ptr< BtreeNode >;
using BtreeNodeList = folly::small_vector< BtreeNodePtr, 3 >;
+struct BtreeVisualizeVariables {
+ uint64_t parent;
+ uint64_t midPoint;
+ uint64_t index;
+};
+
struct BtreeThreadVariables {
std::vector< btree_locked_node_info > wr_locked_nodes;
std::vector< btree_locked_node_info > rd_locked_nodes;
@@ -114,8 +120,9 @@ class Btree {
virtual std::pair< btree_status_t, uint64_t > destroy_btree(void* context);
nlohmann::json get_status(int log_level) const;
- void print_tree(const std::string& file = "") const;
- void print_tree_keys() const;
+ void dump_tree_to_file(const std::string& file = "") const;
+ std::string to_custom_string(to_string_cb_t< K, V > const& cb) const;
+ std::string visualize_tree_keys(const std::string& file) const;
uint64_t count_keys(bnodeid_t bnodeid) const;
nlohmann::json get_metrics_in_json(bool updated = true);
@@ -194,7 +201,9 @@ class Btree {
uint64_t get_btree_node_cnt() const;
uint64_t get_child_node_cnt(bnodeid_t bnodeid) const;
void to_string(bnodeid_t bnodeid, std::string& buf) const;
- void to_string_keys(bnodeid_t bnodeid, std::string& buf) const;
+ void to_custom_string_internal(bnodeid_t bnodeid, std::string& buf, to_string_cb_t< K, V > const& cb) const;
+ void to_dot_keys(bnodeid_t bnodeid, std::string& buf, std::map< uint32_t, std::vector< uint64_t > >& l_map,
+ std::map< uint64_t, BtreeVisualizeVariables >& info_map) const;
void validate_sanity_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
void validate_sanity_next_child(const BtreeNodePtr& parent_node, uint32_t ind) const;
void print_node(const bnodeid_t& bnodeid) const;
diff --git a/src/include/homestore/btree/btree.ipp b/src/include/homestore/btree/btree.ipp
index 73279d138..35c42ab97 100644
--- a/src/include/homestore/btree/btree.ipp
+++ b/src/include/homestore/btree/btree.ipp
@@ -308,7 +308,7 @@ nlohmann::json Btree< K, V >::get_status(int log_level) const {
}
template < typename K, typename V >
-void Btree< K, V >::print_tree(const std::string& file) const {
+void Btree< K, V >::dump_tree_to_file(const std::string& file) const {
std::string buf;
m_btree_lock.lock_shared();
to_string(m_root_node_info.bnode_id(), buf);
@@ -323,13 +323,53 @@ void Btree< K, V >::print_tree(const std::string& file) const {
}
template < typename K, typename V >
-void Btree< K, V >::print_tree_keys() const {
+std::string Btree< K, V >::to_custom_string(to_string_cb_t< K, V > const& cb) const {
std::string buf;
m_btree_lock.lock_shared();
- to_string_keys(m_root_node_info.bnode_id(), buf);
+ to_custom_string_internal(m_root_node_info.bnode_id(), buf, cb);
m_btree_lock.unlock_shared();
- LOGINFO("Pre order traversal of tree:\n<{}>", buf);
+ return buf;
+}
+
+template < typename K, typename V >
+std::string Btree< K, V >::visualize_tree_keys(const std::string& file) const {
+ std::map< uint32_t, std::vector< uint64_t > > level_map;
+ std::map< uint64_t, BtreeVisualizeVariables > info_map;
+ std::string buf = "digraph G\n"
+ "{ \n"
+ "ranksep = 3.0;\n"
+ R"(graph [splines="polyline"];
+)";
+
+ m_btree_lock.lock_shared();
+ to_dot_keys(m_root_node_info.bnode_id(), buf, level_map, info_map);
+ m_btree_lock.unlock_shared();
+ for (const auto& [child, info] : info_map) {
+ if (info.parent) {
+ buf += fmt::format(R"(
+ "{}":connector{} -> "{}":"key{}" [splines=false];)",
+ info.parent, info.index, child, info.midPoint);
+ }
+ }
+
+ std::string result;
+ for (const auto& [key, values] : level_map) {
+ result += "{rank=same; ";
+ std::vector< std::string > quotedValues;
+ std::transform(values.begin(), values.end(), std::back_inserter(quotedValues),
+ [](uint64_t value) { return fmt::format("\"{}\"", value); });
+
+ result += fmt::to_string(fmt::join(quotedValues, " ")) + "}\n";
+ }
+
+ buf += "\n" + result + " }\n";
+ if (!file.empty()) {
+ std::ofstream o(file);
+ o.write(buf.c_str(), buf.size());
+ o.flush();
+ }
+ return buf;
}
template < typename K, typename V >
diff --git a/src/include/homestore/btree/detail/btree_common.ipp b/src/include/homestore/btree/detail/btree_common.ipp
index edd895d89..b21305497 100644
--- a/src/include/homestore/btree/detail/btree_common.ipp
+++ b/src/include/homestore/btree/detail/btree_common.ipp
@@ -147,23 +147,54 @@ void Btree< K, V >::to_string(bnodeid_t bnodeid, std::string& buf) const {
}
template < typename K, typename V >
-void Btree< K, V >::to_string_keys(bnodeid_t bnodeid, std::string& buf) const {
+void Btree< K, V >::to_custom_string_internal(bnodeid_t bnodeid, std::string& buf,
+ to_string_cb_t< K, V > const& cb) const {
BtreeNodePtr node;
locktype_t acq_lock = locktype_t::READ;
if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; }
- fmt::format_to(std::back_inserter(buf), "{}\n", node->to_string_keys());
+ fmt::format_to(std::back_inserter(buf), "{}\n", node->to_custom_string(cb));
if (!node->is_leaf()) {
uint32_t i = 0;
while (i < node->total_entries()) {
BtreeLinkInfo p;
node->get_nth_value(i, &p, false);
- to_string_keys(p.bnode_id(), buf);
+ to_custom_string_internal(p.bnode_id(), buf, cb);
++i;
}
- if (node->has_valid_edge()) { to_string_keys(node->edge_id(), buf); }
+ if (node->has_valid_edge()) { to_custom_string_internal(node->edge_id(), buf, cb); }
+ }
+ unlock_node(node, acq_lock);
+}
+
+template < typename K, typename V >
+void Btree< K, V >::to_dot_keys(bnodeid_t bnodeid, std::string& buf,
+ std::map< uint32_t, std::vector< uint64_t > >& l_map,
+ std::map< uint64_t, BtreeVisualizeVariables >& info_map) const {
+ BtreeNodePtr node;
+ locktype_t acq_lock = locktype_t::READ;
+
+ if (read_and_lock_node(bnodeid, node, acq_lock, acq_lock, nullptr) != btree_status_t::success) { return; }
+ fmt::format_to(std::back_inserter(buf), "{}\n", node->to_dot_keys());
+ l_map[node->level()].push_back(node->node_id());
+ info_map[node->node_id()].midPoint = node->is_leaf() ? 0 : node->total_entries() / 2;
+ if (!node->is_leaf()) {
+ uint32_t i = 0;
+ while (i < node->total_entries()) {
+ BtreeLinkInfo p;
+ node->get_nth_value(i, &p, false);
+ to_dot_keys(p.bnode_id(), buf, l_map, info_map);
+ info_map[p.bnode_id()].parent = node->node_id();
+ info_map[p.bnode_id()].index = i;
+ ++i;
+ }
+ if (node->has_valid_edge()) {
+ to_dot_keys(node->edge_id(), buf, l_map, info_map);
+ info_map[node->edge_id()].parent = node->node_id();
+ info_map[node->edge_id()].index = node->total_entries();
+ }
}
unlock_node(node, acq_lock);
}
diff --git a/src/include/homestore/btree/detail/btree_internal.hpp b/src/include/homestore/btree/detail/btree_internal.hpp
index 352b06e04..a77bfe5ac 100644
--- a/src/include/homestore/btree/detail/btree_internal.hpp
+++ b/src/include/homestore/btree/detail/btree_internal.hpp
@@ -215,6 +215,9 @@ class BtreeNode;
void intrusive_ptr_add_ref(BtreeNode* node);
void intrusive_ptr_release(BtreeNode* node);
+template < typename K, typename V >
+using to_string_cb_t = std::function< std::string(std::vector< std::pair< K, V > > const&) >;
+
ENUM(btree_event_t, uint8_t, READ, MUTATE, REMOVE, SPLIT, REPAIR, MERGE);
struct trace_route_entry {
diff --git a/src/include/homestore/btree/detail/btree_mutate_impl.ipp b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
index 209b35558..0c5313eb2 100644
--- a/src/include/homestore/btree/detail/btree_mutate_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_mutate_impl.ipp
@@ -299,6 +299,20 @@ btree_status_t Btree< K, V >::split_node(const BtreeNodePtr& parent_node, const
return ret;
}
+// template < typename K, typename V >
+// template < typename ReqT >
+// bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const {
+// if (!node->is_leaf()) { // if internal node, size is atmost one additional entry, size of K/V
+// return node->total_entries() >= 3;
+// } else if constexpr (std::is_same_v< ReqT, BtreeRangePutRequest< K > >) {
+// return node->total_entries() >= 3;
+// } else if constexpr (std::is_same_v< ReqT, BtreeSinglePutRequest >) {
+// return node->total_entries() >= 3;;
+// } else {
+// return false;
+// }
+// }
+
template < typename K, typename V >
template < typename ReqT >
bool Btree< K, V >::is_split_needed(const BtreeNodePtr& node, ReqT& req) const {
diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp
index 73e8be5a8..84f70aa05 100644
--- a/src/include/homestore/btree/detail/btree_node.hpp
+++ b/src/include/homestore/btree/detail/btree_node.hpp
@@ -69,17 +69,24 @@ struct persistent_hdr_t {
persistent_hdr_t() : nentries{0}, leaf{0}, node_deleted{0} {}
std::string to_string() const {
- return fmt::format("magic={} version={} csum={} node_id={} next_node={} nentries={} node_type={} is_leaf={} "
- "node_deleted={} node_gen={} modified_cp_id={} link_version={} edge_nodeid={}, "
- "edge_link_version={} level={} ",
- magic, version, checksum, node_id, next_node, nentries, node_type, leaf, node_deleted,
- node_gen, modified_cp_id, link_version, edge_info.m_bnodeid, edge_info.m_link_version,
- level);
+ auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node);
+ auto sedge = (edge_info.m_bnodeid == empty_bnodeid)
+ ? ""
+ : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version);
+ return fmt::format("magic={} version={} csum={} node_id={}{} nentries={} node_type={} is_leaf={} "
+ "node_deleted={} node_gen={} modified_cp_id={} link_version={}{} level={} ",
+ magic, version, checksum, node_id, snext, nentries, node_type, leaf, node_deleted, node_gen,
+ modified_cp_id, link_version, sedge, level);
}
std::string to_compact_string() const {
- return fmt::format("{} id={} next={} nentries={} {} level={}", (void*)this, node_id, next_node, nentries,
- (node_deleted == 0x1) ? "Deleted" : "", level);
+ auto snext = (next_node == empty_bnodeid) ? "" : " next=" + std::to_string(next_node);
+ auto sedge = (edge_info.m_bnodeid == empty_bnodeid)
+ ? ""
+ : fmt::format(" edge={}.{}", edge_info.m_bnodeid, edge_info.m_link_version);
+ return fmt::format("id={}{}{} {} level={} nentries={}{} mod_cp={}", node_id, snext, sedge,
+ leaf ? "LEAF" : "INTERIOR", level, nentries, (node_deleted == 0x1) ? " Deleted" : "",
+ modified_cp_id);
}
};
#pragma pack()
@@ -111,10 +118,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
// Identify if a node is a leaf node or not, from raw buffer, by just reading persistent_hdr_t
static bool identify_leaf_node(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->leaf; }
+ static std::string to_string_buf(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->to_compact_string(); }
static BtreeLinkInfo::bnode_link_info identify_edge_info(uint8_t* buf) {
return (r_cast< persistent_hdr_t* >(buf))->edge_info;
}
- static std::string to_string_buf(uint8_t* buf) { return (r_cast< persistent_hdr_t* >(buf))->to_string(); }
static bool is_valid_node(sisl::blob const& buf) {
auto phdr = r_cast< persistent_hdr_t const* >(buf.cbytes());
@@ -347,6 +354,41 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
void lock_acknowledge() { m_trans_hdr.upgraders.decrement(1); }
bool any_upgrade_waiters() const { return (!m_trans_hdr.upgraders.testz()); }
+ template < typename K, typename V >
+ std::string to_custom_string(to_string_cb_t< K, V > const& cb) const {
+ std::string snext =
+ (this->next_bnode() == empty_bnodeid) ? "" : fmt::format(" next_node={}", this->next_bnode());
+ auto str = fmt::format("id={}.{} level={} nEntries={} {}{} node_gen={} ", this->node_id(), this->link_version(),
+ this->level(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"), snext,
+ this->node_gen());
+ if (this->has_valid_edge()) {
+ fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid,
+ this->edge_info().m_link_version);
+ }
+
+ if (this->total_entries() == 0) {
+ fmt::format_to(std::back_inserter(str), " [EMPTY] ");
+ return str;
+ } else if (this->is_leaf()) {
+ std::vector< std::pair< K, V > > entries;
+ for (uint32_t i{0}; i < this->total_entries(); ++i) {
+ V v;
+ get_nth_value(i, &v, false);
+ entries.emplace_back(std::make_pair(get_nth_key< K >(i, false), v));
+ }
+ fmt::format_to(std::back_inserter(str), " Keys=[{}]", cb(entries));
+ return str;
+ } else {
+ fmt::format_to(std::back_inserter(str), " Keys=[");
+ for (uint32_t i{0}; i < this->total_entries(); ++i) {
+ fmt::format_to(std::back_inserter(str), "{}{}", get_nth_key< K >(i, false).to_string(),
+ (i == this->total_entries() - 1) ? "" : ", ");
+ }
+ fmt::format_to(std::back_inserter(str), "]");
+ }
+ return str;
+ }
+
public:
// Public method which needs to be implemented by variants
virtual btree_status_t insert(uint32_t ind, const BtreeKey& key, const BtreeValue& val) = 0;
@@ -384,7 +426,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
}
virtual std::string to_string(bool print_friendly = false) const = 0;
- virtual std::string to_string_keys(bool print_friendly = false) const = 0;
+ virtual std::string to_dot_keys() const = 0;
protected:
node_find_result_t bsearch_node(const BtreeKey& key) const {
diff --git a/src/include/homestore/btree/detail/btree_remove_impl.ipp b/src/include/homestore/btree/detail/btree_remove_impl.ipp
index 6b20473d3..82213dcc6 100644
--- a/src/include/homestore/btree/detail/btree_remove_impl.ipp
+++ b/src/include/homestore/btree/detail/btree_remove_impl.ipp
@@ -119,7 +119,7 @@ retry:
goto retry;
} else if (ret == btree_status_t::merge_not_required) {
BT_NODE_LOG(DEBUG, my_node, "merge is not required for child = {} keys: {}", curr_idx,
- child_node->to_string_keys());
+ child_node->to_string());
}
}
}
diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp
index 486dec6f1..072b4f654 100644
--- a/src/include/homestore/btree/detail/prefix_node.hpp
+++ b/src/include/homestore/btree/detail/prefix_node.hpp
@@ -644,7 +644,7 @@ class FixedPrefixNode : public VariantNode< K, V > {
return str;
}
- std::string to_string_keys(bool print_friendly = false) const override { return "NOT Supported"; }
+ std::string to_dot_keys() const override { return "NOT Supported"; }
private:
uint16_t add_prefix(BtreeKey const& key, BtreeValue const& val) {
diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp
index d909c202f..85318c9b9 100644
--- a/src/include/homestore/btree/detail/simple_node.hpp
+++ b/src/include/homestore/btree/detail/simple_node.hpp
@@ -201,18 +201,22 @@ class SimpleNode : public VariantNode< K, V > {
}
bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override {
+#ifdef _PRERELEASE
+ // return (this->total_entries() <= 3);
+#endif
return ((put_type == btree_put_type::UPSERT) || (put_type == btree_put_type::INSERT))
? (get_available_entries() > 0)
: true;
}
std::string to_string(bool print_friendly = false) const override {
- auto str = fmt::format("{}id={} level={} nEntries={} {} next_node={} ",
+ auto snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
+ auto str = fmt::format("{}id={} level={} nEntries={} {} {} ",
(print_friendly ? "------------------------------------------------------------\n" : ""),
this->node_id(), this->level(), this->total_entries(),
- (this->is_leaf() ? "LEAF" : "INTERIOR"), this->next_bnode());
- if (!this->is_leaf() && (this->has_valid_edge())) {
- fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
+ (this->is_leaf() ? "LEAF" : "INTERIOR"), snext);
+ if (this->has_valid_edge()) {
+ fmt::format_to(std::back_inserter(str), " edge={}.{}", this->edge_info().m_bnodeid,
this->edge_info().m_link_version);
}
@@ -224,55 +228,75 @@ class SimpleNode : public VariantNode< K, V > {
return str;
}
- std::string to_string_keys(bool print_friendly = false) const override {
- // FIXME: Implement this, key may not be a unit32_t
- return "";
-#if 0
- std::string delimiter = print_friendly ? "\n" : "\t";
- std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
- auto str = fmt::format("{}{}.{} level:{} nEntries={} {} {} node_gen={} ",
- print_friendly ? "------------------------------------------------------------\n" : "",
- this->node_id(), this->link_version(), this->level(), this->total_entries(),
- (this->is_leaf() ? "LEAF" : "INTERIOR"), snext, this->node_gen());
- if (!this->is_leaf() && (this->has_valid_edge())) {
- fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
- this->edge_info().m_link_version);
- }
- if (this->total_entries() == 0) {
- fmt::format_to(std::back_inserter(str), " [EMPTY] ");
- return str;
- }
- if (!this->is_leaf()) {
- fmt::format_to(std::back_inserter(str), " [");
- for (uint32_t i{0}; i < this->total_entries(); ++i) {
- uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key();
- BtreeLinkInfo child_info;
- get_nth_value(i, &child_info, false /* copy */);
- fmt::format_to(std::back_inserter(str), "{}.{} {}", cur_key, child_info.link_version(),
- i == this->total_entries() - 1 ? "" : ", ");
- }
- fmt::format_to(std::back_inserter(str), "]");
- return str;
+ std::string to_dot_keys() const override {
+ std::string str;
+ std::string snext = this->next_bnode() == empty_bnodeid ? "" : fmt::format("next_node={}", this->next_bnode());
+ str += fmt::format(R"("{}" [
+ shape = none,
+ labelloc="c",
+ fontsize=25,
+ label = <
+ )",
+ this->node_id());
+ if (this->total_entries() == 0) {
+ return str + fmt::format(R"(
+ | E |
+ |
>])");
+ }
+
+ if (!this->is_leaf()) {
+ // str += " ";
+ for (uint32_t i{0}; i < this->total_entries(); ++i) {
+ uint32_t cur_key = get_nth_key< K >(i, false).key();
+ BtreeLinkInfo child_info;
+ get_nth_value(i, &child_info, false /* copy */);
+ str += fmt::format(R"(
+ | {}.{} | )",
+ i, i, cur_key, child_info.link_version());
}
- uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key();
+ std::string sedge = this->has_valid_edge() ? "edge:" + std::to_string(this->edge_info().m_bnodeid) + "." +
+ std::to_string(this->edge_info().m_link_version)
+ : "";
+ str += fmt::format(R"(
+ |
+ {}.{} gen={} {} {} |
>];)",
+ this->total_entries(), this->node_id(), this->link_version(), this->node_gen(), snext,
+ sedge);
+
+ } else {
+ std::string keys_buf = "";
+ uint32_t prev_key = get_nth_key< K >(0, false).key();
uint32_t cur_key = prev_key;
- uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key();
+ uint32_t last_key = get_nth_key< K >(this->total_entries() - 1, false).key();
if (last_key - prev_key == this->total_entries() - 1) {
- if (this->total_entries() == 1)
- fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key);
- else
- fmt::format_to(std::back_inserter(str), "{}[{}-{}]", delimiter, prev_key, last_key);
- return str;
+ if (this->total_entries() == 1) {
+ keys_buf += fmt::format(R"(
+ | {} | )",
+ 0, 0, cur_key);
+ } else {
+ keys_buf += fmt::format(R"(
+ | {}-{} | )",
+ 0, 0, prev_key, last_key);
+ }
+ keys_buf += fmt::format(R"(
+ |
+ {}.{} gen={} {} |
+ >];)",
+ 1, this->node_id(), this->link_version(), this->node_gen(), snext);
+ return str + keys_buf;
}
- fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key);
+
+ keys_buf += fmt::format(R"(
+ " | {})",
+ 0, 0, prev_key);
uint32_t start_interval_key = prev_key;
for (uint32_t i{1}; i < this->total_entries(); ++i) {
- cur_key = BtreeNode::get_nth_key< K >(i, false).key();
+ cur_key = get_nth_key< K >(i, false).key();
if (cur_key != prev_key + 1) {
if (start_interval_key == prev_key) {
- fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key);
+ keys_buf += fmt::format(" {}", cur_key);
} else {
- fmt::format_to(std::back_inserter(str), "]{}{}- [{}", delimiter, i, cur_key);
+ keys_buf += fmt::format("-{} {}", prev_key, cur_key);
}
start_interval_key = cur_key;
}
@@ -280,12 +304,17 @@ class SimpleNode : public VariantNode< K, V > {
}
if (start_interval_key == prev_key) {
- fmt::format_to(std::back_inserter(str), "]");
+ keys_buf += fmt::format(" | ");
} else {
- fmt::format_to(std::back_inserter(str), "-{}]", cur_key);
+ keys_buf += fmt::format(" {}", cur_key);
}
- return str;
-#endif
+ keys_buf += fmt::format(R"(
+ |
+ {}.{} gen={} {} | >];)",
+ 1, this->node_id(), this->link_version(), this->node_gen(), snext);
+ return str + keys_buf;
+ }
+ return str;
}
#ifndef NDEBUG
diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp
index e98b63a52..fcb7ff79c 100644
--- a/src/include/homestore/btree/detail/varlen_node.hpp
+++ b/src/include/homestore/btree/detail/varlen_node.hpp
@@ -508,65 +508,7 @@ class VariableNode : public VariantNode< K, V > {
return str;
}
- std::string to_string_keys(bool print_friendly = false) const override {
-#if 0
- std::string delimiter = print_friendly ? "\n" : "\t";
- auto str = fmt::format("{}{}.{} nEntries={} {} ",
- print_friendly ? "------------------------------------------------------------\n" : "",
- this->node_id(), this->link_version(), this->total_entries(), (this->is_leaf() ? "LEAF" : "INTERIOR"));
- if (!this->is_leaf() && (this->has_valid_edge())) {
- fmt::format_to(std::back_inserter(str), "edge_id={}.{}", this->edge_info().m_bnodeid,
- this->edge_info().m_link_version);
- }
- if (this->total_entries() == 0) {
- fmt::format_to(std::back_inserter(str), " [EMPTY] ");
- return str;
- }
- if (!this->is_leaf()) {
- fmt::format_to(std::back_inserter(str), " [");
- for (uint32_t i{0}; i < this->total_entries(); ++i) {
- uint32_t cur_key = BtreeNode::get_nth_key< K >(i, false).key();
- BtreeLinkInfo child_info;
- get_nth_value(i, &child_info, false /* copy */);
- fmt::format_to(std::back_inserter(str), "{}.{} {}", cur_key, child_info.link_version(), i == this->total_entries() - 1 ? "" : ", ");
- }
- fmt::format_to(std::back_inserter(str), "]");
- return str;
- }
- uint32_t prev_key = BtreeNode::get_nth_key< K >(0, false).key();
- uint32_t cur_key = prev_key;
- uint32_t last_key = BtreeNode::get_nth_key< K >(this->total_entries() - 1, false).key();
- if (last_key - prev_key == this->total_entries() - 1) {
- if (this->total_entries() == 1)
- fmt::format_to(std::back_inserter(str), "{}[{}]", delimiter, prev_key);
- else
- fmt::format_to(std::back_inserter(str), "{}[{}-{}]", delimiter, prev_key, last_key);
- return str;
- }
- fmt::format_to(std::back_inserter(str), "{}0 - [{}", delimiter, prev_key);
- uint32_t start_interval_key = prev_key;
- for (uint32_t i{1}; i < this->total_entries(); ++i) {
- cur_key = BtreeNode::get_nth_key< K >(i, false).key();
- if (cur_key != prev_key + 1) {
- if (start_interval_key == prev_key) {
- fmt::format_to(std::back_inserter(str), "-{}]{}{}- [{}", prev_key, delimiter, i, cur_key);
- } else {
- fmt::format_to(std::back_inserter(str), "]{}{}- [{}", delimiter, i, cur_key);
- }
- start_interval_key = cur_key;
- }
- prev_key = cur_key;
- }
-
- if (start_interval_key == prev_key) {
- fmt::format_to(std::back_inserter(str), "]");
- } else {
- fmt::format_to(std::back_inserter(str), "-{}]", cur_key);
- }
- return str;
-#endif
- return {};
- }
+ std::string to_dot_keys() const override { return "NOT Supported"; }
/*int compare_nth_key_range(const BtreeKeyRange& range, uint32_t ind) const {
return get_nth_key(ind, false).compare_range(range);
diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp
index d813fa4e2..85c2a304d 100644
--- a/src/include/homestore/index/index_internal.hpp
+++ b/src/include/homestore/index/index_internal.hpp
@@ -68,11 +68,11 @@ class IndexTableBase {
public:
virtual ~IndexTableBase() = default;
virtual uuid_t uuid() const = 0;
+ virtual void recovery_completed() = 0;
virtual uint32_t ordinal() const = 0;
virtual uint64_t used_size() const = 0;
virtual void destroy() = 0;
virtual void repair_node(IndexBufferPtr const& buf) = 0;
- virtual void repair_root(IndexBufferPtr const& buf) = 0;
};
enum class index_buf_state_t : uint8_t {
@@ -122,6 +122,7 @@ struct IndexBuffer : public sisl::ObjLifeCounter< IndexBuffer > {
bool is_meta_buf() const { return m_is_meta_buf; }
std::string to_string() const;
+ std::string to_string_dot() const;
};
// This is a special buffer which is used to write to the meta block
diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp
index 3461fb357..2bec275e3 100644
--- a/src/include/homestore/index/index_table.hpp
+++ b/src/include/homestore/index/index_table.hpp
@@ -46,6 +46,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
m_sb->parent_uuid = parent_uuid;
m_sb->user_sb_size = user_sb_size;
m_sb.write();
+
m_sb_buffer = std::make_shared< MetaIndexBuffer >(m_sb);
// Create a root node which is a leaf node.
@@ -56,7 +57,24 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
IndexTable(superblk< index_table_sb >&& sb, const BtreeConfig& cfg) : Btree< K, V >{cfg}, m_sb{std::move(sb)} {
m_sb_buffer = std::make_shared< MetaIndexBuffer >(m_sb);
- this->set_root_node_info(BtreeLinkInfo{m_sb->root_node, m_sb->root_link_version});
+
+        // If the root node id stored in the superblock is empty, we crashed right after the btree
+        // was created but before the root was persisted; the root is (re)created in recovery_completed().
+ if (m_sb->root_node != empty_bnodeid) {
+ this->set_root_node_info(BtreeLinkInfo{m_sb->root_node, m_sb->root_link_version});
+ }
+ }
+
+ void recovery_completed() override {
+ if (m_sb->root_node == empty_bnodeid) {
+ // After recovery, we see that root node is empty, which means that after btree is created, we crashed.
+ // So create new root node, which is essential for btree to function.
+ auto cp = hs()->cp_mgr().cp_guard();
+ auto const status = this->create_root_node((void*)cp.context(cp_consumer_t::INDEX_SVC));
+ if (status != btree_status_t::success) {
+ throw std::runtime_error(fmt::format("Unable to create root node"));
+ }
+ }
}
void destroy() override {
@@ -96,19 +114,30 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}
void repair_node(IndexBufferPtr const& idx_buf) override {
- BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), true,
+ if (idx_buf->is_meta_buf()) {
+            // We cannot repair the meta buf on its own; we need to repair the root node, which modifies the
+            // meta_buf. It is ok to ignore this call, because repair is driven from the root before the
+            // meta_buf repair is attempted, and by then the meta_buf will already have been updated.
+ return;
+ }
+ BtreeNode* n = this->init_node(idx_buf->raw_buffer(), idx_buf->blkid().to_integer(), false /* init_buf */,
BtreeNode::identify_leaf_node(idx_buf->raw_buffer()));
static_cast< IndexBtreeNode* >(n)->attach_buf(idx_buf);
auto cpg = cp_mgr().cp_guard();
- repair_links(BtreeNodePtr{n}, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
- }
- void repair_root(IndexBufferPtr const& root_buf) override {
- BtreeNode* n = this->init_node(root_buf->raw_buffer(), root_buf->blkid().to_integer(), true,
- BtreeNode::identify_leaf_node(root_buf->raw_buffer()));
- static_cast< IndexBtreeNode* >(n)->attach_buf(root_buf);
- auto cpg = cp_mgr().cp_guard();
- on_root_changed(n, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
+ // Set the cp_id to current cp buf, since repair doesn't call get_writable_buf (unlike regular IO path), so
+ // we need to set it here, so that other code path assumes correct cp
+ idx_buf->m_dirtied_cp_id = cpg->id();
+ BtreeNodePtr bn = BtreeNodePtr{n};
+
+ LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
+ repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
+
+ if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) {
+ // Our up buffer is a meta buffer, which means that we are the new root node, we need to update the
+ // meta_buf with new root as well
+ on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
+ }
}
protected:
@@ -125,21 +154,22 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
auto cp_ctx = r_cast< CPContext* >(context);
auto idx_node = static_cast< IndexBtreeNode* >(node.get());
+ node->set_checksum();
auto prev_state = idx_node->m_idx_buf->m_state.exchange(index_buf_state_t::DIRTY);
if (prev_state == index_buf_state_t::CLEAN) {
// It was clean before, dirtying it first time, add it to the wb_cache list to flush
- BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(),
- "Writing a node which was not acquired by this cp");
+ if (idx_node->m_idx_buf->m_dirtied_cp_id != -1) {
+ BT_DBG_ASSERT_EQ(idx_node->m_idx_buf->m_dirtied_cp_id, cp_ctx->id(),
+ "Writing a node which was not acquired by this cp");
+ }
+ node->set_modified_cp_id(cp_ctx->id());
wb_cache().write_buf(node, idx_node->m_idx_buf, cp_ctx);
- LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), idx_node->m_idx_buf->to_string());
} else {
BT_DBG_ASSERT_NE(
(int)prev_state, (int)index_buf_state_t::FLUSHING,
"Writing on a node buffer which was currently in flushing state on cur_cp={} buffer_cp_id={}",
- cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id)
+ cp_ctx->id(), idx_node->m_idx_buf->m_dirtied_cp_id);
}
- node->set_checksum();
- node->set_modified_cp_id(cp_ctx->id());
return btree_status_t::success;
}
@@ -147,24 +177,25 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node,
void* context) override {
CPContext* cp_ctx = r_cast< CPContext* >(context);
- auto& left_child_buf = static_cast< IndexBtreeNode* >(left_child_node.get())->m_idx_buf;
- auto& parent_buf = static_cast< IndexBtreeNode* >(parent_node.get())->m_idx_buf;
IndexBufferPtrList new_node_bufs;
for (const auto& right_child_node : new_nodes) {
write_node_impl(right_child_node, context);
- new_node_bufs.push_back(static_cast< IndexBtreeNode* >(right_child_node.get())->m_idx_buf);
+ new_node_bufs.push_back(s_cast< IndexBtreeNode* >(right_child_node.get())->m_idx_buf);
}
write_node_impl(left_child_node, context);
- write_node_impl(parent_node, context);
+ // during recovery it is possible that there is no parent_node
+ if (parent_node.get() != nullptr) { write_node_impl(parent_node, context); }
IndexBufferPtrList freed_node_bufs;
for (const auto& freed_node : freed_nodes) {
- freed_node_bufs.push_back(static_cast< IndexBtreeNode* >(freed_node.get())->m_idx_buf);
+ freed_node_bufs.push_back(s_cast< IndexBtreeNode* >(freed_node.get())->m_idx_buf);
this->free_node(freed_node, locktype_t::WRITE, context);
}
- wb_cache().transact_bufs(ordinal(), parent_buf, left_child_buf, new_node_bufs, freed_node_bufs, cp_ctx);
+ wb_cache().transact_bufs(
+ ordinal(), parent_node.get() ? s_cast< IndexBtreeNode* >(parent_node.get())->m_idx_buf : nullptr,
+ s_cast< IndexBtreeNode* >(left_child_node.get())->m_idx_buf, new_node_bufs, freed_node_bufs, cp_ctx);
return btree_status_t::success;
}
@@ -206,7 +237,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}
btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) {
- BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->node_id());
+ BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string());
// Get the last key in the node
auto const last_parent_key = parent_node->get_last_key< K >();
@@ -216,6 +247,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
parent_node->node_id());
return btree_status_t::not_found;
}
+ BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(),
+ last_parent_key.to_string());
// Get the first child node and its link info
BtreeLinkInfo child_info;
@@ -239,15 +272,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
auto cur_parent = parent_node;
BtreeNodeList new_parent_nodes;
do {
- if (child_node->has_valid_edge()) {
- BT_DBG_ASSERT(!is_parent_edge_node,
+ if (child_node->has_valid_edge() ||
+ (child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) {
+ BT_DBG_ASSERT(is_parent_edge_node,
"Child node={} is an edge node but parent_node={} is not an edge node",
- child_node->node_id(), parent_node->node_id());
+ child_node->node_id(), cur_parent->node_id());
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
break;
}
auto const child_last_key = child_node->get_last_key< K >();
+ BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(),
+ child_node->to_string(), child_last_key.to_string());
+
if (child_last_key.compare(last_parent_key) > 0) {
// We have reached the last key, we can stop now
break;
@@ -275,13 +312,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
cur_parent->insert(cur_parent->total_entries(), child_last_key,
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
+ BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string());
+
// Move to the next child node
+ this->unlock_node(child_node, locktype_t::READ);
auto const next_node_id = child_node->next_bnode();
if (next_node_id == empty_bnodeid) {
- BT_LOG_ASSERT(
- false,
- "Child node={} next_node_id is empty, while its not a edge node, parent_node={} repair is partial",
- child_node->node_id(), parent_node->node_id());
+ BT_LOG_ASSERT(false,
+ "Child node={} next_node_id is empty, while its not a edge node, parent_node={} "
+ "repair is partial",
+ child_node->node_id(), parent_node->node_id());
ret = btree_status_t::not_found;
break;
}
@@ -293,18 +333,19 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
break;
}
} while (true);
+ this->unlock_node(child_node, locktype_t::READ);
if (ret == btree_status_t::success) {
ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx);
}
if (ret != btree_status_t::success) {
- BT_LOG(DEBUG, "An error occurred status={} during repair of parent_node={}, aborting the repair",
+ BT_LOG(ERROR, "An error occurred status={} during repair of parent_node={}, aborting the repair",
enum_name(ret), parent_node->node_id());
std::memcpy(parent_node->m_phys_node_buf, tmp_buffer, this->m_bt_cfg.node_size());
}
- delete tmp_buffer;
+ delete[] tmp_buffer;
return ret;
}
};
diff --git a/src/include/homestore/index/wb_cache_base.hpp b/src/include/homestore/index/wb_cache_base.hpp
index d576c2b6b..4624f9444 100644
--- a/src/include/homestore/index/wb_cache_base.hpp
+++ b/src/include/homestore/index/wb_cache_base.hpp
@@ -64,6 +64,7 @@ class IndexWBCacheBase {
/// @param cur_buf
/// @return
// virtual IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* context) const = 0;
+ virtual void recover(sisl::byte_view sb) = 0;
};
} // namespace homestore
diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp
index cfd5bd5f7..fb776eb6b 100644
--- a/src/include/homestore/index_service.hpp
+++ b/src/include/homestore/index_service.hpp
@@ -47,6 +47,7 @@ class IndexService {
std::shared_ptr< VirtualDev > m_vdev;
std::pair< meta_blk*, sisl::byte_view > m_wbcache_sb{
std::pair< meta_blk*, sisl::byte_view >{nullptr, sisl::byte_view{}}};
+ std::vector< std::pair< meta_blk*, sisl::byte_view > > m_itable_sbs;
std::unique_ptr< sisl::IDReserver > m_ordinal_reserver;
mutable std::mutex m_index_map_mtx;
@@ -80,12 +81,8 @@ class IndexService {
uint64_t used_size() const;
uint32_t node_size() const;
void repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf);
- void repair_index_root(uint32_t ordinal, IndexBufferPtr const& root_buf);
IndexWBCacheBase& wb_cache() { return *m_wb_cache; }
-
-private:
- void itable_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie);
};
extern IndexService& index_service();
diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp
index c7d18e0b1..4a4c7fd18 100644
--- a/src/lib/blkalloc/append_blk_allocator.cpp
+++ b/src/lib/blkalloc/append_blk_allocator.cpp
@@ -68,12 +68,12 @@ BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { return alloc(1
//
BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) {
if (available_blks() < nblks) {
- //COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1);
+ // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1);
LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks());
return BlkAllocStatus::SPACE_FULL;
} else if (nblks > max_blks_per_blkid()) {
// consumer(vdev) already handles this case.
- //COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1);
+ // COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1);
LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid());
return BlkAllocStatus::FAILED;
}
@@ -81,7 +81,7 @@ BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hint
// Push 1 blk to the vector which has all the requested nblks;
out_bid = BlkId{m_last_append_offset.fetch_add(nblks), nblks, m_chunk_id};
- //COUNTER_INCREMENT(m_metrics, num_alloc, 1);
+ // COUNTER_INCREMENT(m_metrics, num_alloc, 1);
return BlkAllocStatus::SUCCESS;
}
diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h
index ed256d7f1..384a4936b 100644
--- a/src/lib/blkalloc/append_blk_allocator.h
+++ b/src/lib/blkalloc/append_blk_allocator.h
@@ -109,7 +109,7 @@ class AppendBlkAllocator : public BlkAllocator {
std::string to_string() const override;
void cp_flush(CP* cp) override;
-
+ void recovery_completed() override {}
nlohmann::json get_status(int log_level) const override;
private:
diff --git a/src/lib/blkalloc/bitmap_blk_allocator.h b/src/lib/blkalloc/bitmap_blk_allocator.h
index 03ba713d8..381767bef 100644
--- a/src/lib/blkalloc/bitmap_blk_allocator.h
+++ b/src/lib/blkalloc/bitmap_blk_allocator.h
@@ -76,6 +76,7 @@ class BitmapBlkAllocator : public BlkAllocator {
bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const override;
void cp_flush(CP* cp) override;
+ void recovery_completed() override {}
blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; }
blk_num_t get_blks_per_portion() const { return m_blks_per_portion; }
diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h
index c95935c77..3ba0ecf82 100644
--- a/src/lib/blkalloc/blk_allocator.h
+++ b/src/lib/blkalloc/blk_allocator.h
@@ -160,6 +160,7 @@ class BlkAllocator {
virtual blk_num_t get_used_blks() const = 0;
virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0;
virtual bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const = 0;
+ virtual void recovery_completed() = 0;
virtual std::string to_string() const = 0;
virtual void cp_flush(CP* cp) = 0;
diff --git a/src/lib/blkalloc/fixed_blk_allocator.cpp b/src/lib/blkalloc/fixed_blk_allocator.cpp
index 1e2af3257..9df3a07ca 100644
--- a/src/lib/blkalloc/fixed_blk_allocator.cpp
+++ b/src/lib/blkalloc/fixed_blk_allocator.cpp
@@ -54,33 +54,23 @@ blk_num_t FixedBlkAllocator::init_portion(BlkAllocPortion& portion, blk_num_t st
bool FixedBlkAllocator::is_blk_alloced(BlkId const& b, bool use_lock) const { return true; }
BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_alloc_hints const&, BlkId& out_blkid) {
- if (m_state == state_t::RECOVERING) {
- // Possibly first few attempts to allocate; under lock, remove all the blks which are marked to be removed from
- // the free list
- std::lock_guard lg(m_mark_blk_mtx);
- if (!m_marked_blks.empty()) {
- auto const count = available_blks();
- for (uint64_t i{0}; ((i < count) && !m_marked_blks.empty()); ++i) {
- blk_num_t blk_num;
- if (!m_free_blk_q.read(blk_num)) { break; }
-
- if (m_marked_blks.find(blk_num) != m_marked_blks.end()) {
- m_marked_blks.erase(blk_num); // This blk needs to be skipped
- } else {
- m_free_blk_q.write(blk_num); // This blk is not marked, put it back at the end of queue
- }
- }
- HS_DBG_ASSERT(m_marked_blks.empty(), "All marked blks should have been removed from free list");
- }
- m_state = state_t::ACTIVE;
- }
-
#ifdef _PRERELEASE
if (iomgr_flip::instance()->test_flip("fixed_blkalloc_no_blks")) { return BlkAllocStatus::SPACE_FULL; }
#endif
+retry:
blk_num_t blk_num;
if (!m_free_blk_q.read(blk_num)) { return BlkAllocStatus::SPACE_FULL; }
+ if (m_state != state_t::ACTIVE) {
+ // We are not in active state, means we must be recovering. During recovery state, if any of the blks which are
+ // committed, we shouldn't be allocating those blocks. So remove from free blk q and retry another blk
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if ((m_state == state_t::RECOVERING) && (m_reserved_blks.find(blk_num) != m_reserved_blks.end())) {
+ m_reserved_blks.erase(blk_num); // This blk can be removed from reserved, because its no longer free (since
+ // we also removed from free_blk q)
+ goto retry;
+ }
+ }
out_blkid = BlkId{blk_num, 1, m_chunk_id};
return BlkAllocStatus::SUCCESS;
}
@@ -88,18 +78,42 @@ BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_
BlkAllocStatus FixedBlkAllocator::alloc_contiguous(BlkId& out_blkid) { return alloc(1, {}, out_blkid); }
BlkAllocStatus FixedBlkAllocator::reserve_on_cache(BlkId const& b) {
- std::lock_guard lg(m_mark_blk_mtx);
- HS_DBG_ASSERT(m_state == state_t::RECOVERING, "reserve_on_cache called on non-recovery path");
- m_marked_blks.insert(b.blk_num());
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if (m_state == state_t::RECOVERING) { m_reserved_blks.insert(b.blk_num()); }
return BlkAllocStatus::SUCCESS;
}
+void FixedBlkAllocator::recovery_completed() {
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if (!m_reserved_blks.empty()) {
+ auto const count = available_blks();
+ for (uint64_t i{0}; ((i < count) && !m_reserved_blks.empty()); ++i) {
+ blk_num_t blk_num;
+ if (!m_free_blk_q.read(blk_num)) { break; }
+
+ if (m_reserved_blks.find(blk_num) == m_reserved_blks.end()) {
+ m_free_blk_q.write(blk_num); // This blk is not marked, put it back at the end of queue
+ }
+ }
+ // We no longer need any reserved blks, since all of them are now marked as allocated
+ m_reserved_blks.clear();
+ }
+ m_state = state_t::ACTIVE;
+}
+
void FixedBlkAllocator::free(BlkId const& b) {
HS_DBG_ASSERT_EQ(b.blk_count(), 1, "Multiple blk free for FixedBlkAllocator? allocated by different allocator?");
const auto pushed = m_free_blk_q.write(b.blk_num());
HS_DBG_ASSERT_EQ(pushed, true, "Expected to be able to push the blk on fixed capacity Q");
+ if (m_state != state_t::ACTIVE) {
+ std::lock_guard lg(m_reserve_blk_mtx);
+ if (m_state == state_t::RECOVERING) {
+ // If we are in recovering state and freeing blk would removed it from being reserved as well.
+ m_reserved_blks.erase(b.blk_num());
+ }
+ }
if (is_persistent()) { free_on_disk(b); }
}
diff --git a/src/lib/blkalloc/fixed_blk_allocator.h b/src/lib/blkalloc/fixed_blk_allocator.h
index adfe7b7b4..fa28681f2 100644
--- a/src/lib/blkalloc/fixed_blk_allocator.h
+++ b/src/lib/blkalloc/fixed_blk_allocator.h
@@ -36,6 +36,7 @@ class FixedBlkAllocator : public BitmapBlkAllocator {
BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override;
BlkAllocStatus reserve_on_cache(BlkId const& b) override;
void free(BlkId const& b) override;
+ void recovery_completed() override;
blk_num_t available_blks() const override;
blk_num_t get_used_blks() const override;
@@ -50,8 +51,8 @@ class FixedBlkAllocator : public BitmapBlkAllocator {
enum class state_t : uint8_t { RECOVERING, ACTIVE };
state_t m_state{state_t::RECOVERING};
- std::unordered_set< blk_num_t > m_marked_blks; // Keep track of all blks which are marked as allocated
- std::mutex m_mark_blk_mtx; // Mutex used while removing marked_blks from blk_q
+ std::unordered_set< blk_num_t > m_reserved_blks; // Keep track of all blks which are reserved as allocated
+ std::mutex m_reserve_blk_mtx; // Mutex used while removing marked_blks from blk_q
folly::MPMCQueue< blk_num_t > m_free_blk_q;
};
} // namespace homestore
diff --git a/src/lib/common/crash_simulator.hpp b/src/lib/common/crash_simulator.hpp
index f1c0b3024..98c22fe17 100644
--- a/src/lib/common/crash_simulator.hpp
+++ b/src/lib/common/crash_simulator.hpp
@@ -9,7 +9,7 @@ namespace homestore {
class CrashSimulator {
public:
- CrashSimulator(std::function< void(void) > cb = nullptr) : m_restart_cb{cb} {}
+ CrashSimulator(std::function< void(void) > cb = nullptr) : m_restart_cb{std::move(cb)} {}
~CrashSimulator() = default;
void crash() {
@@ -17,14 +17,17 @@ class CrashSimulator {
m_crashed.update([](auto* s) { *s = true; });
// We can restart on a new thread to allow other operations to continue
- std::thread t([this]() { m_restart_cb(); });
+ std::thread t([cb = std::move(m_restart_cb)]() {
+ // Restart could destroy this pointer, so we are storing in local variable and then calling.
+ cb();
+ });
t.detach();
} else {
raise(SIGKILL);
}
}
- bool is_crashed() const { return ((m_restart_cb != nullptr) && *(m_crashed.access().get())); }
+ bool is_crashed() const { return *(m_crashed.access().get()); }
bool crash_if_flip_set(const std::string& flip_name) {
if (iomgr_flip::instance()->test_flip(flip_name)) {
diff --git a/src/lib/device/device.h b/src/lib/device/device.h
index 16442e82f..beefdfc7f 100644
--- a/src/lib/device/device.h
+++ b/src/lib/device/device.h
@@ -151,6 +151,7 @@ class DeviceManager {
bool is_first_time_boot() const { return m_first_time_boot; }
void format_devices();
+ void commit_formatting();
void load_devices();
void close_devices();
bool is_boot_in_degraded_mode() const { return m_boot_in_degraded_mode; }
diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp
index 5d16fc601..ea8461c67 100644
--- a/src/lib/device/device_manager.cpp
+++ b/src/lib/device/device_manager.cpp
@@ -116,6 +116,7 @@ void DeviceManager::format_devices() {
first_block* fblk = r_cast< first_block* >(buf);
fblk->magic = first_block::HOMESTORE_MAGIC;
fblk->checksum = 0; // Computed while writing the first block
+ fblk->formatting_done = 0x0; // Formatting is not done yet, until homestore is completely started
fblk->hdr = m_first_blk_hdr; // Entire header is copied as is
auto pdev_id = populate_pdev_info(dinfo, attr, m_first_blk_hdr.system_uuid, fblk->this_pdev_hdr);
fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size);
@@ -176,6 +177,27 @@ void DeviceManager::load_devices() {
load_vdevs();
}
+void DeviceManager::commit_formatting() {
+ auto buf = hs_utils::iobuf_alloc(hs_super_blk::first_block_size(), sisl::buftag::superblk, 512);
+ for (auto& pdev : m_all_pdevs) {
+ if (!pdev) { continue; }
+
+ auto err = pdev->read_super_block(buf, hs_super_blk::first_block_size(), hs_super_blk::first_block_offset());
+ if (err) {
+ LOGERROR("Failed to read first block from device={}, error={}", pdev->get_devname(), err.message());
+ continue;
+ }
+
+ first_block* fblk = r_cast< first_block* >(buf);
+ fblk->formatting_done = 0x1;
+ fblk->checksum = crc32_ieee(init_crc32, uintptr_cast(fblk), first_block::s_atomic_fb_size);
+
+ pdev->write_super_block(buf, hs_super_blk::first_block_size(), hs_super_blk::first_block_offset());
+ }
+ hs_utils::iobuf_free(buf, sisl::buftag::superblk);
+ LOGINFO("HomeStore formatting is committed on all physical devices");
+}
+
void DeviceManager::close_devices() {
for (auto& pdev : m_all_pdevs) {
if (pdev) { pdev->close_device(); }
diff --git a/src/lib/device/hs_super_blk.h b/src/lib/device/hs_super_blk.h
index 460894b35..9f909c5ea 100644
--- a/src/lib/device/hs_super_blk.h
+++ b/src/lib/device/hs_super_blk.h
@@ -131,8 +131,10 @@ struct first_block {
static constexpr uint32_t HOMESTORE_MAGIC{0xCEEDDEEB}; // Magic written as first bytes on each device
public:
- uint64_t magic{0}; // Header magic expected to be at the top of block
- uint32_t checksum{0}; // Checksum of the entire first block (excluding this field)
+ uint64_t magic{0}; // Header magic expected to be at the top of block
+ uint32_t checksum{0}; // Checksum of the entire first block (excluding this field)
+ uint32_t formatting_done : 1 {0}; // Has formatting completed yet
+ uint32_t reserved : 31 {0};
first_block_header hdr; // Information about the entire system
pdev_info_header this_pdev_hdr; // Information about the current pdev
@@ -141,7 +143,8 @@ struct first_block {
bool is_valid() const {
return ((magic == HOMESTORE_MAGIC) &&
- (std::string(hdr.product_name) == std::string(first_block_header::PRODUCT_NAME)));
+ (std::string(hdr.product_name) == std::string(first_block_header::PRODUCT_NAME) &&
+ (formatting_done != 0x0)));
}
std::string to_string() const {
@@ -203,8 +206,7 @@ class hs_super_blk {
return (dinfo.dev_size - 1) / min_chunk_size + 1;
}
static uint32_t min_chunk_size(HSDevType dtype) {
- uint64_t min_chunk_size =
- (dtype == HSDevType::Fast) ? MIN_CHUNK_SIZE_FAST_DEVICE : MIN_CHUNK_SIZE_DATA_DEVICE;
+ uint64_t min_chunk_size = (dtype == HSDevType::Fast) ? MIN_CHUNK_SIZE_FAST_DEVICE : MIN_CHUNK_SIZE_DATA_DEVICE;
#ifdef _PRERELEASE
auto chunk_size = iomgr_flip::instance()->get_test_flip< long >("set_minimum_chunk_size");
if (chunk_size) { min_chunk_size = chunk_size.get(); }
diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp
index ae4cf5d18..37b32804f 100644
--- a/src/lib/device/virtual_dev.cpp
+++ b/src/lib/device/virtual_dev.cpp
@@ -696,6 +696,13 @@ void VirtualDev::cp_flush(VDevCPContext* v_cp_ctx) {
// sync-ops during cp_flush, so return 100;
int VirtualDev::cp_progress_percent() { return 100; }
+void VirtualDev::recovery_completed() {
+ if (m_allocator_type != blk_allocator_type_t::append) {
+ m_chunk_selector->foreach_chunks(
+ [this](cshared< Chunk >& chunk) { chunk->blk_allocator_mutable()->recovery_completed(); });
+ }
+}
+
///////////////////////// VirtualDev Private Methods /////////////////////////////
uint64_t VirtualDev::to_dev_offset(BlkId const& b, Chunk** chunk) const {
*chunk = m_dmgr.get_chunk_mutable(b.chunk_num());
diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp
index 6901be0f4..c004e3c03 100644
--- a/src/lib/device/virtual_dev.hpp
+++ b/src/lib/device/virtual_dev.hpp
@@ -275,6 +275,8 @@ class VirtualDev {
std::unique_ptr< CPContext > create_cp_context(CP* cp);
+ void recovery_completed();
+
////////////////////////// Standard Getters ///////////////////////////////
virtual uint64_t available_blks() const;
virtual uint64_t size() const { return m_vdev_info.vdev_size; }
diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp
index 24211e24d..af2d521c5 100644
--- a/src/lib/homestore.cpp
+++ b/src/lib/homestore.cpp
@@ -264,9 +264,17 @@ void HomeStore::do_start() {
}
}
- m_cp_mgr->start_timer();
+ // If this is the first time boot, we need to commit the formatting so that it will not be considered as first time
+ // boot going forward on next reboot.
+ if (m_dev_mgr->is_first_time_boot()) {
+ // Take the first CP after we have initialized all subsystems and wait for it to complete.
+ m_cp_mgr->trigger_cp_flush(true /* force */).get();
+ m_dev_mgr->commit_formatting();
+ }
+ m_cp_mgr->start_timer();
m_resource_mgr->start(m_dev_mgr->total_capacity());
+
m_init_done = true;
}
diff --git a/src/lib/index/index_cp.cpp b/src/lib/index/index_cp.cpp
index 6c8d06395..955bd523f 100644
--- a/src/lib/index/index_cp.cpp
+++ b/src/lib/index/index_cp.cpp
@@ -51,7 +51,9 @@ void IndexCPContext::add_to_txn_journal(uint32_t index_ordinal, const IndexBuffe
rec->append(op_t::parent_inplace, parent_buf->blkid());
if (parent_buf->is_meta_buf()) { rec->is_parent_meta = 0x1; }
}
- if (left_child_buf) { rec->append(op_t::child_inplace, left_child_buf->blkid()); }
+ if (left_child_buf && (left_child_buf != parent_buf)) {
+ rec->append(op_t::child_inplace, left_child_buf->blkid());
+ }
for (auto const& buf : created_bufs) {
rec->append(op_t::child_new, buf->blkid());
}
@@ -79,7 +81,7 @@ std::optional< IndexBufferPtr > IndexCPContext::next_dirty() {
}
std::string IndexCPContext::to_string() {
- std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}", m_cp->id(),
+ std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}\n", m_cp->id(),
m_dirty_buf_count.get(), m_dirty_buf_list.size())};
// Mapping from a node to all its parents in the graph.
@@ -106,6 +108,42 @@ std::string IndexCPContext::to_string() {
return str;
}
+void IndexCPContext::to_string_dot(const std::string& filename) {
+ std::ofstream file(filename);
+ if (!file.is_open()) { throw std::runtime_error("Failed to open file: " + filename); }
+
+ file << "digraph G {\n";
+
+ // Mapping from a node to all its parents in the graph.
+ std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents;
+
+ m_dirty_buf_list.foreach_entry([&parents](IndexBufferPtr buf) {
+ // Add this buf to his children.
+ parents[buf->m_up_buffer.get()].emplace_back(buf.get());
+ });
+ m_dirty_buf_list.foreach_entry([&file, &parents, this](IndexBufferPtr buf) {
+ std::vector< std::string > colors = {"lightgreen", "lightcoral", "lightyellow"};
+ auto sbuf = BtreeNode::to_string_buf(buf->raw_buffer());
+ auto pos = sbuf.find("LEAF");
+ if (pos != std::string::npos) {
+            sbuf.insert(pos + 4, "<br/>");
+ } else {
+ pos = sbuf.find("INTERIOR");
+            if (pos != std::string::npos) { sbuf.insert(pos + 8, "<br/>"); }
+ }
+ file << fmt::format(
+            "\"{}\" [shape={}, label=< {}<br/>{} >, fillcolor=\"{}\", style=\"filled\", fontname=\"bold\"];\n",
+ r_cast< void* >(buf.get()), m_cp->id() == buf->m_created_cp_id ? "ellipse" : "box", buf->to_string_dot(),
+ sbuf, colors[s_cast< int >(buf->state())]);
+ for (const auto& p : parents[buf.get()]) {
+ file << fmt::format("\"{}\" -> \"{}\";\n", r_cast< void* >(p), r_cast< void* >(buf.get()));
+ }
+ });
+ file << "}\n";
+
+ file.close();
+}
+
std::string IndexCPContext::to_string_with_dags() {
struct DagNode {
IndexBufferPtr buf;
@@ -189,13 +227,12 @@ std::map< BlkId, IndexBufferPtr > IndexCPContext::recover(sisl::byte_view sb) {
void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId, IndexBufferPtr >& buf_map) {
auto cpg = cp_mgr().cp_guard();
- auto const rec_to_buf = [&buf_map, &cpg](txn_record const* rec, uint32_t idx,
+ auto const rec_to_buf = [&buf_map, &cpg](txn_record const* rec, bool is_meta, BlkId const& bid,
IndexBufferPtr const& up_buf) -> IndexBufferPtr {
- BlkId const bid = rec->blk_id(idx);
IndexBufferPtr buf;
auto it = buf_map.find(bid);
if (it == buf_map.end()) {
- if (rec->is_parent_meta) {
+ if (is_meta) {
superblk< index_table_sb > tmp_sb;
buf = std::make_shared< MetaIndexBuffer >(tmp_sb);
} else {
@@ -213,27 +250,86 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId,
if (up_buf) {
DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer");
- up_buf->m_wait_for_down_buffers.increment(1);
- buf->m_up_buffer = up_buf;
+ auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf;
+
+#ifndef NDEBUG
+ // if (!is_sibling_link || (buf->m_up_buffer == real_up_buf)) { return buf;}
+ // Already linked with same buf or its not a sibling link to override
+ bool found{false};
+ for (auto const& dbuf : real_up_buf->m_down_buffers) {
+ if (dbuf.lock() == buf) {
+ found = true;
+ break;
+ }
+ }
+ if (found) { return buf; }
+ real_up_buf->m_down_buffers.emplace_back(buf);
+#endif
+
+ if (buf->m_up_buffer != real_up_buf) {
+ real_up_buf->m_wait_for_down_buffers.increment(1);
+ buf->m_up_buffer = real_up_buf;
+ }
}
return buf;
};
uint32_t cur_idx = 0;
- IndexBufferPtr parent_buf;
- if (rec->has_inplace_parent) { parent_buf = rec_to_buf(rec, cur_idx++, nullptr); }
+ IndexBufferPtr parent_buf{nullptr};
+ if (rec->has_inplace_parent) { parent_buf = rec_to_buf(rec, rec->is_parent_meta, rec->blk_id(cur_idx++), nullptr); }
- IndexBufferPtr inplace_child_buf;
- if (rec->has_inplace_child) { inplace_child_buf = rec_to_buf(rec, cur_idx++, parent_buf); }
+ IndexBufferPtr inplace_child_buf{nullptr};
+ if (rec->has_inplace_child) {
+ inplace_child_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++), parent_buf);
+ }
for (uint8_t idx{0}; idx < rec->num_new_ids; ++idx) {
- auto new_buf = rec_to_buf(rec, cur_idx++, inplace_child_buf);
+ auto new_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++),
+ inplace_child_buf ? inplace_child_buf : parent_buf);
new_buf->m_created_cp_id = cpg->id();
}
for (uint8_t idx{0}; idx < rec->num_freed_ids; ++idx) {
- auto freed_buf = rec_to_buf(rec, cur_idx++, inplace_child_buf);
+ auto freed_buf = rec_to_buf(rec, false /* is_meta */, rec->blk_id(cur_idx++),
+ inplace_child_buf ? inplace_child_buf : parent_buf);
freed_buf->m_node_freed = true;
}
}
+
+void IndexCPContext::txn_journal::log_records() const { LOGINFO("{}", to_string()); }
+
+std::string IndexCPContext::txn_journal::to_string() const {
+ std::string str = fmt::format("cp_id={}, num_txns={}, size={}", cp_id, num_txns, size);
+ uint8_t const* cur_ptr = r_cast< uint8_t const* >(this) + sizeof(txn_journal);
+ for (uint32_t t{0}; t < num_txns; ++t) {
+ txn_record const* rec = r_cast< txn_record const* >(cur_ptr);
+ fmt::format_to(std::back_inserter(str), "\n {}: {}", t, rec->to_string());
+ cur_ptr += rec->size();
+ }
+ return str;
+}
+
+std::string IndexCPContext::txn_record::to_string() const {
+ auto add_to_string = [this](std::string& str, uint8_t& idx, uint8_t id_count) {
+ if (id_count == 0) {
+ fmt::format_to(std::back_inserter(str), "empty]");
+ } else {
+ for (uint8_t i{0}; i < id_count; ++i, ++idx) {
+ fmt::format_to(std::back_inserter(str), "[chunk={}, blk={}],", ids[idx].second, ids[idx].first);
+ }
+ fmt::format_to(std::back_inserter(str), "]");
+ }
+ };
+
+ std::string str = fmt::format("ordinal={}, parent=[{}], in_place_child=[{}]", index_ordinal, parent_id_string(),
+ child_id_string(), num_new_ids, num_freed_ids);
+
+ uint8_t idx = (has_inplace_parent == 0x1) ? 1 : 0 + (has_inplace_child == 0x1) ? 1 : 0;
+ fmt::format_to(std::back_inserter(str), ", new_ids=[");
+ add_to_string(str, idx, num_new_ids);
+
+ fmt::format_to(std::back_inserter(str), ", freed_ids=[");
+ add_to_string(str, idx, num_freed_ids);
+ return str;
+}
} // namespace homestore
diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp
index 044d935dc..1b8a2a2b0 100644
--- a/src/lib/index/index_cp.hpp
+++ b/src/lib/index/index_cp.hpp
@@ -34,52 +34,76 @@ struct IndexCPContext : public VDevCPContext {
using compact_blkid_t = std::pair< blk_num_t, chunk_num_t >;
enum class op_t : uint8_t { child_new, child_freed, parent_inplace, child_inplace };
struct txn_record {
+ uint8_t has_inplace_parent : 1; // Do we have parent_id in the list of ids. It will be first
+ uint8_t has_inplace_child : 1; // Do we have child_id in the list of ids. It will be second
+ uint8_t is_parent_meta : 1; // Is the parent buffer a meta buffer
+ uint8_t reserved1 : 5;
uint8_t num_new_ids;
uint8_t num_freed_ids;
- uint8_t has_inplace_parent : 1;
- uint8_t has_inplace_child : 1;
- uint8_t is_parent_meta : 1; // Is the parent buffer a meta buffer
- uint8_t reserved1 : 5;
uint8_t reserved{0};
uint32_t index_ordinal;
compact_blkid_t ids[1]; // C++ std probhits 0 size array
txn_record(uint32_t ordinal) :
- num_new_ids{0},
- num_freed_ids{0},
has_inplace_parent{0x0},
has_inplace_child{0x0},
is_parent_meta{0x0},
+ num_new_ids{0},
+ num_freed_ids{0},
index_ordinal{ordinal} {}
uint32_t total_ids() const {
- return (num_new_ids + num_freed_ids + has_inplace_parent ? 1 : 0 + has_inplace_child ? 1 : 0);
+ return (num_new_ids + num_freed_ids + ((has_inplace_parent == 0x1) ? 1 : 0) +
+ ((has_inplace_child == 0x1) ? 1 : 0));
}
- uint32_t size() const { return sizeof(txn_record) - (total_ids() - 1) * sizeof(compact_blkid_t); }
+ uint32_t size() const { return sizeof(txn_record) + (total_ids() - 1) * sizeof(compact_blkid_t); }
static uint32_t size_for_num_ids(uint8_t n) { return sizeof(txn_record) + (n - 1) * sizeof(compact_blkid_t); }
+
+ uint32_t next_slot() const {
+ return ((has_inplace_parent == 0x1) ? 1 : 0) + ((has_inplace_child == 0x1) ? 1 : 0) + num_new_ids +
+ num_freed_ids;
+ }
+
void append(op_t op, BlkId const& blk) {
+ auto const compact_blk = std::make_pair(blk.blk_num(), blk.chunk_num());
+ auto const slot = next_slot();
if (op == op_t::parent_inplace) {
DEBUG_ASSERT(has_inplace_parent == 0x0, "Duplicate inplace parent in same txn record");
+ DEBUG_ASSERT((has_inplace_child == 0x0) && (num_new_ids == 0) && (num_freed_ids == 0),
+ "Ordering of append is not correct");
has_inplace_parent = 0x1;
} else if (op == op_t::child_inplace) {
DEBUG_ASSERT(has_inplace_child == 0x0, "Duplicate inplace child in same txn record");
has_inplace_child = 0x1;
} else if (op == op_t::child_new) {
DEBUG_ASSERT_LT(num_new_ids, 0xff, "Too many new ids in txn record");
- ids[num_new_ids++] = std::make_pair(blk.blk_num(), blk.chunk_num());
+ ++num_new_ids;
} else if (op == op_t::child_freed) {
DEBUG_ASSERT_LT(num_freed_ids, 0xff, "Too many freed ids in txn record");
- ids[num_freed_ids++] = std::make_pair(blk.blk_num(), blk.chunk_num());
+ ++num_freed_ids;
} else {
DEBUG_ASSERT(false, "Invalid op type");
}
+ ids[slot] = compact_blk;
}
BlkId blk_id(uint8_t idx) const {
DEBUG_ASSERT_LT(idx, total_ids(), "Index out of bounds");
return BlkId{ids[idx].first, (blk_count_t)1u, ids[idx].second};
}
+
+ std::string parent_id_string() const {
+ return (has_inplace_parent == 0x1) ? fmt::format("chunk={}, blk={}", ids[0].second, ids[0].first) : "empty";
+ }
+
+ std::string child_id_string() const {
+ auto const idx = (has_inplace_parent == 0x1) ? 1 : 0;
+ return (has_inplace_child == 0x1) ? fmt::format("chunk={}, blk={}", ids[idx].second, ids[idx].first)
+ : "empty";
+ }
+
+ std::string to_string() const;
};
struct txn_journal {
@@ -103,6 +127,9 @@ struct IndexCPContext : public VDevCPContext {
++num_txns;
return append_guard(this, ordinal);
}
+
+ std::string to_string() const;
+ void log_records() const;
};
#pragma pack()
@@ -135,6 +162,7 @@ struct IndexCPContext : public VDevCPContext {
std::optional< IndexBufferPtr > next_dirty();
std::string to_string();
std::string to_string_with_dags();
+ void to_string_dot(const std::string& filename);
private:
void check_cycle();
diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp
index 453b802be..cc199bbd5 100644
--- a/src/lib/index/index_service.cpp
+++ b/src/lib/index/index_service.cpp
@@ -33,7 +33,7 @@ IndexService::IndexService(std::unique_ptr< IndexServiceCallbacks > cbs) : m_svc
meta_service().register_handler(
"index",
[this](meta_blk* mblk, sisl::byte_view buf, size_t size) {
- itable_meta_blk_found(std::move(buf), voidptr_cast(mblk));
+ m_itable_sbs.emplace_back(std::pair{mblk, std::move(buf)});
},
nullptr);
@@ -67,18 +67,26 @@ shared< VirtualDev > IndexService::open_vdev(const vdev_info& vinfo, bool load_e
uint32_t IndexService::reserve_ordinal() { return m_ordinal_reserver->reserve(); }
-void IndexService::itable_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie) {
- // We have found an index table superblock. Notify the callback which should convert the superblock into actual
- // IndexTable instance
- superblk< index_table_sb > sb;
- sb.load(buf, meta_cookie);
- add_index_table(m_svc_cbs->on_index_table_found(std::move(sb)));
-}
-
void IndexService::start() {
// Start Writeback cache
- m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, std::move(m_wbcache_sb), hs()->evictor(),
+ m_wb_cache = std::make_unique< IndexWBCache >(m_vdev, m_wbcache_sb, hs()->evictor(),
hs()->device_mgr()->atomic_page_size(HSDevType::Fast));
+
+ // Load any index tables which are to be loaded from meta blk
+ for (auto const& [meta_cookie, buf] : m_itable_sbs) {
+ superblk< index_table_sb > sb;
+ sb.load(buf, meta_cookie);
+ add_index_table(m_svc_cbs->on_index_table_found(std::move(sb)));
+ }
+
+ // Recover the writeback cache, which in-turns recovers any index table nodes
+ m_wb_cache->recover(m_wbcache_sb.second);
+
+ // Notify each table that we have completed recovery
+ std::unique_lock lg(m_index_map_mtx);
+ for (const auto& [_, tbl] : m_index_map) {
+ tbl->recovery_completed();
+ }
}
void IndexService::stop() { m_wb_cache.reset(); }
@@ -109,12 +117,11 @@ std::shared_ptr< IndexTableBase > IndexService::get_index_table(uint32_t ordinal
void IndexService::repair_index_node(uint32_t ordinal, IndexBufferPtr const& node_buf) {
auto tbl = get_index_table(ordinal);
- if (tbl) { tbl->repair_node(node_buf); }
-}
-
-void IndexService::repair_index_root(uint32_t ordinal, IndexBufferPtr const& root_buf) {
- auto tbl = get_index_table(ordinal);
- if (tbl) { tbl->repair_root(root_buf); }
+ if (tbl) {
+ tbl->repair_node(node_buf);
+ } else {
+ HS_DBG_ASSERT(false, "Index corresponding to ordinal={} has not been loaded yet, unexpected", ordinal);
+ }
}
uint32_t IndexService::node_size() const { return m_vdev->atomic_page_size(); }
@@ -144,11 +151,35 @@ std::string IndexBuffer::to_string() const {
voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()),
m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed);
} else {
- return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={} {} node=[{}] ",
+ // store m_down_buffers in a string
+ std::string down_bufs = "";
+#ifndef NDEBUG
+ for (auto const& down_buf : m_down_buffers) {
+ if (auto ptr = down_buf.lock()) {
+ fmt::format_to(std::back_inserter(down_bufs), "[{}]", voidptr_cast(ptr.get()));
+ }
+ }
+#endif
+
+ return fmt::format("Buf={} index={} state={} create/dirty_cp={}/{} down_wait#={}{} up={} node=[{}] down=[{}]",
voidptr_cast(const_cast< IndexBuffer* >(this)), m_index_ordinal, int_cast(state()),
- m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(), m_node_freed ? "Freed" : "",
- r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string());
+ m_created_cp_id, m_dirtied_cp_id, m_wait_for_down_buffers.get(),
+ m_node_freed ? " Freed" : "", voidptr_cast(const_cast< IndexBuffer* >(m_up_buffer.get())),
+ (m_bytes == nullptr) ? "not attached yet"
+ : r_cast< persistent_hdr_t const* >(m_bytes)->to_compact_string(),
+ down_bufs);
+ }
+}
+std::string IndexBuffer::to_string_dot() const {
+ auto str = fmt::format("IndexBuffer {} ", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this)));
+ if (m_bytes == nullptr) {
+ fmt::format_to(std::back_inserter(str), " node_buf=nullptr ");
+ } else {
+ fmt::format_to(std::back_inserter(str), " node_buf={} {} created/dirtied={}/{} {} down_wait#={}",
+ static_cast< void* >(m_bytes), m_is_meta_buf ? "[META]" : "", m_created_cp_id, m_dirtied_cp_id,
+ m_node_freed ? "FREED" : "", m_wait_for_down_buffers.get());
}
+ return str;
}
MetaIndexBuffer::MetaIndexBuffer(superblk< index_table_sb >& sb) : IndexBuffer{nullptr, BlkId{}}, m_sb{sb} {
diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp
index 6bbb6d809..6c0c722ae 100644
--- a/src/lib/index/wb_cache.cpp
+++ b/src/lib/index/wb_cache.cpp
@@ -54,7 +54,6 @@ IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair<
// We need to register the consumer first before recovery, so that recovery can use the cp_ctx created to add/track
// recovered new nodes.
cp_mgr().register_consumer(cp_consumer_t::INDEX_SVC, std::move(std::make_unique< IndexCPCallbacks >(this)));
- recover(std::move(sb.second));
}
void IndexWBCache::start_flush_threads() {
@@ -102,9 +101,11 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) {
idx_buf->m_dirtied_cp_id = cpg->id();
auto node = node_initializer(idx_buf);
- // Add the node to the cache
- bool done = m_cache.insert(node);
- HS_REL_ASSERT_EQ(done, true, "Unable to add alloc'd node to cache, low memory or duplicate inserts?");
+ if (!m_in_recovery) {
+ // Add the node to the cache. Skip if we are in recovery mode.
+ bool done = m_cache.insert(node);
+ HS_REL_ASSERT_EQ(done, true, "Unable to add alloc'd node to cache, low memory or duplicate inserts?");
+ }
// The entire index is updated in the commit path, so we alloc the blk and commit them right away
auto alloc_status = m_vdev->commit_blk(blkid);
@@ -115,9 +116,19 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) {
void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) {
// TODO upsert always returns false even if it succeeds.
- if (node != nullptr) { m_cache.upsert(node); }
- r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf);
- resource_mgr().inc_dirty_buf_size(m_node_size);
+ if (m_in_recovery) {
+ if (buf->is_meta_buf()) {
+ auto const& sb = r_cast< MetaIndexBuffer* >(buf.get())->m_sb;
+ meta_service().update_sub_sb(buf->m_bytes, sb.size(), sb.meta_blk());
+ } else {
+ m_vdev->sync_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid);
+ }
+ } else {
+ if (node != nullptr) { m_cache.upsert(node); }
+ LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), buf->to_string());
+ r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf);
+ resource_mgr().inc_dirty_buf_size(m_node_size);
+ }
}
void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) {
@@ -125,7 +136,7 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t
retry:
// Check if the blkid is already in cache, if not load and put it into the cache
- if (m_cache.get(blkid, node)) { return; }
+ if (!m_in_recovery && m_cache.get(blkid, node)) { return; }
// Read the buffer from virtual device
auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size());
@@ -135,10 +146,12 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t
node = node_initializer(idx_buf);
// Push the node into cache
- bool done = m_cache.insert(node);
- if (!done) {
- // There is a race between 2 concurrent reads from vdev and other party won the race. Re-read from cache
- goto retry;
+ if (!m_in_recovery) {
+ bool done = m_cache.insert(node);
+ if (!done) {
+ // There is a race between 2 concurrent reads from vdev and other party won the race. Re-read from cache
+ goto retry;
+ }
}
}
@@ -184,6 +197,7 @@ bool IndexWBCache::refresh_meta_buf(shared< MetaIndexBuffer >& meta_buf, CPConte
new_buf->m_dirtied_cp_id = cp_ctx->id();
write_buf(nullptr, new_buf, cp_ctx);
meta_buf = new_buf; // Replace the meta_buf with new buf
+ LOGTRACEMOD(wbcache, "meta buf {} is created in cp {}", meta_buf->to_string(), cp_ctx->id());
}
return true;
}
@@ -193,13 +207,16 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con
IndexBufferPtrList const& new_node_bufs, IndexBufferPtrList const& freed_node_bufs) {
// TODO: Need an API from flip to quickly check if flip is enabled, so this method doesn't check flip_enabled a
// bunch of times.
- if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) {
+ if (parent_buf && parent_buf->is_meta_buf()) {
+ // Split or merge happening on root
+ if (iomgr_flip::instance()->test_flip("crash_flush_on_meta")) {
+ parent_buf->set_crash_flag();
+ } else if (iomgr_flip::instance()->test_flip("crash_flush_on_root")) {
+ child_buf->set_crash_flag();
+ }
+ } else if ((new_node_bufs.size() == 1) && freed_node_bufs.empty()) {
// Its a split node situation
- if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_meta")) {
- if (parent_buf->is_meta_buf()) { parent_buf->set_crash_flag(); }
- } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_root")) {
- if (parent_buf->is_meta_buf()) { child_buf->set_crash_flag(); }
- } else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) {
+ if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_parent")) {
parent_buf->set_crash_flag();
} else if (iomgr_flip::instance()->test_flip("crash_flush_on_split_at_left_child")) {
child_buf->set_crash_flag();
@@ -208,11 +225,7 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con
}
} else if (!freed_node_bufs.empty() && (new_node_bufs.size() != freed_node_bufs.size())) {
// Its a merge nodes sitation
- if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_meta")) {
- if (parent_buf->is_meta_buf()) { parent_buf->set_crash_flag(); }
- } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_root")) {
- if (parent_buf->is_meta_buf()) { child_buf->set_crash_flag(); }
- } else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) {
+ if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_parent")) {
parent_buf->set_crash_flag();
} else if (iomgr_flip::instance()->test_flip("crash_flush_on_merge_at_left_child")) {
child_buf->set_crash_flag();
@@ -221,11 +234,7 @@ static void set_crash_flips(IndexBufferPtr const& parent_buf, IndexBufferPtr con
}
} else if (!freed_node_bufs.empty() && (new_node_bufs.size() == freed_node_bufs.size())) {
// Its a rebalance node situation
- if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_meta")) {
- if (parent_buf->is_meta_buf()) { parent_buf->set_crash_flag(); }
- } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_root")) {
- if (parent_buf->is_meta_buf()) { child_buf->set_crash_flag(); }
- } else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) {
+ if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_parent")) {
parent_buf->set_crash_flag();
} else if (iomgr_flip::instance()->test_flip("crash_flush_on_rebalance_at_left_child")) {
child_buf->set_crash_flag();
@@ -261,9 +270,28 @@ void IndexWBCache::transact_bufs(uint32_t index_ordinal, IndexBufferPtr const& p
}
}
- icp_ctx->add_to_txn_journal(index_ordinal, child_buf->m_up_buffer,
- new_node_bufs.size() ? new_node_bufs[0]->m_up_buffer : nullptr, new_node_bufs,
- freed_node_bufs);
+ if (new_node_bufs.empty() && freed_node_bufs.empty()) {
+ // This is an update for meta, root transaction.
+ if (child_buf->m_created_cp_id != -1) {
+ DEBUG_ASSERT_EQ(child_buf->m_created_cp_id, icp_ctx->id(),
+ "Root buffer is not created by current cp (for split root), its not expected");
+ }
+ icp_ctx->add_to_txn_journal(index_ordinal, parent_buf, nullptr, {child_buf}, {});
+ } else {
+ icp_ctx->add_to_txn_journal(index_ordinal, // Ordinal
+ child_buf->m_up_buffer, // real up buffer
+ new_node_bufs.empty() ? freed_node_bufs[0]->m_up_buffer
+ : new_node_bufs[0]->m_up_buffer, // real in place child
+ new_node_bufs, // new node bufs
+ freed_node_bufs // free_node_bufs
+ );
+ }
+#if 0
+ static int id = 0;
+ auto filename = "transact_bufs_"+std::to_string(id++)+ "_" +std::to_string(rand()%100)+".dot";
+ LOGINFO("Transact cp is in cp\n{} and storing in {}\n\n\n", icp_ctx->to_string(), filename);
+ icp_ctx->to_string_dot(filename);
+#endif
}
void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const& down_buf, bool is_sibling_link,
@@ -346,8 +374,10 @@ void IndexWBCache::link_buf(IndexBufferPtr const& up_buf, IndexBufferPtr const&
void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) {
BtreeNodePtr node;
- bool done = m_cache.remove(buf->m_blkid, node);
- HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?");
+ if (!m_in_recovery) {
+ bool done = m_cache.remove(buf->m_blkid, node);
+ HS_REL_ASSERT_EQ(done, true, "Race on cache removal of btree blkid?");
+ }
resource_mgr().inc_free_blk(m_node_size);
m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(cp_ctx));
@@ -356,7 +386,12 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) {
//////////////////// Recovery Related section /////////////////////////////////
void IndexWBCache::recover(sisl::byte_view sb) {
// If sb is empty, its possible a first time boot.
- if ((sb.bytes() == nullptr) || (sb.size() == 0)) { return; }
+ if ((sb.bytes() == nullptr) || (sb.size() == 0)) {
+ m_vdev->recovery_completed();
+ return;
+ }
+
+ m_in_recovery = true; // For the entirety of this call, we should mark it as being recovered.
// Recover the CP Context with the buf_map of all the buffers that were dirtied in the last cp with its
// relationship (up/down buf links) as it was by the cp that was flushing the buffers prior to unclean shutdown.
@@ -364,58 +399,84 @@ void IndexWBCache::recover(sisl::byte_view sb) {
auto icp_ctx = r_cast< IndexCPContext* >(cpg.context(cp_consumer_t::INDEX_SVC));
std::map< BlkId, IndexBufferPtr > bufs = icp_ctx->recover(std::move(sb));
- // With all the buffers recovered, we first make the decision of which blk to keep and which blk to free. This
- // is needed so that subsequent repair can do blk allocation and shouldn't incorrectly allocate the blks which
- // are going to be committed later.
- std::vector< IndexBufferPtr > new_bufs;
+ LOGINFOMOD(wbcache, "Detected unclean shutdown, prior cp={} had to flush {} nodes, recovering... ", icp_ctx->id(),
+ bufs.size());
+
+ // At this point, we have the DAG structure (up/down dependency graph), exactly the same as prior to crash, with one
+ // addition of all freed buffers also put in the DAG structure.
+ //
+ // We do repair/recovery as 2 passes. At a quick glance it may look like we don't need 2 passes over
+ // all the buffers, but it is essential.
+ //
+ // In the first pass, we look for any new bufs and any freed bufs and commit/free their corresponding node blkids.
+ // This has to be done before doing any repair, because repair can allocate blkids and we don't want to allocate
+ // the same blkid which could clash with the blkid next in the buf list.
+ //
+ // On the second pass, we only take the new nodes/bufs and then repair their up buffers, if needed.
+ std::vector< IndexBufferPtr > l0_bufs;
for (auto const& [_, buf] : bufs) {
- if (buf->m_node_freed) {
- // If the node was freed according txn records, we need to check if up_buf node was also written
- if (was_node_committed(buf->m_up_buffer)) {
- // Up buffer was written, so this buffer can be freed and thus can free the blk.
- m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx));
- }
- buf->m_up_buffer->m_wait_for_down_buffers.decrement();
- } else if (buf->m_created_cp_id == icp_ctx->id()) {
- if (was_node_committed(buf) && was_node_committed(buf->m_up_buffer)) {
- // The buffer was newly created on this cp, and its up_buffer is also written, we need to commit
- // this blk again, so that it will not be reallocated for other node during repair.
- m_vdev->commit_blk(buf->m_blkid);
- new_bufs.push_back(buf);
+ if (buf->m_node_freed || (buf->m_created_cp_id == icp_ctx->id())) {
+ if (was_node_committed(buf)) {
+ if (was_node_committed(buf->m_up_buffer)) {
+ if (buf->m_node_freed) {
+ // Up buffer was written, so this buffer can be freed and thus can free the blk.
+ m_vdev->free_blk(buf->m_blkid, s_cast< VDevCPContext* >(icp_ctx));
+ } else {
+ m_vdev->commit_blk(buf->m_blkid);
+ }
+ l0_bufs.push_back(buf);
+ } else {
+ buf->m_up_buffer->m_wait_for_down_buffers.decrement();
+ }
}
}
}
+ LOGINFOMOD(wbcache, "Index Recovery detected {} nodes out of {} as new/freed nodes to be recovered in prev cp={}",
+ l0_bufs.size(), bufs.size(), icp_ctx->id());
+
+ auto detailed_log = [this](std::map< BlkId, IndexBufferPtr > const& bufs,
+ std::vector< IndexBufferPtr > const& l0_bufs) {
+ // Logs to detect down_waits are set correctly for up buffers list of all recovered bufs
+ std::string log = fmt::format("\trecovered bufs (#of bufs = {})\n", bufs.size());
+ for (auto const& [_, buf] : bufs) {
+ fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string());
+ }
+
+ // list of new_bufs
+ fmt::format_to(std::back_inserter(log), "\n\tl0_bufs (#of bufs = {})\n", l0_bufs.size());
+ for (auto const& buf : l0_bufs) {
+ fmt::format_to(std::back_inserter(log), "{}\n", buf->to_string());
+ }
+ return log;
+ };
+ LOGTRACEMOD(wbcache, "All unclean bufs list\n{}", detailed_log(bufs, l0_bufs));
+
// Second iteration we start from the lowest levels (which are all new_bufs) and check if up_buffers need to be
// repaired. All L1 buffers are not needed to repair, because they are sibling nodes and so we pass false in
// do_repair flag.
- for (auto const& buf : new_bufs) {
- process_up_buf(buf->m_up_buffer, false /* do_repair */);
+ for (auto const& buf : l0_bufs) {
+ recover_buf(buf->m_up_buffer);
}
+ m_in_recovery = false;
+ m_vdev->recovery_completed();
}
-void IndexWBCache::process_up_buf(IndexBufferPtr const& buf, bool do_repair) {
- if (do_repair) {
- // If the buffer needs to be repaired, reset the dirtied_cp_id so once all down buffers are done, it does
- // repair
- buf->m_dirtied_cp_id = -1;
- }
-
+void IndexWBCache::recover_buf(IndexBufferPtr const& buf) {
if (!buf->m_wait_for_down_buffers.decrement_testz()) { return; }
- // One of the down buffer indicated that it has to repair our node, so we issue a repair
- if (buf->m_dirtied_cp_id == -1) { index_service().repair_index_node(buf->m_index_ordinal, buf); }
-
- // If there is an up buffer on next level, we need to process them and ask them to repair in case they were not
- // written as part of this CP.
- if (buf->m_up_buffer) {
- if (buf->m_up_buffer->is_meta_buf()) {
- // If the up_buf is meta buffer, then we found the new root, repair the current node accordingly
- index_service().repair_index_root(buf->m_index_ordinal, buf);
- } else {
- process_up_buf(buf->m_up_buffer, !was_node_committed(buf->m_up_buffer));
- }
+ // All down buffers have completed and confirmed that they are committed. If this buffer is not committed,
+ // then we need to repair this node/buffer. After that we keep going up to the next level to repair those
+ // buffers if needed
+ if (!was_node_committed(buf)) {
+ LOGDEBUGMOD(wbcache, "Index Recovery detected uncommitted up node [{}], repairing it", buf->to_string());
+ index_service().repair_index_node(buf->m_index_ordinal, buf);
+ } else {
+ LOGTRACEMOD(wbcache, "Index Recovery detected up node [{}] as committed no need to repair that",
+ buf->to_string());
}
+
+ if (buf->m_up_buffer) { recover_buf(buf->m_up_buffer); }
}
bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) {
@@ -442,12 +503,25 @@ bool IndexWBCache::was_node_committed(IndexBufferPtr const& buf) {
//////////////////// CP Related API section /////////////////////////////////
folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
- LOGTRACEMOD(wbcache, "cp_ctx {}", cp_ctx->to_string());
+ LOGTRACEMOD(wbcache, "Starting Index CP Flush with cp context={}", cp_ctx->to_string_with_dags());
if (!cp_ctx->any_dirty_buffers()) {
- CP_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Btree does not have any dirty buffers to flush");
+ if (cp_ctx->id() == 0) {
+ // For the first CP, we need to flush the journal buffer to the meta blk
+ LOGINFO("First time boot cp, we shall flush the vdev to ensure all cp information is created");
+ m_vdev->cp_flush(cp_ctx);
+ } else {
+ CP_PERIODIC_LOG(DEBUG, cp_ctx->id(), "Btree does not have any dirty buffers to flush");
+ }
return folly::makeFuture< bool >(true); // nothing to flush
}
+#ifdef _PRERELEASE
+ if (hs()->crash_simulator().is_crashed()) {
+ LOGINFOMOD(wbcache, "crash simulation is ongoing, so skip the cp flush");
+ return folly::makeFuture< bool >(true);
+ }
+#endif
+
// First thing is to flush the new_blks created as part of the CP.
auto const& journal_buf = cp_ctx->journal_buf();
if (journal_buf.size() != 0) {
@@ -475,16 +549,23 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) {
}
void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const& buf, bool part_of_batch) {
- LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string());
- buf->set_state(index_buf_state_t::FLUSHING);
-
#ifdef _PRERELEASE
if (buf->m_crash_flag_on) {
- LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}", buf->to_string());
+ std::string filename = "crash_buf_" + std::to_string(cp_ctx->id()) + ".dot";
+ LOGINFOMOD(wbcache, "Simulating crash while writing buffer {}, stored in file {}", buf->to_string(), filename);
+ cp_ctx->to_string_dot(filename);
hs()->crash_simulator().crash();
+ cp_ctx->complete(true);
+ return;
+ } else if (hs()->crash_simulator().is_crashed()) {
+ LOGINFOMOD(wbcache, "crash simulation is ongoing, aid simulation by not flushing");
+ return;
}
#endif
+ LOGTRACEMOD(wbcache, "cp={} {}", cp_ctx->id(), buf->to_string());
+ buf->set_state(index_buf_state_t::FLUSHING);
+
if (buf->is_meta_buf()) {
LOGTRACEMOD(wbcache, "flushing cp {} meta buf {} possibly because of root split", cp_ctx->id(),
buf->to_string());
@@ -509,6 +590,13 @@ void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr const
}
void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr const& buf) {
+#ifdef _PRERELEASE
+ if (hs()->crash_simulator().is_crashed()) {
+ LOGINFOMOD(wbcache, "Crash simulation is ongoing, ignore all process_write_completion");
+ return;
+ }
+#endif
+
LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string());
resource_mgr().dec_dirty_buf_size(m_node_size);
auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf);
diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp
index 8f08f73a1..209d3845e 100644
--- a/src/lib/index/wb_cache.hpp
+++ b/src/lib/index/wb_cache.hpp
@@ -40,6 +40,7 @@ class IndexWBCache : public IndexWBCacheBase {
std::vector< iomgr::io_fiber_t > m_cp_flush_fibers;
std::mutex m_flush_mtx;
void* m_meta_blk;
+ bool m_in_recovery{false};
public:
IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair< meta_blk*, sisl::byte_view > sb,
@@ -59,7 +60,7 @@ class IndexWBCache : public IndexWBCacheBase {
//////////////////// CP Related API section /////////////////////////////////
folly::Future< bool > async_cp_flush(IndexCPContext* context);
IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* cp_ctx) const;
- void recover(sisl::byte_view sb);
+ void recover(sisl::byte_view sb) override;
private:
void start_flush_threads();
@@ -75,7 +76,7 @@ class IndexWBCache : public IndexWBCacheBase {
void get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtr const& prev_flushed_buf,
IndexBufferPtrList& bufs);
- void process_up_buf(IndexBufferPtr const& buf, bool do_repair);
+ void recover_buf(IndexBufferPtr const& buf);
bool was_node_committed(IndexBufferPtr const& buf);
};
} // namespace homestore
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index a20904b8f..eab66ece1 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -71,16 +71,15 @@ if (${io_tests})
set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp)
add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES})
target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest)
- #TODO : Fix the test case and enable it
add_test(NAME IndexBtree COMMAND test_index_btree --gtest_filter=*/0.*)
set_property(TEST IndexBtree PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true")
set_tests_properties(IndexBtree PROPERTIES TIMEOUT 1200)
- #set(TEST_RECOVERY_INDEX_SOURCE_FILES test_index_recovery.cpp)
- #add_executable(test_index_recovery ${TEST_RECOVERY_INDEX_SOURCE_FILES})
- #target_link_libraries(test_index_recovery homestore ${COMMON_TEST_DEPS} GTest::gtest)
- #add_test(NAME IndexRecovery COMMAND test_index_recovery)
- # set_property(TEST IndexRecovery PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true")
+ set(TEST_RECOVERY_INDEX_SOURCE_FILES test_index_crash_recovery.cpp)
+ add_executable(test_index_crash_recovery ${TEST_RECOVERY_INDEX_SOURCE_FILES})
+ target_link_libraries(test_index_crash_recovery homestore ${COMMON_TEST_DEPS} GTest::gtest)
+ add_test(NAME IndexCrashRecovery COMMAND test_index_crash_recovery)
+ set_property(TEST IndexCrashRecovery PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true")
add_executable(test_data_service)
target_sources(test_data_service PRIVATE test_data_service.cpp)
diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp
index 950884903..298235068 100644
--- a/src/tests/btree_helpers/btree_test_helper.hpp
+++ b/src/tests/btree_helpers/btree_test_helper.hpp
@@ -274,7 +274,7 @@ struct BtreeTestHelper {
qreq.enable_route_tracing();
auto const ret = m_bt->query(qreq, out_vector);
auto const expected_count = std::min(remaining, batch_size);
-
+ // this->print_keys();
ASSERT_EQ(out_vector.size(), expected_count) << "Received incorrect value on query pagination";
if (remaining < batch_size) {
@@ -374,8 +374,29 @@ struct BtreeTestHelper {
run_in_parallel(op_list);
}
- void print(const std::string& file = "") const { m_bt->print_tree(file); }
- void print_keys() const { m_bt->print_tree_keys(); }
+ void dump_to_file(const std::string& file = "") const { m_bt->dump_tree_to_file(file); }
+ void print_keys(const std::string& preamble = "") const {
+ auto print_key_range = [](std::vector< std::pair< K, V > > const& kvs) -> std::string {
+ uint32_t start = 0;
+ std::string str;
+ for (uint32_t i{1}; i <= kvs.size(); ++i) {
+ if ((i == kvs.size()) || (kvs[i].first.key() != kvs[i - 1].first.key() + 1)) {
+ if ((i - start) > 1) {
+ fmt::format_to(std::back_inserter(str), "{}-{}{}", kvs[start].first.key(),
+ kvs[i - 1].first.key(), (i == kvs.size()) ? "" : ", ");
+ } else {
+ fmt::format_to(std::back_inserter(str), "{}{}", kvs[start].first.key(),
+ (i == kvs.size()) ? "" : ", ");
+ }
+ start = i;
+ }
+ }
+ return str;
+ };
+
+ LOGINFO("{}{}", preamble.empty() ? "" : preamble + ":\n", m_bt->to_custom_string(print_key_range));
+ }
+ void visualize_keys(const std::string& file) const { m_bt->visualize_tree_keys(file); }
void compare_files(const std::string& before, const std::string& after) {
std::ifstream b(before, std::ifstream::ate);
diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp
index a91dd39b8..9818c8a45 100644
--- a/src/tests/btree_helpers/shadow_map.hpp
+++ b/src/tests/btree_helpers/shadow_map.hpp
@@ -3,6 +3,7 @@
#include "btree_test_kvs.hpp"
+
template < typename K, typename V >
class ShadowMap {
private:
@@ -11,7 +12,11 @@ class ShadowMap {
uint32_t m_max_keys;
using mutex = iomgr::FiberManagerLib::shared_mutex;
mutex m_mutex;
-
+//#define SHOWM(X) cout << #X " = " << (X) << endl
+// void testPrint(std::map< uint32_t, std::string >& m_map, int i) {
+// SHOWM(m[i]);
+// SHOWM(m.find(i)->first);
+// }
public:
ShadowMap(uint32_t num_keys) : m_range_scheduler(num_keys), m_max_keys{num_keys} {}
@@ -142,7 +147,7 @@ class ShadowMap {
auto it2 = other.m_map.begin();
std::vector< std::pair< K, bool > > ret_diff;
- while ((it1 != m_map.end()) && (it2 != m_map.end())) {
+ while ((it1 != m_map.end()) && (it2 != other.m_map.end())) {
auto const x = it1->first.compare(it2->first);
if (x == 0) {
++it1;
@@ -163,7 +168,7 @@ class ShadowMap {
}
while (it2 != other.m_map.end()) {
- ret_diff.emplace_back(it1->first, false /* addition */);
+ ret_diff.emplace_back(it2->first, false /* addition */);
++it2;
}
return ret_diff;
diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp
index b82ae2466..7a25c9b12 100644
--- a/src/tests/test_common/homestore_test_common.hpp
+++ b/src/tests/test_common/homestore_test_common.hpp
@@ -198,7 +198,10 @@ class HSTestHelper {
test_params& params(uint32_t svc) { return m_token.svc_params_[svc]; }
#ifdef _PRERELEASE
- void wait_for_crash_recovery() { m_crash_recovered.getFuture().get(); }
+ void wait_for_crash_recovery() {
+ m_crash_recovered.getFuture().get();
+ m_crash_recovered = folly::Promise< folly::Unit >();
+ }
#endif
void set_min_chunk_size(uint64_t chunk_size) {
diff --git a/src/tests/test_device_manager.cpp b/src/tests/test_device_manager.cpp
index 6ddb8aec5..4077bc917 100644
--- a/src/tests/test_device_manager.cpp
+++ b/src/tests/test_device_manager.cpp
@@ -78,7 +78,12 @@ class DeviceMgrTest : public ::testing::Test {
return std::make_shared< homestore::VirtualDev >(*m_dmgr, vinfo_tmp, nullptr /* event_cb */, false);
});
- m_dmgr->is_first_time_boot() ? m_dmgr->format_devices() : m_dmgr->load_devices();
+ if (m_dmgr->is_first_time_boot()) {
+ m_dmgr->format_devices();
+ m_dmgr->commit_formatting();
+ } else {
+ m_dmgr->load_devices();
+ }
m_pdevs = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Data);
m_vdevs = m_dmgr->get_vdevs();
}
diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp
index 68e6bbee0..6b5ff27ab 100644
--- a/src/tests/test_index_btree.cpp
+++ b/src/tests/test_index_btree.cpp
@@ -288,7 +288,7 @@ TYPED_TEST(BtreeTest, CpFlush) {
LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries);
this->do_query(0, num_entries - 1, 75);
- this->print(std::string("before.txt"));
+ this->dump_to_file(std::string("before.txt"));
this->destroy_btree();
@@ -298,7 +298,7 @@ TYPED_TEST(BtreeTest, CpFlush) {
std::this_thread::sleep_for(std::chrono::seconds{1});
LOGINFO("Restarted homestore with index recovered");
- this->print(std::string("after.txt"));
+ this->dump_to_file(std::string("after.txt"));
LOGINFO("Query {} entries", num_entries);
this->do_query(0, num_entries - 1, 1000);
@@ -336,7 +336,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) {
LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries);
this->do_query(0, num_entries - 1, 75);
- this->print(std::string("before.txt"));
+ this->dump_to_file(std::string("before.txt"));
this->destroy_btree();
@@ -345,7 +345,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) {
std::this_thread::sleep_for(std::chrono::seconds{1});
LOGINFO(" Restarted homestore with index recovered");
- this->print(std::string("after.txt"));
+ this->dump_to_file(std::string("after.txt"));
this->compare_files("before.txt", "after.txt");
@@ -400,7 +400,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) {
LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries);
this->do_query(0, num_entries - 1, 75);
- this->print(std::string("before.txt"));
+ this->dump_to_file(std::string("before.txt"));
this->destroy_btree();
// Restart homestore. m_bt is updated by the TestIndexServiceCallback.
@@ -408,7 +408,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) {
std::this_thread::sleep_for(std::chrono::seconds{1});
LOGINFO(" Restarted homestore with index recovered");
- this->print(std::string("after.txt"));
+ this->dump_to_file(std::string("after.txt"));
this->compare_files("before.txt", "after.txt");
diff --git a/src/tests/test_index_crash_recovery.cpp b/src/tests/test_index_crash_recovery.cpp
new file mode 100644
index 000000000..2cc932315
--- /dev/null
+++ b/src/tests/test_index_crash_recovery.cpp
@@ -0,0 +1,289 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+#include <gtest/gtest.h> // NOTE(review): angle-bracket include targets were lost in extraction; reconstructed from usage below — verify
+#include <boost/uuid/random_generator.hpp>
+
+#include <sisl/utility/enum.hpp>
+#include "common/homestore_config.hpp"
+#include "common/resource_mgr.hpp"
+#include "test_common/homestore_test_common.hpp"
+#include "test_common/range_scheduler.hpp"
+#include "btree_helpers/btree_test_helper.hpp"
+#include "btree_helpers/btree_test_kvs.hpp"
+#include "btree_helpers/btree_decls.h"
+
+using namespace homestore;
+
+SISL_LOGGING_INIT(HOMESTORE_LOG_MODS)
+SISL_OPTIONS_ENABLE(logging, test_index_crash_recovery, iomgr, test_common_setup)
+SISL_LOGGING_DECL(test_index_crash_recovery)
+
+// TODO Add tests to do write,remove after recovery.
+// TODO Test with var len key with io mgr page size is 512.
+
+SISL_OPTION_GROUP( // registers the CLI options this binary reads via SISL_OPTIONS[...] below
+    test_index_crash_recovery,
+    (num_iters, "", "num_iters", "number of iterations for rand ops",
+     ::cxxopts::value< uint32_t >()->default_value("500"), "number"),
+    (num_entries, "", "num_entries", "number of entries to test with",
+     ::cxxopts::value< uint32_t >()->default_value("5000"), "number"),
+    (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"),
+    (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""),
+    (operation_list, "", "operation_list", "operation list instead of default created following by percentage",
+     ::cxxopts::value< std::vector< std::string > >(), "operations [...]"),
+    (preload_size, "", "preload_size", "number of entries to preload tree with",
+     ::cxxopts::value< uint32_t >()->default_value("1000"), "number"),
+    (init_device, "", "init_device", "init device", ::cxxopts::value< bool >()->default_value("1"), ""),
+    (cleanup_after_shutdown, "", "cleanup_after_shutdown", "cleanup after shutdown",
+     ::cxxopts::value< bool >()->default_value("1"), ""), // keep "0" to inspect shadow map/index data post-run
+    (seed, "", "seed", "random engine seed, use random if not defined",
+     ::cxxopts::value< uint64_t >()->default_value("0"), "number"))
+
+void log_obj_life_counter() { // logs created/alive counts of every sisl-tracked object type (leak spotting at exit)
+    std::string str;
+    sisl::ObjCounterRegistry::foreach ([&str](const std::string& name, int64_t created, int64_t alive) {
+        fmt::format_to(std::back_inserter(str), "{}: created={} alive={}\n", name, created, alive);
+    });
+    LOGINFO("Object Life Counter\n:{}", str); // NOTE(review): "\n:" looks transposed — possibly meant ":\n"; confirm intent
+}
+
+#ifdef _PRERELEASE
+template < typename TestType >
+struct IndexCrashTest : public test_common::HSTestHelper, BtreeTestHelper< TestType >, public ::testing::Test {
+    using T = TestType;
+    using K = typename TestType::KeyType;
+    using V = typename TestType::ValueType;
+    class TestIndexServiceCallbacks : public IndexServiceCallbacks { // rebuilds the btree from its superblk on recovery
+    public:
+        TestIndexServiceCallbacks(IndexCrashTest* test) : m_test(test) {}
+
+        std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override {
+            LOGINFO("Index table recovered, root bnode_id {} version {}", sb->root_node, sb->root_link_version);
+            m_test->m_cfg = BtreeConfig(hs()->index_service().node_size());
+            m_test->m_cfg.m_leaf_node_type = T::leaf_node_type;
+            m_test->m_cfg.m_int_node_type = T::interior_node_type;
+            m_test->m_bt = std::make_shared< typename T::BtreeType >(std::move(sb), m_test->m_cfg);
+            return m_test->m_bt;
+        }
+
+    private:
+        IndexCrashTest* m_test; // not owned; outlives the callback (test fixture owns homestore lifetime)
+    };
+
+    IndexCrashTest() : testing::Test() { this->m_is_multi_threaded = true; }
+
+    void SetUp() override {
+        // Set the cp_timer_us to very high value to avoid any automatic checkpointing.
+        HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) {
+            s.generic.cache_max_throttle_cnt = 10000;
+            s.generic.cp_timer_us = 0x8000000000000000; // effectively "never" — tests trigger cp explicitly
+            s.resource_limits.dirty_buf_percent = 100;
+            HS_SETTINGS_FACTORY().save();
+        });
+
+        this->start_homestore(
+            "test_index_crash_recovery",
+            {{HS_SERVICE::META, {.size_pct = 10.0}},
+             {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}},
+            nullptr, {}, SISL_OPTIONS["init_device"].as< bool >());
+
+        LOGINFO("Node size {} ", hs()->index_service().node_size());
+        this->m_cfg = BtreeConfig(hs()->index_service().node_size());
+
+        auto uuid = boost::uuids::random_generator()();
+        auto parent_uuid = boost::uuids::random_generator()();
+
+        homestore::hs()->resource_mgr().reset_dirty_buf_qd();
+
+        // Create index table and attach to index service.
+        BtreeTestHelper< TestType >::SetUp();
+        if (this->m_bt == nullptr || SISL_OPTIONS["init_device"].as< bool >()) {
+            // Fresh boot (or forced re-init): build a brand new index table.
+            this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg);
+        } else {
+            populate_shadow_map(); // recovered boot: reload expected keys saved by the previous run
+        }
+
+        hs()->index_service().add_index_table(this->m_bt);
+        LOGINFO("Added index table to index service");
+    }
+
+    // Reload the persisted shadow map and verify it agrees with the recovered tree.
+    void populate_shadow_map() {
+        this->m_shadow_map.load(m_shadow_filename);
+        ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id()))
+            << "shadow map size and tree size mismatch";
+        this->get_all();
+    }
+
+    void restart_homestore(uint32_t shutdown_delay_sec = 3) override {
+        this->params(HS_SERVICE::INDEX).index_svc_cbs = new TestIndexServiceCallbacks(this);
+        LOGINFO("\n\n\n\n\n\n shutdown homestore for index service Test\n\n\n\n\n");
+        // this->m_shadow_map.save(this->m_shadow_filename);
+        test_common::HSTestHelper::restart_homestore(shutdown_delay_sec);
+    }
+
+    // Diff the live shadow map against the snapshot persisted before the crash and
+    // re-apply every key the crash rolled back, then checkpoint and re-save the snapshot.
+    void reapply_after_crash() {
+        ShadowMap< K, V > snapshot_map{this->m_shadow_map.max_keys()};
+        snapshot_map.load(m_shadow_filename);
+        LOGDEBUG("\tSnapshot before crash\n{}", snapshot_map.to_string());
+        auto diff = this->m_shadow_map.diff(snapshot_map);
+
+        // visualize tree after crash
+        // std::string recovered_tree_filename = "tree_after_crash_" + to_string(rand() % 100) + ".dot";
+        // this->visualize_keys(recovered_tree_filename);
+        // LOGINFO(" tree after recovered stored in {}", recovered_tree_filename);
+
+        std::string dif_str = "KEY \tADDITION\n";
+        for (const auto& [k, addition] : diff) {
+            dif_str += fmt::format(" {} \t{}\n", k.key(), addition);
+        }
+        LOGDEBUG("Diff between shadow map and snapshot map\n{}\n", dif_str);
+
+        for (const auto& [k, addition] : diff) {
+            // this->print_keys(fmt::format("reapply: before inserting key {}", k.key()));
+            // this->visualize_keys(recovered_tree_filename);
+            if (addition) { this->force_upsert(k.key()); } // upsert: key may have partially survived the crash
+        }
+        test_common::HSTestHelper::trigger_cp(true);
+        this->m_shadow_map.save(m_shadow_filename);
+    }
+
+    void TearDown() override {
+        bool cleanup = SISL_OPTIONS["cleanup_after_shutdown"].as< bool >();
+        LOGINFO("cleanup the dump map and index data? {}", cleanup);
+        if (!cleanup) {
+            this->m_shadow_map.save(m_shadow_filename); // keep state for a follow-up run with init_device=0
+        } else {
+            if (std::filesystem::remove(m_shadow_filename)) {
+                LOGINFO("File {} removed successfully", m_shadow_filename);
+            } else {
+                LOGINFO("Error: failed to remove {}", m_shadow_filename);
+            }
+        }
+        LOGINFO("Teardown with Root bnode_id {} tree size: {}", this->m_bt->root_node_id(),
+                this->m_bt->count_keys(this->m_bt->root_node_id()));
+        BtreeTestHelper< TestType >::TearDown();
+        this->shutdown_homestore(false);
+    }
+    // Trigger a cp (expected to hit the armed crash flip), wait for recovery, re-apply lost
+    // keys and assert the tree again matches the shadow map for the range [s_key, e_key).
+    void crash_and_recover(uint32_t s_key, uint32_t e_key) {
+        this->print_keys("Btree prior to CP and subsequent simulated crash: ");
+        test_common::HSTestHelper::trigger_cp(false);
+        this->wait_for_crash_recovery();
+        // this->visualize_keys("tree_after_crash_" + std::to_string(s_key) + "_" + std::to_string(e_key) + ".dot");
+
+        this->print_keys("Post crash and recovery, btree structure: ");
+        this->reapply_after_crash();
+
+        this->get_all();
+        LOGINFO("expect to have [{},{}) in tree and it is actually {}", s_key, e_key, tree_key_count());
+        ASSERT_EQ(this->m_shadow_map.size(), this->m_bt->count_keys(this->m_bt->root_node_id()))
+            << "shadow map size and tree size mismatch";
+    }
+    uint32_t tree_key_count() { return this->m_bt->count_keys(this->m_bt->root_node_id()); }
+
+protected:
+    const std::string m_shadow_filename = "/tmp/shadow_map_index_recovery.txt";
+};
+
+// Crash recovery can test one simple btree, since focus is not on btree test itself, but index recovery
+using BtreeTypes = testing::Types< FixedLenBtree >;
+TYPED_TEST_SUITE(IndexCrashTest, BtreeTypes);
+
+TYPED_TEST(IndexCrashTest, CrashBeforeFirstCp) {
+    // Simulate the crash even before first cp
+    this->set_basic_flip("crash_flush_on_root"); // arm flip: crash while flushing the root node
+
+    auto ops = this->build_op_list({"put:100"}); // 100% puts for num_iters operations
+    this->multi_op_execute(ops, true /* skip_preload */);
+
+    // Trigger a cp, which should induce the crash and wait for hs to recover
+    test_common::HSTestHelper::trigger_cp(false); // async: do not wait, crash is expected mid-flush
+    this->wait_for_crash_recovery();
+
+    // Post crash, load the shadow_map into a new instance and compute the diff. Redo the operation
+    this->reapply_after_crash();
+}
+
+TYPED_TEST(IndexCrashTest, SplitOnLeftEdge) {
+    // Insert into 4 phases, first fill up the last part, since we need to test split on left edge
+    LOGINFO("Step 1: Fill up the last quarter of the tree");
+    auto const num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >();
+    for (auto k = num_entries * 3 / 4; k < num_entries; ++k) {
+        this->put(k, btree_put_type::INSERT, true /* expect_success */);
+    }
+
+    // Trigger the cp to make sure middle part is successful
+    LOGINFO("Step 2: Flush all the entries so far");
+    test_common::HSTestHelper::trigger_cp(true); // synchronous: wait for flush to complete
+    this->get_all();
+    this->m_shadow_map.save(this->m_shadow_filename); // snapshot for post-crash diff
+
+    // Now fill the entries from first and the leftmost child will always split, with crash flip set during flush phase
+    LOGINFO("Step 3: Fill the 3rd quarter of the tree, to make sure left child is split and we crash on flush of the "
+            "new child");
+    this->set_basic_flip("crash_flush_on_split_at_right_child");
+    for (auto k = num_entries / 2; k < num_entries * 3 / 4; ++k) {
+        this->put(k, btree_put_type::INSERT, true /* expect_success */);
+    }
+    LOGINFO("Step 4: Crash and reapply the missing entries to tree");
+    this->crash_and_recover(num_entries / 2, num_entries);
+
+    // TODO: Uncomment this once we do a fix for the inconsistent query results
+    LOGINFO("Step 5: Fill the 2nd quarter of the tree, to make sure left child is split and we crash on flush of the "
+            "left child");
+    this->set_basic_flip("crash_flush_on_split_at_left_child");
+    this->visualize_keys("tree_before_insert.dot");
+    for (auto k = num_entries / 4; k < num_entries / 2; ++k) {
+        // LOGINFO("inserting key {}", k);
+        // this->visualize_keys("tree_before_" + to_string(k) + ".dot");
+        this->put(k, btree_put_type::INSERT, true /* expect_success */);
+    }
+    this->visualize_keys("tree_before_crash.dot");
+    this->dump_to_file("tree_before_crash.txt");
+    LOGINFO("Step 6: Simulate crash and then recover, reapply keys to tree");
+    this->crash_and_recover(num_entries / 4, num_entries);
+
+    LOGINFO("Step 7: Fill the 1st quarter of the tree, to make sure left child is split and we crash on flush of the "
+            "parent node");
+    this->set_basic_flip("crash_flush_on_split_at_parent");
+    // Bound must be exclusive: key num_entries/4 was already inserted by the step-5 loop
+    for (auto k = 0u; k < num_entries / 4; ++k) {
+        this->put(k, btree_put_type::INSERT, true /* expect_success */);
+    }
+    LOGINFO("Step 8: Post crash we reapply the missing entries to tree");
+    this->crash_and_recover(0, num_entries);
+    LOGINFO("Step 9: Query all entries and validate with pagination of 80 entries");
+    this->query_all_paginate(80);
+}
+#endif
+
+int main(int argc, char* argv[]) {
+    int parsed_argc{argc};
+    ::testing::InitGoogleTest(&parsed_argc, argv); // gtest strips its own flags from argv first
+    SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_index_crash_recovery, iomgr, test_common_setup);
+    sisl::logging::SetLogger("test_index_crash_recovery");
+    spdlog::set_pattern("[%D %T%z] [%^%L%$] [%t] %v");
+
+    if (SISL_OPTIONS.count("seed")) { // only when --seed was passed explicitly (defaults don't count)
+        auto seed = SISL_OPTIONS["seed"].as< uint64_t >();
+        LOGINFO("Using seed {} to sow the random generation", seed);
+        g_re.seed(seed);
+    }
+
+#ifdef _PRERELEASE
+    return RUN_ALL_TESTS();
+#else
+    return 0; // crash flips require _PRERELEASE; nothing to run otherwise
+#endif
+}
diff --git a/src/tests/test_pdev.cpp b/src/tests/test_pdev.cpp
index d5670abaf..e28c8a933 100644
--- a/src/tests/test_pdev.cpp
+++ b/src/tests/test_pdev.cpp
@@ -83,7 +83,12 @@ class PDevTest : public ::testing::Test {
m_dev_infos, [this](const homestore::vdev_info&, bool load_existing) -> shared< homestore::VirtualDev > {
return nullptr;
});
- m_dmgr->is_first_time_boot() ? m_dmgr->format_devices() : m_dmgr->load_devices();
+ if (m_dmgr->is_first_time_boot()) {
+ m_dmgr->format_devices();
+ m_dmgr->commit_formatting();
+ } else {
+ m_dmgr->load_devices();
+ }
m_first_data_pdev = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Data)[0];
m_first_fast_pdev = m_dmgr->get_pdevs_by_dev_type(homestore::HSDevType::Fast)[0];
}