Skip to content

Commit

Permalink
Add basic merge testcase & crash recovery
Browse files Browse the repository at this point in the history
Signed-off-by: Jilong Kou <[email protected]>
  • Loading branch information
koujl committed Oct 9, 2024
1 parent 5e6bf9d commit 0c24817
Show file tree
Hide file tree
Showing 19 changed files with 538 additions and 193 deletions.
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "6.4.63"
version = "6.4.64"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down
Binary file modified docs/imgs/Child_Node_Merge_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion src/include/homestore/btree/btree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class Btree {
virtual btree_status_t transact_nodes(const BtreeNodeList& new_nodes, const BtreeNodeList& freed_nodes,
const BtreeNodePtr& left_child_node, const BtreeNodePtr& parent_node,
void* context) = 0;
virtual btree_status_t on_root_changed(BtreeNodePtr const& root, void* context) = 0;
virtual btree_status_t on_root_changed(BtreeNodePtr const &root, BtreeNodePtr const &freed_root, void *context) = 0;
virtual std::string btree_store_type() const = 0;

/////////////////////////// Methods the application use case is expected to handle ///////////////////////////
Expand Down
4 changes: 3 additions & 1 deletion src/include/homestore/btree/detail/btree_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,9 @@ struct BtreeConfig {
uint8_t m_split_pct{50};
uint32_t m_max_merge_nodes{3};
#ifdef _PRERELEASE
uint64_t m_max_keys_in_node{0};
// These are for testing purpose only
uint64_t m_max_keys_in_node{0};
uint64_t m_min_keys_in_node{0};
#endif
bool m_rebalance_turned_on{false};
bool m_merge_turned_on{true};
Expand Down
4 changes: 2 additions & 2 deletions src/include/homestore/btree/detail/btree_mutate_impl.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) {
root = std::move(new_root);

// We need to notify about the root change, before splitting the node, so that correct dependencies are set
ret = on_root_changed(root, req.m_op_context);
ret = on_root_changed(root, nullptr, req.m_op_context);
if (ret != btree_status_t::success) {
free_node(root, locktype_t::WRITE, req.m_op_context);
unlock_node(child_node, locktype_t::WRITE);
Expand All @@ -236,9 +236,9 @@ btree_status_t Btree< K, V >::check_split_root(ReqT& req) {

ret = split_node(root, child_node, root->total_entries(), &split_key, req.m_op_context);
if (ret != btree_status_t::success) {
on_root_changed(child_node, root, req.m_op_context); // Revert it back
free_node(root, locktype_t::WRITE, req.m_op_context);
root = std::move(child_node);
on_root_changed(root, req.m_op_context); // Revert it back
unlock_node(root, locktype_t::WRITE);
} else {
if (req.route_tracing) { append_route_trace(req, child_node, btree_event_t::SPLIT); }
Expand Down
13 changes: 13 additions & 0 deletions src/include/homestore/btree/detail/btree_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ struct transient_hdr_t {
/* these variables are accessed without taking lock and are not expected to change after init */
uint8_t leaf_node{0};
uint64_t max_keys_in_node{0};
uint64_t min_keys_in_node{0}; // to specify the threshold for triggering merge

bool is_leaf() const { return (leaf_node != 0); }
};
Expand Down Expand Up @@ -116,6 +117,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
m_trans_hdr.leaf_node = is_leaf;
#ifdef _PRERELEASE
m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node;
m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node;
#endif

}
Expand Down Expand Up @@ -299,6 +301,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {

template < typename K >
K get_first_key() const {
if (total_entries() == 0) { return K{}; }
return get_nth_key< K >(0, true);
}

Expand Down Expand Up @@ -333,6 +336,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {

// uint32_t total_entries() const { return (has_valid_edge() ? total_entries() + 1 : total_entries()); }
uint64_t max_keys_in_node() const { return m_trans_hdr.max_keys_in_node; }
uint64_t min_keys_in_node() const { return m_trans_hdr.min_keys_in_node; }

void lock(locktype_t l) const {
if (l == locktype_t::READ) {
Expand Down Expand Up @@ -392,6 +396,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
}
fmt::format_to(std::back_inserter(str), "]");
}

// Should not happen
if (this->is_node_deleted()) {
fmt::format_to(std::back_inserter(str), " **DELETED** ");
}

return str;
}

Expand Down Expand Up @@ -527,6 +537,9 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {

virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); }
bool is_merge_needed(const BtreeConfig& cfg) const {
if (min_keys_in_node()) {
return total_entries() < min_keys_in_node();
}
#if 0
#ifdef _PRERELEASE
if (iomgr_flip::instance()->test_flip("btree_merge_node") && occupied_size() < node_data_size) {
Expand Down
2 changes: 1 addition & 1 deletion src/include/homestore/btree/detail/btree_node_mgr.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ btree_status_t Btree< K, V >::create_root_node(void* op_context) {
}

m_root_node_info = BtreeLinkInfo{root->node_id(), root->link_version()};
ret = on_root_changed(root, op_context);
ret = on_root_changed(root, nullptr, op_context);
if (ret != btree_status_t::success) {
free_node(root, locktype_t::NONE, op_context);
m_root_node_info = BtreeLinkInfo{};
Expand Down
3 changes: 1 addition & 2 deletions src/include/homestore/btree/detail/btree_remove_impl.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ btree_status_t Btree< K, V >::check_collapse_root(ReqT& req) {
goto done;
}

ret = on_root_changed(child, req.m_op_context);
ret = on_root_changed(child, root, req.m_op_context);
if (ret != btree_status_t::success) {
unlock_node(child, locktype_t::WRITE);
unlock_node(root, locktype_t::WRITE);
Expand Down Expand Up @@ -476,7 +476,6 @@ btree_status_t Btree< K, V >::merge_nodes(const BtreeNodePtr& parent_node, const
++idx;
}
#endif

ret = transact_nodes(new_nodes, old_nodes, leftmost_node, parent_node, context);
}

Expand Down
3 changes: 2 additions & 1 deletion src/include/homestore/btree/detail/simple_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,9 @@ class SimpleNode : public VariantNode< K, V > {
}
return str;
}

std::string to_dot_keys() const override {
return to_dot_keys_impl(std::is_same<K, uint64_t>{});
return to_dot_keys_impl(std::is_same<decltype(std::declval<K &>().key()), uint64_t>{});
}

std::string to_dot_keys_impl(std::false_type) const {
Expand Down
4 changes: 3 additions & 1 deletion src/include/homestore/btree/mem_btree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ class MemBtree : public Btree< K, V > {
return btree_status_t::success;
}

btree_status_t on_root_changed(BtreeNodePtr const&, void*) override { return btree_status_t::success; }
btree_status_t on_root_changed(BtreeNodePtr const &, BtreeNodePtr const &, void *) override {
return btree_status_t::success;
}
};
} // namespace homestore
114 changes: 91 additions & 23 deletions src/include/homestore/index/index_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

void destroy() override {
Btree< K, V >::destroy_btree(nullptr);
auto cpg = cp_mgr().cp_guard();
Btree<K, V>::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC));
m_sb.destroy();
}

Expand Down Expand Up @@ -130,13 +131,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
idx_buf->m_dirtied_cp_id = cpg->id();
BtreeNodePtr bn = BtreeNodePtr{n};

LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
// Only for interior nodes we need to repair its links
if (!bn->is_leaf()) {
LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC));
}

if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) {
// Our up buffer is a meta buffer, which means that we are the new root node, we need to update the
// meta_buf with new root as well
on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
on_root_changed(bn, nullptr, (void *) cpg.context(cp_consumer_t::INDEX_SVC));
}
}

Expand Down Expand Up @@ -223,7 +227,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context));
}

btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override {
btree_status_t
on_root_changed(BtreeNodePtr const &new_root, BtreeNodePtr const &freed_root, void *context) override {
m_sb->root_node = new_root->node_id();
m_sb->root_link_version = new_root->link_version();

Expand All @@ -232,12 +237,18 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf;
wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context));
IndexBufferPtrList freed_bufs;
if (freed_root) {
freed_bufs.push_back(static_cast<IndexBtreeNode *>(freed_root.get())->m_idx_buf);
}
// Meta is similar to a leftmost child here - it should always be the up buffer for (both) root(s)
wb_cache().transact_bufs(ordinal(), nullptr, m_sb_buffer, {root_buf}, freed_bufs,
r_cast<CPContext *>(context));
return btree_status_t::success;
}

btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) {
BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string());
BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string());

// Get the last key in the node
auto const last_parent_key = parent_node->get_last_key< K >();
Expand All @@ -247,7 +258,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
parent_node->node_id());
return btree_status_t::not_found;
}
BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(),

// Get all original child ids as a support to check if we are beyond the last child node
std::set<bnodeid_t> orig_child_ids;
for (uint32_t i = 0; i < parent_node->total_entries(); ++i) {
BtreeLinkInfo link_info;
parent_node->get_nth_value(i, &link_info, true);
orig_child_ids.insert(link_info.bnode_id());
}
BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(),
last_parent_key.to_string());

// Get the first child node and its link info
Expand All @@ -272,24 +291,48 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
auto cur_parent = parent_node;
BtreeNodeList new_parent_nodes;
do {
if (child_node->has_valid_edge() ||
(child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) {
BT_DBG_ASSERT(is_parent_edge_node,
"Child node={} is an edge node but parent_node={} is not an edge node",
child_node->node_id(), cur_parent->node_id());
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
if (child_node->is_node_deleted()) {
// Edge node is merged, we need to set the current last entry as edge
if (cur_parent->total_entries() > 0) {
auto prev_val = V{};
cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true);
cur_parent->remove(cur_parent->total_entries() - 1);
cur_parent->set_edge_value(prev_val);
BT_LOG(INFO, "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}",
cur_parent->node_id(), child_node->to_string(), prev_val.to_string());
} else {
BT_LOG(INFO, "Found an empty interior node {} with maybe all childs deleted",
cur_parent->node_id());
}
} else {
// Update edge and finish
BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(),
child_node->to_string());
child_node->set_next_bnode(empty_bnodeid);
write_node_impl(child_node, cp_ctx);
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
}
break;
}

auto const child_last_key = child_node->get_last_key< K >();
BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(),
BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(),
child_node->to_string(), child_last_key.to_string());

if (child_last_key.compare(last_parent_key) > 0) {
// We have reached the last key, we can stop now
break;
// Check if we are beyond the last child node.
//
// There can be cases where the child level merge is successfully persisted but the parent level is not.
// In this case, you may have your rightmost child node with last key greater than the last_parent_key.
// That's why here we have to check if the child node is one of the original child nodes first.
if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) {
if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) {
// We have reached a child beyond this parent, we can stop now
break;
}
}


if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(),
BtreeLinkInfo::get_fixed_size())) {
// No room in the parent_node, let us split the parent_node and continue
Expand All @@ -309,31 +352,56 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

// Insert the last key of the child node into parent node
cur_parent->insert(cur_parent->total_entries(), child_last_key,
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
if (!child_node->is_node_deleted()) {
cur_parent->insert(cur_parent->total_entries(),
child_node->total_entries() > 0 ? child_last_key : last_parent_key,
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
if (child_node->total_entries() == 0) {
// There should be at most one empty child node per parent - if we find one, we should stop here
BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(),
child_node->to_string());
break;
}
} else {
// Node deleted indicates it's freed & no longer used during recovery
BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert",
cur_parent->node_id(), child_node->to_string());
}

BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string());
BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string());

// Move to the next child node
this->unlock_node(child_node, locktype_t::READ);
auto const next_node_id = child_node->next_bnode();
this->unlock_node(child_node, locktype_t::READ);
if (next_node_id == empty_bnodeid) {
BT_LOG_ASSERT(false,
"Child node={} next_node_id is empty, while its not a edge node, parent_node={} "
"repair is partial",
child_node->node_id(), parent_node->node_id());
ret = btree_status_t::not_found;
child_node = nullptr;
break;
}

ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx);
if (ret != btree_status_t::success) {
BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}",
parent_node->node_id(), enum_name(ret));
child_node = nullptr;
break;
}
} while (true);
this->unlock_node(child_node, locktype_t::READ);

if (child_node) {
this->unlock_node(child_node, locktype_t::READ);
}

if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) {
// We shouldn't have an empty interior node in the tree, let's delete it.
// The buf will be released by the caller
BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id());
parent_node->set_node_deleted();
}

if (ret == btree_status_t::success) {
ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx);
Expand Down
3 changes: 1 addition & 2 deletions src/lib/common/crash_simulator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ class CrashSimulator {
~CrashSimulator() = default;

void crash() {
m_crashed.update([](auto *s) { *s = true; });
if (m_restart_cb) {
m_crashed.update([](auto* s) { *s = true; });

// We can restart on a new thread to allow other operations to continue
std::thread t([cb = std::move(m_restart_cb)]() {
// Restart could destroy this pointer, so we are storing in local variable and then calling.
Expand Down
9 changes: 9 additions & 0 deletions src/lib/device/virtual_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con

Chunk* chunk;
uint64_t const dev_offset = to_dev_offset(bid, &chunk);
HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(),
dev_offset);
if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) {
return std::make_error_code(std::errc::resource_unavailable_try_again);
}
Expand All @@ -436,6 +438,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared<
if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
#endif

HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(),
chunk->start_offset() + offset_in_chunk);

if (sisl_unlikely(!is_chunk_available(chunk))) {
return std::make_error_code(std::errc::resource_unavailable_try_again);
}
Expand All @@ -457,6 +462,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons
auto const size = get_len(iov, iovcnt);
auto* pdev = chunk->physical_dev_mutable();

HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset);

COUNTER_INCREMENT(m_metrics, vdev_write_count, 1);
if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) {
COUNTER_INCREMENT(m_metrics, unalign_writes, 1);
Expand All @@ -479,6 +486,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C
auto const size = get_len(iov, iovcnt);
auto* pdev = chunk->physical_dev_mutable();

HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset);

COUNTER_INCREMENT(m_metrics, vdev_write_count, 1);
if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) {
COUNTER_INCREMENT(m_metrics, unalign_writes, 1);
Expand Down
Loading

0 comments on commit 0c24817

Please sign in to comment.