Skip to content

Commit

Permalink
Add index CR UT for basic merge (#556)
Browse files Browse the repository at this point in the history
Signed-off-by: Jilong Kou <[email protected]>
  • Loading branch information
koujl authored Oct 29, 2024
1 parent a7a9fe5 commit 60eea4a
Show file tree
Hide file tree
Showing 11 changed files with 519 additions and 203 deletions.
2 changes: 1 addition & 1 deletion conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "6.5.3"
version = "6.5.4"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down
4 changes: 3 additions & 1 deletion src/include/homestore/btree/detail/btree_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,9 @@ struct BtreeConfig {
uint8_t m_split_pct{50};
uint32_t m_max_merge_nodes{3};
#ifdef _PRERELEASE
uint64_t m_max_keys_in_node{0};
// These are for testing purpose only
uint64_t m_max_keys_in_node{0};
uint64_t m_min_keys_in_node{0};
#endif
bool m_rebalance_turned_on{false};
bool m_merge_turned_on{true};
Expand Down
21 changes: 13 additions & 8 deletions src/include/homestore/btree/detail/btree_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ struct transient_hdr_t {
/* these variables are accessed without taking lock and are not expected to change after init */
uint8_t leaf_node{0};
uint64_t max_keys_in_node{0};
uint64_t min_keys_in_node{0}; // to specify the threshold for triggering merge

bool is_leaf() const { return (leaf_node != 0); }
};
Expand Down Expand Up @@ -116,6 +117,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
m_trans_hdr.leaf_node = is_leaf;
#ifdef _PRERELEASE
m_trans_hdr.max_keys_in_node = cfg.m_max_keys_in_node;
m_trans_hdr.min_keys_in_node = cfg.m_min_keys_in_node;
#endif

}
Expand Down Expand Up @@ -299,6 +301,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {

template < typename K >
K get_first_key() const {
if (total_entries() == 0) { return K{}; }
return get_nth_key< K >(0, true);
}

Expand Down Expand Up @@ -333,6 +336,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {

// uint32_t total_entries() const { return (has_valid_edge() ? total_entries() + 1 : total_entries()); }
uint64_t max_keys_in_node() const { return m_trans_hdr.max_keys_in_node; }
uint64_t min_keys_in_node() const { return m_trans_hdr.min_keys_in_node; }

void lock(locktype_t l) const {
if (l == locktype_t::READ) {
Expand Down Expand Up @@ -392,6 +396,12 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
}
fmt::format_to(std::back_inserter(str), "]");
}

// Should not happen
if (this->is_node_deleted()) {
fmt::format_to(std::back_inserter(str), " **DELETED** ");
}

return str;
}

Expand Down Expand Up @@ -527,15 +537,10 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {

virtual uint32_t occupied_size() const { return (node_data_size() - available_size()); }
bool is_merge_needed(const BtreeConfig& cfg) const {
#if 0
#ifdef _PRERELEASE
if (iomgr_flip::instance()->test_flip("btree_merge_node") && occupied_size() < node_data_size) {
return true;
}

auto ret = iomgr_flip::instance()->get_test_flip< uint64_t >("btree_merge_node_pct");
if (ret && occupied_size() < (ret.get() * node_data_size() / 100)) { return true; }
#endif
if (min_keys_in_node()) {
return total_entries() < min_keys_in_node();
}
#endif
return (occupied_size() < cfg.suggested_min_size());
}
Expand Down
120 changes: 92 additions & 28 deletions src/include/homestore/index/index_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

void destroy() override {
Btree< K, V >::destroy_btree(nullptr);
auto cpg = cp_mgr().cp_guard();
Btree<K, V>::destroy_btree(cpg.context(cp_consumer_t::INDEX_SVC));
m_sb.destroy();
}

Expand Down Expand Up @@ -130,13 +131,16 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
idx_buf->m_dirtied_cp_id = cpg->id();
BtreeNodePtr bn = BtreeNodePtr{n};

LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
repair_links(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
// Only for interior nodes we need to repair its links
if (!bn->is_leaf()) {
LOGTRACEMOD(wbcache, "repair_node cp={} buf={}", cpg->id(), idx_buf->to_string());
repair_links(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC));
}

if (idx_buf->m_up_buffer && idx_buf->m_up_buffer->is_meta_buf()) {
// Our up buffer is a meta buffer, which means that we are the new root node, we need to update the
// meta_buf with new root as well
on_root_changed(bn, (void*)cpg.context(cp_consumer_t::INDEX_SVC));
on_root_changed(bn, (void *) cpg.context(cp_consumer_t::INDEX_SVC));
}
}

Expand Down Expand Up @@ -223,7 +227,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
wb_cache().free_buf(n->m_idx_buf, r_cast< CPContext* >(context));
}

btree_status_t on_root_changed(BtreeNodePtr const& new_root, void* context) override {
btree_status_t
on_root_changed(BtreeNodePtr const &new_root, void *context) override {
// todo: if(m_sb->root_node == new_root->node_id() && m_sb->root_link_version == new_root->link_version()){
// return btree_status_t::success;}
m_sb->root_node = new_root->node_id();
Expand All @@ -235,12 +240,12 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

auto& root_buf = static_cast< IndexBtreeNode* >(new_root.get())->m_idx_buf;
wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast< CPContext* >(context));
wb_cache().transact_bufs(ordinal(), m_sb_buffer, root_buf, {}, {}, r_cast<CPContext *>(context));
return btree_status_t::success;
}

btree_status_t repair_links(BtreeNodePtr const& parent_node, void* cp_ctx) {
BT_LOG(DEBUG, "Repairing links for parent node {}", parent_node->to_string());
BT_LOG(DEBUG, "Repairing links for parent node [{}]", parent_node->to_string());
// TODO: is it possible that repairing many nodes causes an increase to level of btree? If so, then this needs
// to be handled. Get the last key in the node
auto const last_parent_key = parent_node->get_last_key< K >();
Expand All @@ -250,7 +255,15 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
parent_node->node_id());
return btree_status_t::not_found;
}
BT_LOG(INFO, "Repairing node={} with last_parent_key={}", parent_node->to_string(),

// Get all original child ids as a support to check if we are beyond the last child node
std::set<bnodeid_t> orig_child_ids;
for (uint32_t i = 0; i < parent_node->total_entries(); ++i) {
BtreeLinkInfo link_info;
parent_node->get_nth_value(i, &link_info, true);
orig_child_ids.insert(link_info.bnode_id());
}
BT_LOG(INFO, "Repairing node=[{}] with last_parent_key={}", parent_node->to_string(),
last_parent_key.to_string());

// Get the first child node and its link info
Expand All @@ -275,22 +288,45 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
auto cur_parent = parent_node;
BtreeNodeList new_parent_nodes;
do {
if (child_node->has_valid_edge() ||
(child_node->is_leaf() && (child_node->next_bnode() == empty_bnodeid))) {
BT_DBG_ASSERT(is_parent_edge_node,
"Child node={} is an edge node but parent_node={} is not an edge node",
child_node->node_id(), cur_parent->node_id());
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
if (child_node->is_node_deleted()) {
// Edge node is merged, we need to set the current last entry as edge
if (cur_parent->total_entries() > 0) {
auto prev_val = V{};
cur_parent->get_nth_value(cur_parent->total_entries() - 1, &prev_val, true);
cur_parent->remove(cur_parent->total_entries() - 1);
cur_parent->set_edge_value(prev_val);
BT_LOG(INFO, "Reparing node={}, child_node=[{}] is deleted, set previous as edge_value={}",
cur_parent->node_id(), child_node->to_string(), prev_val.to_string());
} else {
BT_LOG(INFO, "Found an empty interior node {} with maybe all childs deleted",
cur_parent->node_id());
}
} else {
// Update edge and finish
BT_LOG(INFO, "Repairing node={}, child_node=[{}] is an edge node, end loop", cur_parent->node_id(),
child_node->to_string());
child_node->set_next_bnode(empty_bnodeid);
write_node_impl(child_node, cp_ctx);
cur_parent->set_edge_value(BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
}
break;
}

auto const child_last_key = child_node->get_last_key< K >();
BT_LOG(INFO, "Repairing node={} child_node={} child_last_key={}", cur_parent->node_id(),
BT_LOG(INFO, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(),
child_node->to_string(), child_last_key.to_string());

if (child_last_key.compare(last_parent_key) > 0 && !is_parent_edge_node) {
// We have reached the last key, and the parent node doesn't have edge, so we can stop now
break;
// Check if we are beyond the last child node.
//
// There can be cases where the child level merge is successfully persisted but the parent level is not.
// In this case, you may have your rightmost child node with last key greater than the last_parent_key.
// That's why here we have to check if the child node is one of the original child nodes first.
if (!is_parent_edge_node && !orig_child_ids.contains(child_node->node_id())) {
if (child_node->total_entries() == 0 || child_last_key.compare(last_parent_key) > 0) {
// We have reached a child beyond this parent, we can stop now
break;
}
}

if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(),
Expand All @@ -312,31 +348,59 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

// Insert the last key of the child node into parent node
cur_parent->insert(cur_parent->total_entries(), child_last_key,
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
if (!child_node->is_node_deleted()) {
cur_parent->insert(cur_parent->total_entries(),
child_node->total_entries() > 0 ? child_last_key : last_parent_key,
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
if (child_node->total_entries() == 0) {
// There should be at most one empty child node per parent - if we find one, we should stop here
BT_LOG(INFO, "Repairing node={}, child_node=[{}] is empty, end loop", cur_parent->node_id(),
child_node->to_string());
break;
}
} else {
// Node deleted indicates it's freed & no longer used during recovery
BT_LOG(INFO, "Repairing node={}, child node=[{}] is deleted, skipping the insert",
cur_parent->node_id(), child_node->to_string());
}

BT_LOG(INFO, "Repairing node={}, repaired so_far={}", cur_parent->node_id(), cur_parent->to_string());
BT_LOG(INFO, "Repairing node={}, repaired so_far=[{}]", cur_parent->node_id(), cur_parent->to_string());

// Move to the next child node
this->unlock_node(child_node, locktype_t::READ);
auto const next_node_id = child_node->next_bnode();
this->unlock_node(child_node, locktype_t::READ);
if (next_node_id == empty_bnodeid) {
BT_LOG_ASSERT(false,
"Child node={} next_node_id is empty, while its not a edge node, parent_node={} "
"repair is partial",
child_node->node_id(), parent_node->node_id());
ret = btree_status_t::not_found;
// This can be a deleted edge node - only check if it is still valid
if (!child_node->is_node_deleted()) {
BT_LOG_ASSERT(false,
"Child node={} next_node_id is empty, while its not a edge node, parent_node={} "
"repair is partial",
child_node->node_id(), parent_node->node_id());
ret = btree_status_t::not_found;
}
child_node = nullptr;
break;
}

ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx);
if (ret != btree_status_t::success) {
BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}",
parent_node->node_id(), enum_name(ret));
child_node = nullptr;
break;
}
} while (true);
this->unlock_node(child_node, locktype_t::READ);

if (child_node) {
this->unlock_node(child_node, locktype_t::READ);
}

if (parent_node->total_entries() == 0 && !parent_node->has_valid_edge()) {
// We shouldn't have an empty interior node in the tree, let's delete it.
// The buf will be released by the caller
BT_LOG(INFO, "Parent node={} is empty, deleting it", parent_node->node_id());
parent_node->set_node_deleted();
}

if (ret == btree_status_t::success) {
ret = transact_nodes(new_parent_nodes, {}, parent_node, nullptr, cp_ctx);
Expand Down
9 changes: 9 additions & 0 deletions src/lib/device/virtual_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,8 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId con

Chunk* chunk;
uint64_t const dev_offset = to_dev_offset(bid, &chunk);
HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(),
dev_offset);
if (sisl_unlikely(dev_offset == INVALID_DEV_OFFSET)) {
return std::make_error_code(std::errc::resource_unavailable_try_again);
}
Expand All @@ -436,6 +438,9 @@ std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared<
if (hs()->crash_simulator().is_crashed()) { return std::error_code{}; }
#endif

HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", chunk->physical_dev_mutable()->pdev_id(),
chunk->start_offset() + offset_in_chunk);

if (sisl_unlikely(!is_chunk_available(chunk))) {
return std::make_error_code(std::errc::resource_unavailable_try_again);
}
Expand All @@ -457,6 +462,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId cons
auto const size = get_len(iov, iovcnt);
auto* pdev = chunk->physical_dev_mutable();

HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset);

COUNTER_INCREMENT(m_metrics, vdev_write_count, 1);
if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) {
COUNTER_INCREMENT(m_metrics, unalign_writes, 1);
Expand All @@ -479,6 +486,8 @@ std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< C
auto const size = get_len(iov, iovcnt);
auto* pdev = chunk->physical_dev_mutable();

HS_LOG(TRACE, device, "Writing sync in device: {}, offset = {}", pdev->pdev_id(), dev_offset);

COUNTER_INCREMENT(m_metrics, vdev_write_count, 1);
if (sisl_unlikely(!hs_utils::mod_aligned_sz(dev_offset, pdev->align_size()))) {
COUNTER_INCREMENT(m_metrics, unalign_writes, 1);
Expand Down
2 changes: 0 additions & 2 deletions src/lib/homestore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,8 +329,6 @@ void HomeStore::shutdown() {
#ifdef _PRERELEASE
flip::Flip::instance().stop_rpc_server();
#endif

HomeStore::reset_instance();
LOGINFO("Homestore is completed its shutdown");
}

Expand Down
15 changes: 14 additions & 1 deletion src/lib/index/index_cp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,6 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId,
}

if (up_buf) {
DEBUG_ASSERT(((buf->m_up_buffer == nullptr) || (buf->m_up_buffer == up_buf)), "Inconsistent up buffer");
auto real_up_buf = (up_buf->m_created_cp_id == cpg->id()) ? up_buf->m_up_buffer : up_buf;

#ifndef NDEBUG
Expand All @@ -279,6 +278,20 @@ void IndexCPContext::process_txn_record(txn_record const* rec, std::map< BlkId,
#endif

if (buf->m_up_buffer != real_up_buf) {
if (buf->m_up_buffer) {
buf->m_up_buffer->m_wait_for_down_buffers.decrement(1);
#ifndef NDEBUG
bool found{false};
for (auto it = buf->m_up_buffer->m_down_buffers.begin(); it != buf->m_up_buffer->m_down_buffers.end(); ++it) {
if (it->lock() == buf) {
buf->m_up_buffer->m_down_buffers.erase(it);
found = true;
break;
}
}
HS_DBG_ASSERT(found, "Down buffer is linked to Up buf, but up_buf doesn't have down_buf in its list");
#endif
}
real_up_buf->m_wait_for_down_buffers.increment(1);
buf->m_up_buffer = real_up_buf;
}
Expand Down
Loading

0 comments on commit 60eea4a

Please sign in to comment.