-
Notifications
You must be signed in to change notification settings - Fork 21
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Index write back cache fixes. #207
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,29 +64,68 @@ enum class index_buf_state_t : uint8_t { | |
}; | ||
|
||
///////////////////////// Btree Node and Buffer Portion ////////////////////////// | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. General comment, it would be very useful if we put a README under wb_cache directory to illustrate the hierarchy of btree_node -> IndexBuffer and NodeBuffer relationship and its clean/dirty copy behavior. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And it will be super helpful if we can explain first, second and third meaning while creating the chain with an example, so that after several month we can read the code easier. :) |
||
|
||
// Multiple IndexBuffer could point to the same NodeBuffer if its clean. | ||
struct NodeBuffer; | ||
typedef std::shared_ptr< NodeBuffer > NodeBufferPtr; | ||
struct NodeBuffer { | ||
uint8_t* m_bytes{nullptr}; // Actual data buffer | ||
std::atomic< index_buf_state_t > m_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist? | ||
NodeBuffer(uint32_t buf_size, uint32_t align_size); | ||
~NodeBuffer(); | ||
}; | ||
|
||
// IndexBuffer is for each CP. The dependent index buffers are chained using | ||
// m_next_buffer and each buffer is flushed only its wait_for_leaders reaches 0 | ||
// which means all its dependent buffers are flushed. | ||
struct IndexBuffer; | ||
typedef std::shared_ptr< IndexBuffer > IndexBufferPtr; | ||
|
||
struct IndexBuffer { | ||
uint8_t* m_node_buf{nullptr}; // Actual buffer | ||
index_buf_state_t m_buf_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist? | ||
BlkId m_blkid; // BlkId where this needs to be persisted | ||
std::weak_ptr< IndexBuffer > m_next_buffer; // Next buffer in the chain | ||
NodeBufferPtr m_node_buf; | ||
BlkId m_blkid; // BlkId where this needs to be persisted | ||
std::weak_ptr< IndexBuffer > m_next_buffer; // Next buffer in the chain | ||
// Number of leader buffers we are waiting for before we write this buffer | ||
sisl::atomic_counter< int > m_wait_for_leaders{0}; | ||
|
||
IndexBuffer(BlkId blkid, uint32_t buf_size, uint32_t align_size); | ||
IndexBuffer(NodeBufferPtr node_buf, BlkId blkid); | ||
~IndexBuffer(); | ||
|
||
BlkId blkid() const { return m_blkid; } | ||
uint8_t* raw_buffer() { return m_node_buf; } | ||
uint8_t* raw_buffer() { | ||
RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); | ||
return m_node_buf->m_bytes; | ||
} | ||
|
||
bool is_clean() const { | ||
RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); | ||
return (m_node_buf->m_state.load() == index_buf_state_t::CLEAN); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added some wrapper functions and to_string cleaner |
||
|
||
index_buf_state_t state() const { | ||
RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); | ||
return m_node_buf->m_state; | ||
} | ||
|
||
void set_state(index_buf_state_t state) { | ||
RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); | ||
m_node_buf->m_state = state; | ||
} | ||
|
||
bool is_clean() const { return (m_buf_state == index_buf_state_t::CLEAN); } | ||
std::string to_string() const { | ||
return fmt::format("IndexBuffer {} blkid={} state={} node_buf={} next_buffer={} wait_for={}", | ||
reinterpret_cast< void* >(const_cast< IndexBuffer* >(this)), m_blkid.to_integer(), | ||
static_cast< int >(m_buf_state), static_cast< void* >(m_node_buf), | ||
voidptr_cast(m_next_buffer.lock().get()), m_wait_for_leaders.get()); | ||
auto str = fmt::format("IndexBuffer {} blkid={}", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this)), | ||
m_blkid.to_integer()); | ||
if (m_node_buf == nullptr) { | ||
fmt::format_to(std::back_inserter(str), " node_buf=nullptr"); | ||
} else { | ||
fmt::format_to(std::back_inserter(str), " state={} node_buf={}", | ||
static_cast< int >(m_node_buf->m_state.load()), static_cast< void* >(m_node_buf->m_bytes)); | ||
} | ||
fmt::format_to(std::back_inserter(str), " next_buffer={} wait_for={}", | ||
m_next_buffer.lock() ? reinterpret_cast< void* >(m_next_buffer.lock().get()) : 0, | ||
m_wait_for_leaders.get()); | ||
return str; | ||
} | ||
}; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -114,7 +114,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { | |
// Need to put it in wb cache | ||
wb_cache().write_buf(node, idx_node->m_idx_buf, cp_ctx); | ||
idx_node->m_last_mod_cp_id = cp_ctx->id(); | ||
LOGTRACEMOD(wbcache, "{}", idx_node->m_idx_buf->to_string()); | ||
LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), idx_node->m_idx_buf->to_string()); | ||
} | ||
node->set_checksum(this->m_bt_cfg); | ||
return btree_status_t::success; | ||
|
@@ -129,17 +129,28 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { | |
auto& left_child_buf = left_child_idx_node->m_idx_buf; | ||
auto& parent_buf = parent_idx_node->m_idx_buf; | ||
|
||
LOGTRACEMOD(wbcache, "left {} parent {} ", left_child_buf->to_string(), parent_buf->to_string()); | ||
|
||
// Write new nodes in the list as standalone outside transacted pairs. | ||
// Write the new right child nodes, left node and parent in order. | ||
// Create the relationship of right child to the left node via prepend_to_chain below. | ||
// Parent and left node are linked in the prepare_node_txn | ||
for (const auto& right_child_node : new_nodes) { | ||
auto right_child = IndexBtreeNode::convert(right_child_node.get()); | ||
write_node_impl(right_child_node, context); | ||
wb_cache().prepend_to_chain(right_child->m_idx_buf, left_child_buf); | ||
LOGTRACEMOD(wbcache, "right {} left {} ", right_child->m_idx_buf->to_string(), left_child_buf->to_string()); | ||
} | ||
|
||
auto trace_index_bufs = [&]() { | ||
std::string str; | ||
str = fmt::format("cp {} left {} parent {}", cp_ctx->id(), left_child_buf->to_string(), | ||
parent_buf->to_string()); | ||
for (const auto& right_child_node : new_nodes) { | ||
auto right_child = IndexBtreeNode::convert(right_child_node.get()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add these trace logs only if its enabled. hopefully trace_index_bufs only called when logtrace enabled. |
||
fmt::format_to(std::back_inserter(str), " right {}", right_child->m_idx_buf->to_string()); | ||
} | ||
return str; | ||
}; | ||
|
||
LOGTRACEMOD(wbcache, "{}", trace_index_bufs()); | ||
write_node_impl(left_child_node, context); | ||
write_node_impl(parent_node, context); | ||
|
||
|
@@ -181,18 +192,17 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { | |
// realloc_node(node); | ||
// } | ||
|
||
// If the backing buffer is already in a clean state, we don't need to make a copy of it | ||
if (idx_node->m_idx_buf->is_clean()) { return btree_status_t::success; } | ||
|
||
// Make a new btree buffer and copy the contents and swap it to make it the current node's buffer. The | ||
// We create IndexBuffer for each CP. But if the backing buffer is already in a clean state | ||
// we dont copy the node buffer. Copy buffer will handle it. If the node buffer is dirty, | ||
// make a new btree buffer and copy the contents and swap it to make it the current node's buffer. The | ||
// buffer prior to this copy, would have been written and already added into the dirty buffer list. | ||
idx_node->m_idx_buf = wb_cache().copy_buffer(idx_node->m_idx_buf); | ||
idx_node->m_idx_buf = wb_cache().copy_buffer(idx_node->m_idx_buf, cp_ctx); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Creating per cp IndexBuffer for writes. |
||
idx_node->m_last_mod_cp_id = -1; | ||
|
||
node->m_phys_node_buf = idx_node->m_idx_buf->raw_buffer(); | ||
node->set_checksum(this->m_bt_cfg); | ||
|
||
LOGTRACEMOD(wbcache, "buf {} ", idx_node->m_idx_buf->to_string()); | ||
LOGTRACEMOD(wbcache, "cp {} {} ", cp_ctx->id(), idx_node->m_idx_buf->to_string()); | ||
|
||
#ifndef NO_CHECKSUM | ||
if (!node->verify_node(this->m_bt_cfg)) { | ||
|
@@ -221,6 +231,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { | |
auto& child_buf = child_idx_node->m_idx_buf; | ||
auto& parent_buf = parent_idx_node->m_idx_buf; | ||
|
||
LOGTRACEMOD(wbcache, "cp {} left {} parent {} ", cp_ctx->id(), child_buf->to_string(), parent_buf->to_string()); | ||
|
||
auto [child_copied, parent_copied] = wb_cache().create_chain(child_buf, parent_buf, cp_ctx); | ||
if (child_copied) { | ||
child_node->m_phys_node_buf = child_buf->raw_buffer(); | ||
|
@@ -231,7 +243,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { | |
parent_idx_node->m_last_mod_cp_id = -1; | ||
} | ||
|
||
LOGTRACEMOD(wbcache, "child {} parent {} ", child_buf->to_string(), parent_buf->to_string()); | ||
return btree_status_t::success; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Too much trace in logs during shutdown.