From edf0299686929b4df950c42b1e689be70fe484c8 Mon Sep 17 00:00:00 2001 From: Harihara Kadayam Date: Mon, 13 Nov 2023 15:33:43 -0800 Subject: [PATCH 1/9] Initial setup to get the raft based replication in on subsequent PRs (#223) --- conanfile.py | 3 +- src/include/homestore/blk.h | 1 + src/include/homestore/btree/btree_kv.hpp | 8 +- .../homestore/btree/detail/btree_node.hpp | 6 +- .../homestore/btree/detail/prefix_node.hpp | 29 ++- .../homestore/btree/detail/simple_node.hpp | 22 +-- .../homestore/btree/detail/varlen_node.hpp | 60 +++--- src/include/homestore/homestore.hpp | 10 +- src/include/homestore/replication/repl_dev.h | 82 +++++++-- src/include/homestore/replication_service.hpp | 62 ++++--- src/include/homestore/superblk_handler.hpp | 81 +++++++- src/lib/blkalloc/bitmap_blk_allocator.cpp | 4 +- src/lib/blkalloc/blk.cpp | 8 +- src/lib/device/device.h | 2 +- src/lib/device/physical_dev.cpp | 12 +- src/lib/device/physical_dev.hpp | 6 +- src/lib/homestore.cpp | 16 +- src/lib/logstore/log_dev.cpp | 28 +-- src/lib/logstore/log_dev.hpp | 16 +- src/lib/logstore/log_group.cpp | 20 +- src/lib/logstore/log_store.cpp | 2 +- src/lib/logstore/log_stream.cpp | 16 +- src/lib/meta/meta_blk_service.cpp | 35 ++-- src/lib/replication/CMakeLists.txt | 25 ++- .../replication/repl_dev/solo_repl_dev.cpp | 54 +++--- src/lib/replication/repl_dev/solo_repl_dev.h | 44 +---- .../replication/service/generic_repl_svc.cpp | 174 ++++++++++++++++++ ...repl_service_impl.h => generic_repl_svc.h} | 66 +++---- src/test_common/bits_generator.hpp | 39 ---- src/tests/CMakeLists.txt | 1 + src/tests/btree_helpers/btree_test_kvs.hpp | 51 +++-- src/tests/test_blkid.cpp | 2 +- src/tests/test_common/bits_generator.hpp | 2 +- .../test_common/homestore_test_common.hpp | 8 +- src/tests/test_log_store.cpp | 45 +++-- src/tests/test_meta_blk_mgr.cpp | 2 +- src/tests/test_solo_repl_dev.cpp | 45 +++-- 37 files changed, 665 insertions(+), 422 deletions(-) create mode 100644 
src/lib/replication/service/generic_repl_svc.cpp rename src/lib/replication/service/{repl_service_impl.h => generic_repl_svc.h} (54%) delete mode 100644 src/test_common/bits_generator.hpp diff --git a/conanfile.py b/conanfile.py index db9fc4a7a..19e28962b 100644 --- a/conanfile.py +++ b/conanfile.py @@ -56,7 +56,8 @@ def build_requirements(self): def requirements(self): self.requires("iomgr/[~=10, include_prerelease=True]@oss/master") - self.requires("sisl/[~=10, include_prerelease=True]@oss/master") + self.requires("sisl/[~=11, include_prerelease=True]@oss/master") + self.requires("nuraft_mesg/[~=2, include_prerelease=True]@oss/main") self.requires("farmhash/cci.20190513@") self.requires("isa-l/2.30.0") diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index fdcaee7d7..eea67e6d0 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -133,6 +133,7 @@ struct MultiBlkId : public BlkId { BlkId to_single_blkid() const; static uint32_t expected_serialized_size(uint16_t num_pieces); + static uint32_t max_serialized_size(); static int compare(MultiBlkId const& one, MultiBlkId const& two); struct iterator { diff --git a/src/include/homestore/btree/btree_kv.hpp b/src/include/homestore/btree/btree_kv.hpp index 18dd832a8..c995a7cc9 100644 --- a/src/include/homestore/btree/btree_kv.hpp +++ b/src/include/homestore/btree/btree_kv.hpp @@ -256,8 +256,8 @@ class BtreeLinkInfo : public BtreeValue { sisl::blob serialize() const override { sisl::blob b; - b.size = sizeof(bnode_link_info); - b.bytes = uintptr_cast(const_cast< bnode_link_info* >(&info)); + b.set_size(sizeof(bnode_link_info)); + b.set_bytes(r_cast< const uint8_t* >(&info)); return b; } uint32_t serialized_size() const override { return sizeof(bnode_link_info); } @@ -265,8 +265,8 @@ class BtreeLinkInfo : public BtreeValue { std::string to_string() const override { return fmt::format("{}.{}", info.m_bnodeid, info.m_link_version); } void deserialize(const sisl::blob& b, bool copy) 
override { - DEBUG_ASSERT_EQ(b.size, sizeof(bnode_link_info), "BtreeLinkInfo deserialize received invalid blob"); - auto other = r_cast< bnode_link_info* >(b.bytes); + DEBUG_ASSERT_EQ(b.size(), sizeof(bnode_link_info), "BtreeLinkInfo deserialize received invalid blob"); + auto other = r_cast< bnode_link_info const* >(b.cbytes()); set_bnode_id(other->m_bnodeid); set_link_version(other->m_link_version); } diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp index 8f713e534..7fe4ae7bb 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -273,9 +273,9 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { virtual BtreeLinkInfo get_edge_value() const { return BtreeLinkInfo{edge_id(), edge_link_version()}; } virtual void set_edge_value(const BtreeValue& v) { - const auto b = v.serialize(); - auto l = r_cast< BtreeLinkInfo::bnode_link_info* >(b.bytes); - DEBUG_ASSERT_EQ(b.size, sizeof(BtreeLinkInfo::bnode_link_info)); + auto const b = v.serialize(); + auto const l = r_cast< BtreeLinkInfo::bnode_link_info const* >(b.cbytes()); + DEBUG_ASSERT_EQ(b.size(), sizeof(BtreeLinkInfo::bnode_link_info)); set_edge_info(*l); } diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 62003da7a..8f2f3b2fd 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -79,21 +79,20 @@ class FixedPrefixNode : public VariantNode< K, V > { sisl::blob const kblob = s_cast< K const& >(key).serialize_prefix(); sisl::blob const vblob = s_cast< V const& >(val).serialize_prefix(); - DEBUG_ASSERT_EQ(kblob.size, key_size(), "Prefix key size mismatch with serialized prefix size"); - DEBUG_ASSERT_EQ(vblob.size, value_size(), "Prefix value size mismatch with serialized prefix size"); + DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Prefix key 
size mismatch with serialized prefix size"); + DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Prefix value size mismatch with serialized prefix size"); uint8_t* cur_ptr = uintptr_cast(this) + sizeof(prefix_entry); - std::memcpy(cur_ptr, kblob.bytes, kblob.size); - cur_ptr += kblob.size; - std::memcpy(cur_ptr, vblob.bytes, vblob.size); + std::memcpy(cur_ptr, kblob.cbytes(), kblob.size()); + cur_ptr += kblob.size(); + std::memcpy(cur_ptr, vblob.cbytes(), vblob.size()); } } sisl::blob key_buf() const { - return sisl::blob{const_cast< uint8_t* >(r_cast< uint8_t const* >(this) + sizeof(prefix_entry)), - key_size()}; + return sisl::blob{r_cast< uint8_t const* >(this) + sizeof(prefix_entry), key_size()}; } - sisl::blob val_buf() const { return sisl::blob{key_buf().bytes + key_buf().size, value_size()}; } + sisl::blob val_buf() const { return sisl::blob{key_buf().cbytes() + key_buf().size(), value_size()}; } }; struct suffix_entry { @@ -131,19 +130,19 @@ class FixedPrefixNode : public VariantNode< K, V > { kblob = key.serialize(); vblob = val.serialize(); } - DEBUG_ASSERT_EQ(kblob.size, key_size(), "Suffix key size mismatch with serialized suffix size"); - DEBUG_ASSERT_EQ(vblob.size, value_size(), "Suffix value size mismatch with serialized suffix size"); + DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Suffix key size mismatch with serialized suffix size"); + DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Suffix value size mismatch with serialized suffix size"); - std::memcpy(cur_ptr, kblob.bytes, kblob.size); - cur_ptr += kblob.size; - std::memcpy(cur_ptr, vblob.bytes, vblob.size); + std::memcpy(cur_ptr, kblob.cbytes(), kblob.size()); + cur_ptr += kblob.size(); + std::memcpy(cur_ptr, vblob.cbytes(), vblob.size()); } sisl::blob key_buf() const { return sisl::blob{const_cast< uint8_t* >(r_cast< uint8_t const* >(this) + sizeof(suffix_entry)), key_size()}; } - sisl::blob val_buf() const { return sisl::blob{key_buf().bytes + key_buf().size, value_size()}; } + sisl::blob val_buf() 
const { return sisl::blob{key_buf().bytes() + key_buf().size(), value_size()}; } }; #pragma pack() @@ -778,7 +777,7 @@ class FixedPrefixNode : public VariantNode< K, V > { K prevKey; while (i < this->total_entries()) { K key = BtreeNode::get_nth_key< K >(i, false); - uint64_t kp = *(uint64_t*)key.serialize().bytes; + uint64_t kp = *(uint64_t*)key.serialize().bytes(); if (i > 0 && prevKey.compare(key) > 0) { DEBUG_ASSERT(false, "Found non sorted entry: {} -> {}", kp, to_string()); } diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index b8149374c..2c3014bf2 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -183,9 +183,7 @@ class SimpleNode : public VariantNode< K, V > { void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copy) const override { DEBUG_ASSERT_LT(ind, this->total_entries(), "node={}", to_string()); - sisl::blob b; - b.bytes = (uint8_t*)(this->node_data_area_const() + (get_nth_obj_size(ind) * ind)); - b.size = get_nth_key_size(ind); + sisl::blob b{this->node_data_area_const() + (get_nth_obj_size(ind) * ind), get_nth_key_size(ind)}; out_key.deserialize(b, copy); } @@ -324,11 +322,11 @@ class SimpleNode : public VariantNode< K, V > { set_nth_value(ind, v); } else { uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind); - sisl::blob key_blob = k.serialize(); - memcpy((void*)entry, key_blob.bytes, key_blob.size); + sisl::blob const key_blob = k.serialize(); + memcpy((void*)entry, key_blob.cbytes(), key_blob.size()); - sisl::blob val_blob = v.serialize(); - memcpy((void*)(entry + key_blob.size), val_blob.bytes, val_blob.size); + sisl::blob const val_blob = v.serialize(); + memcpy((void*)(entry + key_blob.size()), val_blob.cbytes(), val_blob.size()); } } @@ -345,20 +343,20 @@ class SimpleNode : public VariantNode< K, V > { void set_nth_key(uint32_t ind, BtreeKey* key) { uint8_t* entry = 
this->node_data_area() + (get_nth_obj_size(ind) * ind); - sisl::blob b = key->serialize(); - memcpy(entry, b.bytes, b.size); + sisl::blob const b = key->serialize(); + memcpy(entry, b.cbytes(), b.size()); } void set_nth_value(uint32_t ind, const BtreeValue& v) { sisl::blob b = v.serialize(); if (ind >= this->total_entries()) { RELEASE_ASSERT_EQ(this->is_leaf(), false, "setting value outside bounds on leaf node"); - DEBUG_ASSERT_EQ(b.size, sizeof(BtreeLinkInfo::bnode_link_info), + DEBUG_ASSERT_EQ(b.size(), sizeof(BtreeLinkInfo::bnode_link_info), "Invalid value size being set for non-leaf node"); - this->set_edge_info(*r_cast< BtreeLinkInfo::bnode_link_info* >(b.bytes)); + this->set_edge_info(*r_cast< BtreeLinkInfo::bnode_link_info const* >(b.cbytes())); } else { uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind) + get_nth_key_size(ind); - std::memcpy(entry, b.bytes, b.size); + std::memcpy(entry, b.cbytes(), b.size()); } } }; diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp index de155f9c5..b1ebd6e6d 100644 --- a/src/include/homestore/btree/detail/varlen_node.hpp +++ b/src/include/homestore/btree/detail/varlen_node.hpp @@ -89,7 +89,7 @@ class VariableNode : public VariantNode< K, V > { K prevKey; while (i < this->total_entries()) { K key = BtreeNode::get_nth_key< K >(i, false); - uint64_t kp = *(uint64_t*)key.serialize().bytes; + uint64_t kp = *(uint64_t*)key.serialize().bytes(); if (i > 0 && prevKey.compare(key) > 0) { DEBUG_ASSERT(false, "Found non sorted entry: {} -> {}", kp, to_string()); } @@ -136,16 +136,16 @@ class VariableNode : public VariantNode< K, V > { sisl::blob kblob = key.serialize(); sisl::blob vblob = val.serialize(); - DEBUG_ASSERT_EQ(kblob.size, key.serialized_size(), + DEBUG_ASSERT_EQ(kblob.size(), key.serialized_size(), "Key Serialized size returned different after serialization"); - DEBUG_ASSERT_EQ(vblob.size, val.serialized_size(), + 
DEBUG_ASSERT_EQ(vblob.size(), val.serialized_size(), "Value Serialized size returned different after serialization"); // we can avoid memcpy if addresses of val_ptr and vblob.bytes is same. In place update - if (key_ptr != kblob.bytes) { std::memcpy(key_ptr, kblob.bytes, kblob.size); } - if (val_ptr != vblob.bytes) { std::memcpy(val_ptr, vblob.bytes, vblob.size); } - set_nth_key_len(get_nth_record_mutable(ind), kblob.size); - set_nth_value_len(get_nth_record_mutable(ind), vblob.size); + if (key_ptr != kblob.cbytes()) { std::memcpy(key_ptr, kblob.cbytes(), kblob.size()); } + if (val_ptr != vblob.cbytes()) { std::memcpy(val_ptr, vblob.cbytes(), vblob.size()); } + set_nth_key_len(get_nth_record_mutable(ind), kblob.size()); + set_nth_value_len(get_nth_record_mutable(ind), vblob.size()); get_var_node_header()->m_available_space += cur_obj_size - new_obj_size; this->inc_gen(); } else { @@ -224,13 +224,8 @@ class VariableNode : public VariantNode< K, V > { bool full_move{false}; while (ind >= end_ind) { // Get the ith key and value blob and then remove the entry from here and insert to the other node - sisl::blob kb; - kb.bytes = (uint8_t*)get_nth_obj(ind); - kb.size = get_nth_key_size(ind); - - sisl::blob vb; - vb.bytes = kb.bytes + kb.size; - vb.size = get_nth_value_size(ind); + sisl::blob const kb{get_nth_obj(ind), get_nth_key_size(ind)}; + sisl::blob const vb{kb.cbytes() + kb.size(), get_nth_value_size(ind)}; auto sz = other.insert(0, kb, vb); if (!sz) { break; } @@ -265,15 +260,10 @@ class VariableNode : public VariantNode< K, V > { uint32_t ind = this->total_entries() - 1; while (ind > 0) { - sisl::blob kb; - kb.bytes = (uint8_t*)get_nth_obj(ind); - kb.size = get_nth_key_size(ind); - - sisl::blob vb; - vb.bytes = kb.bytes + kb.size; - vb.size = get_nth_value_size(ind); + sisl::blob const kb{get_nth_obj(ind), get_nth_key_size(ind)}; + sisl::blob const vb{kb.cbytes() + kb.size(), get_nth_value_size(ind)}; - if ((kb.size + vb.size + this->get_record_size()) > 
size_to_move) { + if ((kb.size() + vb.size() + this->get_record_size()) > size_to_move) { // We reached threshold of how much we could move break; } @@ -322,11 +312,11 @@ class VariableNode : public VariantNode< K, V > { auto idx = start_idx; uint32_t n = 0; while (idx < other.total_entries()) { - sisl::blob kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)}; - sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_size(idx)}; + sisl::blob const kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)}; + sisl::blob const vb{kb.cbytes() + kb.size(), other.get_nth_value_size(idx)}; // We reached threshold of how much we could move - if ((kb.size + vb.size + other.get_record_size()) > copy_size) { break; } + if ((kb.size() + vb.size() + other.get_record_size()) > copy_size) { break; } auto sz = insert(this->total_entries(), kb, vb); if (sz == 0) { break; } @@ -352,8 +342,8 @@ class VariableNode : public VariantNode< K, V > { auto idx = start_idx; uint32_t n = 0; while (n < nentries) { - sisl::blob kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)}; - sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_size(idx)}; + sisl::blob const kb{other.get_nth_obj(idx), other.get_nth_key_size(idx)}; + sisl::blob const vb{kb.cbytes() + kb.size(), other.get_nth_value_size(idx)}; auto sz = insert(this->total_entries(), kb, vb); if (sz == 0) { break; } @@ -461,8 +451,8 @@ class VariableNode : public VariantNode< K, V > { void set_nth_key(uint32_t ind, const BtreeKey& key) { const auto kb = key.serialize(); assert(ind < this->total_entries()); - assert(kb.size == get_nth_key_size(ind)); - memcpy(uintptr_cast(get_nth_obj(ind)), kb.bytes, kb.size); + assert(kb.size() == get_nth_key_size(ind)); + memcpy(uintptr_cast(get_nth_obj(ind)), kb.cbytes(), kb.size()); } bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override { @@ -587,7 +577,7 @@ class VariableNode : public VariantNode< K, V > { assert(ind <= 
this->total_entries()); LOGTRACEMOD(btree, "{}:{}:{}:{}", ind, get_var_node_header()->tail_offset(), get_arena_free_space(), get_var_node_header()->available_space()); - uint16_t obj_size = key_blob.size + val_blob.size; + uint16_t obj_size = key_blob.size() + val_blob.size(); uint16_t to_insert_size = obj_size + this->get_record_size(); if (to_insert_size > get_var_node_header()->available_space()) { RELEASE_ASSERT(false, "insert failed insert size {} available size {}", to_insert_size, @@ -613,15 +603,15 @@ class VariableNode : public VariantNode< K, V > { get_var_node_header()->m_available_space -= (obj_size + this->get_record_size()); // Create a new record - set_nth_key_len(rec_ptr, key_blob.size); - set_nth_value_len(rec_ptr, val_blob.size); + set_nth_key_len(rec_ptr, key_blob.size()); + set_nth_value_len(rec_ptr, val_blob.size()); set_record_data_offset(rec_ptr, get_var_node_header()->m_tail_arena_offset); // Copy the contents of key and value in the offset uint8_t* raw_data_ptr = offset_to_ptr_mutable(get_var_node_header()->m_tail_arena_offset); - memcpy(raw_data_ptr, key_blob.bytes, key_blob.size); - raw_data_ptr += key_blob.size; - memcpy(raw_data_ptr, val_blob.bytes, val_blob.size); + memcpy(raw_data_ptr, key_blob.cbytes(), key_blob.size()); + raw_data_ptr += key_blob.size(); + memcpy(raw_data_ptr, val_blob.cbytes(), val_blob.size()); // Increment the entries and generation number this->inc_entries(); diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 9aae7661a..2c9d51d05 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -51,6 +51,8 @@ class HomeStore; class CPManager; class VirtualDev; class ChunkSelector; +class ReplDevListener; +class ReplApplication; using HomeStoreSafePtr = std::shared_ptr< HomeStore >; @@ -96,12 +98,6 @@ struct HS_SERVICE { } }; -VENUM(repl_impl_type, uint8_t, - server_side, // Completely homestore controlled replication - client_assisted, // 
Client assisting in replication - solo // For single node - no replication -); - /* * IO errors handling by homestore. * Write error :- Reason :- Disk error, space full,btree node read fail @@ -149,7 +145,7 @@ class HomeStore { HomeStore& with_data_service(cshared< ChunkSelector >& custom_chunk_selector = nullptr); HomeStore& with_log_service(); HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs); - HomeStore& with_repl_data_service(repl_impl_type repl_type, + HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); bool start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb = nullptr); diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 5bd781928..4ebbd1438 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -1,13 +1,53 @@ #pragma once +#include + #include #include +#include +#include #include #include +namespace nuraft { +template < typename T > +using ptr = std::shared_ptr< T >; + +// class buffer; +class buffer { +public: + static ptr< buffer > alloc(uint32_t size) { return std::make_shared< buffer >(); } +}; // Temporary till we get nuraft included by homestore impl + +} // namespace nuraft + namespace homestore { class ReplDev; +struct repl_req_ctx; +using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; +using repl_req_ptr_t = boost::intrusive_ptr< repl_req_ctx >; + +VENUM(repl_req_state_t, uint32_t, + INIT = 0, // Initial state + DATA_RECEIVED = 1 << 1, // Data has been received and being written to the storage + DATA_WRITTEN = 1 << 2, // Data has been written to the storage + LOG_RECEIVED = 1 << 3, // Log is received and waiting for data + LOG_FLUSHED = 1 << 4 // Log has been flushed +) + +struct repl_key { + int32_t server_id{0}; // Server Id which this req is originated from + uint64_t term; // RAFT term number 
+ uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + + struct Hasher { + size_t operator()(repl_key const& rk) const { + return std::hash< int32_t >()(rk.server_id) ^ std::hash< uint64_t >()(rk.term) ^ + std::hash< uint64_t >()(rk.dsn); + } + }; +}; struct repl_journal_entry; struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::thread_safe_counter > { @@ -17,17 +57,35 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: virtual ~repl_req_ctx(); int64_t get_lsn() const { return lsn; } -private: - sisl::blob header; // User header - sisl::blob key; // Key to replicate - sisl::sg_list value; // Raw value - applicable only to leader req - MultiBlkId local_blkid; // List of corresponding local blkids for the value - RemoteBlkId remote_blkid; // List of remote blkid for the value - std::unique_ptr< uint8_t[] > journal_buf; // Buf for the journal entry - repl_journal_entry* journal_entry{nullptr}; // pointer to the journal entry - int64_t lsn{0}; // Lsn for this replication req - - void alloc_journal_entry(uint32_t size); + uint64_t dsn() const { return rkey.dsn; } + uint64_t term() const { return rkey.term; } + void alloc_journal_entry(uint32_t size, bool is_raft_buf); + raft_buf_ptr_t& raft_journal_buf(); + uint8_t* raw_journal_buf(); + +public: + repl_key rkey; // Unique key for the request + sisl::blob header; // User header + sisl::blob key; // User supplied key for this req + int64_t lsn{0}; // Lsn for this replication req + + //////////////// Value related section ///////////////// + sisl::sg_list value; // Raw value - applicable only to leader req + MultiBlkId local_blkid; // Local BlkId for the value + RemoteBlkId remote_blkid; // Corresponding remote blkid for the value + + //////////////// Journal/Buf related section ///////////////// + std::variant< std::unique_ptr< uint8_t[] >, raft_buf_ptr_t > journal_buf; // Buf for the journal entry + repl_journal_entry* 
journal_entry{nullptr}; // pointer to the journal entry + + //////////////// Replication state related section ///////////////// + std::mutex state_mtx; + std::atomic< repl_req_state_t > state{repl_req_state_t::INIT}; // State of the replication request + folly::Promise< folly::Unit > data_written_promise; // Promise to be fulfilled when data is written + + //////////////// Communication packet/builder section ///////////////// + sisl::io_blob_list_t pkts; + flatbuffers::FlatBufferBuilder fb_builder; }; // @@ -134,7 +192,7 @@ class ReplDev { /// @param ctx - User supplied context which will be passed to listener /// callbacks virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - intrusive< repl_req_ctx > ctx) = 0; + repl_req_ptr_t ctx) = 0; /// @brief Reads the data and returns a future to continue on /// @param bid Block id to read diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 379f31767..fe1469e5d 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -5,8 +5,9 @@ #include #include - +#include #include +#include namespace homestore { @@ -36,15 +37,23 @@ struct hs_stats; template < typename V, typename E > using Result = folly::Expected< V, E >; -template < class V, class E > -using AsyncResult = folly::Future< Result< V, E > >; - template < class V > using ReplResult = Result< V, ReplServiceError >; -template < class V > +template < class V, class E > +using AsyncResult = folly::SemiFuture< Result< V, E > >; + +template < class V = folly::Unit > using AsyncReplResult = AsyncResult< V, ReplServiceError >; +VENUM(repl_impl_type, uint8_t, + server_side, // Completely homestore controlled replication + client_assisted, // Client assisting in replication + solo // For single node - no replication +); + +class ReplApplication; + class ReplicationService { public: ReplicationService() = 
default; @@ -56,26 +65,9 @@ class ReplicationService { /// @param listener state machine listener of all the events happening on the repl_dev (commit, precommit etc) /// @return A Future ReplDev on success or Future ReplServiceError upon error virtual AsyncReplResult< shared< ReplDev > > create_repl_dev(uuid_t group_id, - std::set< std::string, std::less<> >&& members, - std::unique_ptr< ReplDevListener > listener) = 0; - - /// @brief Opens the Repl Device for a given group id. It is expected that the repl dev is already created and used - /// this method for recovering. It is possible that repl_dev is not ready and in that case it will provide Repl - /// Device after it is ready and thus returns a Future. - /// - /// NOTE 1: If callers does an open for a repl device which was not created before, then at the end of - /// initialization an error is returned saying ReplServiceError::SERVER_NOT_FOUND - /// - /// NOTE 2: If the open repl device is called after Replication service is started, then it returns an error - /// ReplServiceError::BAD_REQUEST - /// @param group_id Group id to open the repl device with - /// @param listener state machine listener of all the events happening on the repl_dev (commit, precommit etc) - /// @return A Future ReplDev on successful open of ReplDev or Future ReplServiceError upon error - virtual AsyncReplResult< shared< ReplDev > > open_repl_dev(uuid_t group_id, - std::unique_ptr< ReplDevListener > listener) = 0; + std::set< uuid_t, std::less<> >&& members) = 0; - virtual folly::Future< ReplServiceError > replace_member(uuid_t group_id, std::string const& member_out, - std::string const& member_in) const = 0; + virtual AsyncReplResult<> replace_member(uuid_t group_id, uuid_t member_out, uuid_t member_in) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in @@ -89,5 +81,27 @@ class ReplicationService { /// @brief get the capacity stats form underlying 
backend; /// @return the capacity stats; virtual hs_stats get_cap_stats() const = 0; + + virtual meta_sub_type get_meta_blk_name() const = 0; }; + +//////////////// Application which uses Replication needs to be provide the following callbacks //////////////// +class ReplApplication { +public: + // Returns the required implementation type of replication + virtual repl_impl_type get_impl_type() const = 0; + + // Is the replica recovery needs timeline consistency. This is used to determine if the replica needs to be + // recovered by key or by block of data. At present only non-timeline consistent replication is supported. + virtual bool need_timeline_consistency() const = 0; + + // Called when the repl dev is found upon restart of the homestore instance. The caller should return an instance of + // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. + virtual std::unique_ptr< ReplDevListener > create_repl_dev_listener(uuid_t group_id) = 0; + + virtual std::string lookup_peer(uuid_t uuid) const = 0; + + virtual uint16_t lookup_port() const = 0; +}; + } // namespace homestore diff --git a/src/include/homestore/superblk_handler.hpp b/src/include/homestore/superblk_handler.hpp index d0c234dd9..f76262ef5 100644 --- a/src/include/homestore/superblk_handler.hpp +++ b/src/include/homestore/superblk_handler.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -44,7 +45,7 @@ class superblk { m_meta_mgr_cookie = voidptr_cast(meta_cookie); m_raw_buf = meta_service().is_aligned_buf_needed(buf.size()) ? 
buf.extract(meta_service().align_size()) : buf.extract(0); - m_sb = r_cast< T* >(m_raw_buf->bytes); + m_sb = r_cast< T* >(m_raw_buf->bytes()); return m_sb; } @@ -55,7 +56,7 @@ class superblk { } else { m_raw_buf = sisl::make_byte_array(uint32_cast(size), 0, sisl::buftag::metablk); } - m_sb = new (m_raw_buf->bytes) T(); + m_sb = new (m_raw_buf->bytes()) T(); return m_sb; } @@ -68,14 +69,14 @@ class superblk { m_sb = nullptr; } - uint32_t size() const { return m_raw_buf->size; } + uint32_t size() const { return m_raw_buf->size(); } sisl::byte_array raw_buf() { return m_raw_buf; } void write() { if (m_meta_mgr_cookie) { - meta_service().update_sub_sb(m_raw_buf->bytes, m_raw_buf->size, m_meta_mgr_cookie); + meta_service().update_sub_sb(m_raw_buf->cbytes(), m_raw_buf->size(), m_meta_mgr_cookie); } else { - meta_service().add_sub_sb(m_metablk_name, m_raw_buf->bytes, m_raw_buf->size, m_meta_mgr_cookie); + meta_service().add_sub_sb(m_metablk_name, m_raw_buf->cbytes(), m_raw_buf->size(), m_meta_mgr_cookie); } } @@ -92,4 +93,74 @@ class superblk { std::string m_metablk_name; }; +class json_superblk { +private: + void* m_meta_mgr_cookie{nullptr}; + nlohmann::json m_json_sb; + std::string m_metablk_name; + +public: + static uint64_t next_count() { + static std::atomic< uint64_t > s_count{0}; + return ++s_count; + } + + json_superblk(const std::string& meta_name = "") { set_name(meta_name); } + + void set_name(const std::string& meta_name) { + if (meta_name.empty()) { + m_metablk_name = "meta_blk_" + std::to_string(next_count()); + } else { + m_metablk_name = meta_name; + } + } + + nlohmann::json& load(const sisl::byte_view& buf, void* meta_cookie) { + m_meta_mgr_cookie = voidptr_cast(meta_cookie); + std::string_view const b{c_charptr_cast(buf.bytes()), buf.size()}; + + try { + m_json_sb = nlohmann::json::from_msgpack(b); + } catch (nlohmann::json::exception const& e) { + DEBUG_ASSERT(false, "Failed to load superblk for meta_blk={}", m_metablk_name); + return m_json_sb; + } + 
return m_json_sb; + } + + nlohmann::json& create() { return m_json_sb; } + + void destroy() { + if (m_meta_mgr_cookie) { + meta_service().remove_sub_sb(m_meta_mgr_cookie); + m_meta_mgr_cookie = nullptr; + } + m_json_sb = nlohmann::json{}; + } + + uint32_t size() const { return m_json_sb.size(); } + + void write() { + auto do_write = [this](sisl::blob const& b) { + if (m_meta_mgr_cookie) { + meta_service().update_sub_sb(b.cbytes(), b.size(), m_meta_mgr_cookie); + } else { + meta_service().add_sub_sb(m_metablk_name, b.cbytes(), b.size(), m_meta_mgr_cookie); + } + }; + + auto const packed_data = nlohmann::json::to_msgpack(m_json_sb); + auto const size = packed_data.size(); + if (meta_service().is_aligned_buf_needed(size)) { + sisl::io_blob_safe buffer(size, meta_service().align_size()); + std::memcpy(buffer.bytes(), packed_data.data(), size); + do_write(buffer); + } else { + do_write(sisl::blob{r_cast< uint8_t const* >(packed_data.data()), uint32_cast(size)}); + } + } + + nlohmann::json& operator*() { return m_json_sb; } +}; + } // namespace homestore diff --git a/src/lib/blkalloc/bitmap_blk_allocator.cpp b/src/lib/blkalloc/bitmap_blk_allocator.cpp index 78f747c08..42b140594 100644 --- a/src/lib/blkalloc/bitmap_blk_allocator.cpp +++ b/src/lib/blkalloc/bitmap_blk_allocator.cpp @@ -60,9 +60,9 @@ void BitmapBlkAllocator::cp_flush(CP*) { if (m_is_disk_bm_dirty.load()) { sisl::byte_array bitmap_buf = acquire_underlying_buffer(); if (m_meta_blk_cookie) { - meta_service().update_sub_sb(bitmap_buf->bytes, bitmap_buf->size, m_meta_blk_cookie); + meta_service().update_sub_sb(bitmap_buf->cbytes(), bitmap_buf->size(), m_meta_blk_cookie); } else { - meta_service().add_sub_sb(get_name(), bitmap_buf->bytes, bitmap_buf->size, m_meta_blk_cookie); + meta_service().add_sub_sb(get_name(), bitmap_buf->cbytes(), bitmap_buf->size(), m_meta_blk_cookie); } m_is_disk_bm_dirty.store(false); // No longer dirty now, needs to be set before releasing the buffer release_underlying_buffer(); diff 
--git a/src/lib/blkalloc/blk.cpp b/src/lib/blkalloc/blk.cpp index e8143e59c..0bdc32a33 100644 --- a/src/lib/blkalloc/blk.cpp +++ b/src/lib/blkalloc/blk.cpp @@ -32,7 +32,7 @@ uint32_t BlkId::serialized_size() const { return sizeof(BlkId); } uint32_t BlkId::expected_serialized_size() { return sizeof(BlkId); } void BlkId::deserialize(sisl::blob const& b, bool copy) { - serialized* other = r_cast< serialized* >(b.bytes); + serialized* other = r_cast< serialized const* >(b.cbytes()); s = *other; } @@ -100,9 +100,9 @@ uint32_t MultiBlkId::serialized_size() const { } void MultiBlkId::deserialize(sisl::blob const& b, bool copy) { - MultiBlkId* other = r_cast< MultiBlkId* >(b.bytes); + MultiBlkId* other = r_cast< MultiBlkId const* >(b.cbytes()); s = other->s; - if (b.size == sizeof(BlkId)) { + if (b.size() == sizeof(BlkId)) { n_addln_piece = 0; } else { n_addln_piece = other->n_addln_piece; @@ -117,6 +117,8 @@ uint32_t MultiBlkId::expected_serialized_size(uint16_t num_pieces) { return sz; } +uint32_t MultiBlkId::max_serialized_size() { return expected_serialized_size(max_pieces); } + uint16_t MultiBlkId::num_pieces() const { return BlkId::is_valid() ? 
n_addln_piece + 1 : 0; } bool MultiBlkId::has_room() const { return (n_addln_piece < max_addln_pieces); } diff --git a/src/lib/device/device.h b/src/lib/device/device.h index ad8749576..86b3f45d5 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -71,7 +71,7 @@ struct vdev_info { void set_pdev_choice(vdev_multi_pdev_opts_t opts) { multi_pdev_choice = enum_value(opts); } void set_user_private(const sisl::blob& data) { - std::memcpy(&user_private, data.bytes, std::min(data.size, uint32_cast(user_private_size))); + std::memcpy(&user_private, data.cbytes(), std::min(data.size(), uint32_cast(user_private_size))); } uint8_t* get_user_private_mutable() { return &(user_private[0]); } const uint8_t* get_user_private() const { return &(user_private[0]); } diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 33f243824..cd383eefa 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -117,7 +117,7 @@ PhysicalDev::PhysicalDev(const dev_info& dinfo, int oflags, const pdev_info_head PhysicalDev::~PhysicalDev() { close_device(); } -void PhysicalDev::write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { +void PhysicalDev::write_super_block(uint8_t const* buf, uint32_t sb_size, uint64_t offset) { auto err_c = m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, offset); if (m_super_blk_in_footer) { @@ -223,7 +223,7 @@ void PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); } void PhysicalDev::format_chunks() { m_chunk_info_slots = std::make_unique< sisl::Bitset >(hs_super_blk::chunk_info_bitmap_size(m_dev_info)); auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); } std::vector< shared< Chunk > > PhysicalDev::create_chunks(const std::vector< 
uint32_t >& chunk_ids, uint32_t vdev_id, @@ -261,7 +261,7 @@ std::vector< shared< Chunk > > PhysicalDev::create_chunks(const std::vector< uin // Finally serialize the entire bitset and persist the chunk info bitmap itself auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); } catch (const std::out_of_range& e) { LOGERROR("Creation of chunks failed because of space, removing {} partially created chunks", ret_chunks.size()); for (auto& chunk : ret_chunks) { @@ -295,7 +295,7 @@ shared< Chunk > PhysicalDev::create_chunk(uint32_t chunk_id, uint32_t vdev_id, u get_stream(chunk).m_chunks_map.insert(std::pair{chunk_id, chunk}); auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); cinfo->~chunk_info(); hs_utils::iobuf_free(buf, sisl::buftag::superblk); @@ -330,7 +330,7 @@ void PhysicalDev::load_chunks(std::function< bool(cshared< Chunk >&) >&& chunk_f // Read the chunk info bitmap area from super block and load them into in-memory bitmap of chunk slots auto buf_arr = make_byte_array(hs_super_blk::chunk_info_bitmap_size(m_dev_info), m_pdev_info.dev_attr.align_size, sisl::buftag::superblk); - read_super_block(buf_arr->bytes, buf_arr->size, hs_super_blk::chunk_sb_offset()); + read_super_block(buf_arr->bytes(), buf_arr->size(), hs_super_blk::chunk_sb_offset()); m_chunk_info_slots = std::make_unique< sisl::Bitset >(buf_arr); // Walk through each of the chunk info and create corresponding chunks @@ -390,7 +390,7 @@ void PhysicalDev::do_remove_chunk(cshared< Chunk >& chunk) { // Reset the info slot and write it to super block 
m_chunk_info_slots->reset_bit(chunk->slot_number()); auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); get_stream(chunk).m_chunks_map.erase(chunk->chunk_id()); cinfo->~chunk_info(); diff --git a/src/lib/device/physical_dev.hpp b/src/lib/device/physical_dev.hpp index 951e61f34..d6023c878 100644 --- a/src/lib/device/physical_dev.hpp +++ b/src/lib/device/physical_dev.hpp @@ -96,10 +96,10 @@ struct chunk_info { void set_free() { chunk_allocated = 0x00; } void set_selector_private(const sisl::blob& data) { - std::memcpy(&chunk_selector_private, data.bytes, std::min(data.size, uint32_cast(selector_private_size))); + std::memcpy(&chunk_selector_private, data.cbytes(), std::min(data.size(), uint32_cast(selector_private_size))); } void set_user_private(const sisl::blob& data) { - std::memcpy(&user_private, data.bytes, std::min(data.size, uint32_cast(user_private_size))); + std::memcpy(&user_private, data.cbytes(), std::min(data.size(), uint32_cast(user_private_size))); } void compute_checksum() { @@ -148,7 +148,7 @@ class PhysicalDev { static uint64_t get_dev_size(const std::string& devname); std::error_code read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); - void write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); + void write_super_block(uint8_t const* buf, uint32_t sb_size, uint64_t offset); void close_device(); //////////////////////////// Chunk Creation/Load related methods ///////////////////////////////////////// diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 5c8444783..bf91f1a79 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -41,7 +41,7 @@ #include "common/resource_mgr.hpp" #include "meta/meta_sb.hpp" #include "logstore/log_store_family.hpp" -#include 
"replication/service/repl_service_impl.h" +#include "replication/service/generic_repl_svc.h" /* * IO errors handling by homestore. @@ -57,8 +57,8 @@ namespace homestore { HomeStoreSafePtr HomeStore::s_instance{nullptr}; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; -static repl_impl_type s_repl_impl_type{repl_impl_type::solo}; -shared< ChunkSelector > s_custom_chunk_selector{nullptr}; +static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; +static shared< ReplApplication > s_repl_app{nullptr}; HomeStore* HomeStore::instance() { if (s_instance == nullptr) { s_instance = std::make_shared< HomeStore >(); } @@ -83,11 +83,11 @@ HomeStore& HomeStore::with_log_service() { return *this; } -HomeStore& HomeStore::with_repl_data_service(repl_impl_type repl_type, +HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector) { m_services.svcs |= HS_SERVICE::REPLICATION | HS_SERVICE::LOG_REPLICATED | HS_SERVICE::LOG_LOCAL; m_services.svcs &= ~HS_SERVICE::DATA; // ReplicationDataSvc or DataSvc are mutually exclusive - s_repl_impl_type = repl_type; + s_repl_app = repl_app; s_custom_chunk_selector = std::move(custom_chunk_selector); return *this; } @@ -128,7 +128,7 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ if (has_data_service()) { m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); } if (has_index_service()) { m_index_service = std::make_unique< IndexService >(std::move(s_index_cbs)); } if (has_repl_data_service()) { - m_repl_service = std::make_unique< ReplicationServiceImpl >(s_repl_impl_type); + m_repl_service = GenericReplService::create(std::move(s_repl_app)); m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); } m_cp_mgr = std::make_unique< CPManager >(); @@ -213,7 +213,7 @@ void HomeStore::do_start() { m_data_service->start(); } else if (has_repl_data_service()) { 
m_data_service->start(); - s_cast< ReplicationServiceImpl* >(m_repl_service.get())->start(); + s_cast< GenericReplService* >(m_repl_service.get())->start(); } // In case of custom recovery, let consumer starts the recovery and it is consumer module's responsibilities @@ -249,7 +249,7 @@ void HomeStore::shutdown() { if (has_data_service()) { m_data_service.reset(); } if (has_repl_data_service()) { - s_cast< ReplicationServiceImpl* >(m_repl_service.get())->stop(); + s_cast< GenericReplService* >(m_repl_service.get())->stop(); m_repl_service.reset(); } m_dev_mgr->close_devices(); diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index b8ea1e2ef..39e8ff162 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -223,12 +223,12 @@ void LogDev::assert_next_pages(log_stream_reader& lstream) { int64_t LogDev::append_async(const logstore_id_t store_id, const logstore_seq_num_t seq_num, const sisl::io_blob& data, void* cb_context) { - auto prev_size = m_pending_flush_size.fetch_add(data.size, std::memory_order_relaxed); + auto prev_size = m_pending_flush_size.fetch_add(data.size(), std::memory_order_relaxed); const auto idx = m_log_idx.fetch_add(1, std::memory_order_acq_rel); auto threshold_size = LogDev::flush_data_threshold_size(); m_log_records->create(idx, store_id, seq_num, data, cb_context); - if (prev_size < threshold_size && ((prev_size + data.size) >= threshold_size) && + if (prev_size < threshold_size && ((prev_size + data.size()) >= threshold_size) && !m_is_flushing.load(std::memory_order_relaxed)) { flush_if_needed(); } @@ -265,15 +265,15 @@ log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_rec auto record_header = header->nth_record(key.idx - header->start_log_idx); uint32_t const data_offset = (record_header->offset + (record_header->get_inlined() ? 
0 : header->oob_data_offset)); - log_buffer const b = uint32_cast(record_header->size); - if ((data_offset + b.size()) < initial_read_size) { - std::memcpy(static_cast< void* >(b.bytes()), static_cast< const void* >(rbuf + data_offset), - b.size()); // Already read them enough, copy the data + sisl::byte_array b = sisl::make_byte_array(uint32_cast(record_header->size)); + if ((data_offset + b->size()) < initial_read_size) { + std::memcpy(static_cast< void* >(b->bytes()), static_cast< const void* >(rbuf + data_offset), + b->size()); // Already read them enough, copy the data } else { // Round them data offset to dma boundary in-order to make sure pread on direct io succeed. We need to skip // the rounded portion while copying to user buffer auto const rounded_data_offset = sisl::round_down(data_offset, m_vdev->align_size()); - auto const rounded_size = sisl::round_up(b.size() + data_offset - rounded_data_offset, m_vdev->align_size()); + auto const rounded_size = sisl::round_up(b->size() + data_offset - rounded_data_offset, m_vdev->align_size()); // Allocate a fresh aligned buffer, if size cannot fit standard size if (rounded_size > initial_read_size) { @@ -285,8 +285,8 @@ log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_rec key.group_dev_offset={} " "data_offset={} size={} rounded_data_offset={} rounded_size={}", initial_read_size, key.idx, key.dev_offset, data_offset, b.size(), rounded_data_offset, rounded_size); */ m_vdev->sync_pread(rbuf, rounded_size, key.dev_offset + rounded_data_offset); - std::memcpy(static_cast< void* >(b.bytes()), - static_cast< const void* >(rbuf + data_offset - rounded_data_offset), b.size()); + std::memcpy(static_cast< void* >(b->bytes()), + static_cast< const void* >(rbuf + data_offset - rounded_data_offset), b->size()); // Free the buffer in case we allocated above if (rounded_size > initial_read_size) { hs_utils::iobuf_free(rbuf, sisl::buftag::logread); } @@ -294,7 +294,7 @@ log_buffer LogDev::read(const 
logdev_key& key, serialized_log_record& return_rec return_record_header = serialized_log_record(record_header->size, record_header->offset, record_header->get_inlined(), record_header->store_seq_num, record_header->store_id); - return b; + return log_buffer{b}; } logstore_id_t LogDev::reserve_store_id() { @@ -774,8 +774,8 @@ bool LogDevMetadata::resize_logdev_sb_if_needed() { logstore_superblk* sb_area = m_sb->get_logstore_superblk(); std::fill_n(sb_area, store_capacity(), logstore_superblk::default_value()); - std::memcpy(voidptr_cast(m_sb.raw_buf()->bytes), static_cast< const void* >(old_buf->bytes), - std::min(old_buf->size, m_sb.size())); + std::memcpy(voidptr_cast(m_sb.raw_buf()->bytes()), static_cast< const void* >(old_buf->cbytes()), + std::min(old_buf->size(), m_sb.size())); return true; } else { return false; @@ -859,8 +859,8 @@ bool LogDevMetadata::resize_rollback_sb_if_needed() { const auto old_buf = m_rollback_sb.raw_buf(); m_rollback_sb.create(req_sz); - std::memcpy(voidptr_cast(m_rollback_sb.raw_buf()->bytes), static_cast< const void* >(old_buf->bytes), - std::min(old_buf->size, m_rollback_sb.size())); + std::memcpy(voidptr_cast(m_rollback_sb.raw_buf()->bytes()), static_cast< const void* >(old_buf->cbytes()), + std::min(old_buf->size(), m_rollback_sb.size())); return true; } else { return false; diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 01f36ecce..f6d4fb606 100644 --- a/src/lib/logstore/log_dev.hpp +++ b/src/lib/logstore/log_dev.hpp @@ -104,11 +104,11 @@ struct log_record { log_record& operator=(log_record&&) noexcept = delete; ~log_record() = default; - size_t serialized_size() const { return sizeof(serialized_log_record) + data.size; } + size_t serialized_size() const { return sizeof(serialized_log_record) + data.size(); } bool is_inlineable(const uint64_t flush_size_multiple) const { // Need inlining if size is smaller or size/buffer is not in dma'ble boundary. 
- return (is_size_inlineable(data.size, flush_size_multiple) || - ((reinterpret_cast< uintptr_t >(data.bytes) % flush_size_multiple) != 0) || !data.aligned); + return (is_size_inlineable(data.size(), flush_size_multiple) || + ((r_cast< const uintptr_t >(data.cbytes()) % flush_size_multiple) != 0) || !data.is_aligned()); } static bool is_size_inlineable(const size_t sz, const uint64_t flush_size_multiple) { @@ -159,11 +159,7 @@ struct log_group_header { assert(idx - start_log_idx < n_log_records); const serialized_log_record* const lr{nth_record(start_log_idx - idx)}; - - sisl::blob b{}; - b.bytes = const_cast< uint8_t* >(lr->get_inlined() ? inline_area() : oob_area()) + lr->offset; - b.size = lr->size; - return b; + return sisl::blob{(lr->get_inlined() ? inline_area() : oob_area() + lr->offset), lr->size}; } uint32_t magic_word() const { return magic; } @@ -252,7 +248,7 @@ class LogGroup { void stop(); void reset(const uint32_t max_records); void create_overflow_buf(const uint32_t min_needed); - bool add_record(const log_record& record, const int64_t log_idx); + bool add_record(log_record& record, const int64_t log_idx); bool can_accomodate(const log_record& record) const { return (m_nrecords <= m_max_records); } const iovec_array& finish(const crc32_t prev_crc); @@ -809,7 +805,7 @@ class LogDev { bool m_stopped{false}; // Is Logdev stopped. 
We don't need lock here, because it is updated under flush lock logstore_family_id_t m_family_id; // The family id this logdev is part of JournalVirtualDev* m_vdev{nullptr}; - HomeStoreSafePtr m_hs; // Back pointer to homestore + HomeStoreSafePtr m_hs; // Back pointer to homestore std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; diff --git a/src/lib/logstore/log_group.cpp b/src/lib/logstore/log_group.cpp index 7c68e581d..a7d31881f 100644 --- a/src/lib/logstore/log_group.cpp +++ b/src/lib/logstore/log_group.cpp @@ -74,7 +74,7 @@ void LogGroup::create_overflow_buf(const uint32_t min_needed) { m_iovecs[0].iov_base = m_cur_log_buf; } -bool LogGroup::add_record(const log_record& record, const int64_t log_idx) { +bool LogGroup::add_record(log_record& record, const int64_t log_idx) { if (m_nrecords >= m_max_records) { LOGDEBUGMOD(logstore, "Will exceed estimated records={} if we add idx={} record. Hence stopping adding in this batch", @@ -82,9 +82,9 @@ bool LogGroup::add_record(const log_record& record, const int64_t log_idx) { return false; } - m_actual_data_size += record.data.size; - if ((m_inline_data_pos + record.data.size) >= m_cur_buf_len) { - create_overflow_buf(m_inline_data_pos + record.data.size); + m_actual_data_size += record.data.size(); + if ((m_inline_data_pos + record.data.size()) >= m_cur_buf_len) { + create_overflow_buf(m_inline_data_pos + record.data.size()); } // We use log_idx reference in the header as we expect each slot record is in order. 
@@ -93,22 +93,22 @@ bool LogGroup::add_record(const log_record& record, const int64_t log_idx) { // assert(header()->start_log_idx - log_idx); // Fill the slots - m_record_slots[m_nrecords].size = record.data.size; + m_record_slots[m_nrecords].size = record.data.size(); m_record_slots[m_nrecords].store_id = record.store_id; m_record_slots[m_nrecords].store_seq_num = record.seq_num; if (record.is_inlineable(m_flush_multiple_size)) { m_record_slots[m_nrecords].offset = m_inline_data_pos; m_record_slots[m_nrecords].set_inlined(true); - std::memcpy(s_cast< void* >(m_cur_log_buf + m_inline_data_pos), s_cast< const void* >(record.data.bytes), - record.data.size); - m_inline_data_pos += record.data.size; + std::memcpy(s_cast< void* >(m_cur_log_buf + m_inline_data_pos), s_cast< const void* >(record.data.cbytes()), + record.data.size()); + m_inline_data_pos += record.data.size(); m_iovecs[0].iov_len = m_inline_data_pos; } else { // We do not round it now, it will be rounded during finish m_record_slots[m_nrecords].offset = m_oob_data_pos; m_record_slots[m_nrecords].set_inlined(false); - m_iovecs.emplace_back(s_cast< void* >(record.data.bytes), record.data.size); - m_oob_data_pos += record.data.size; + m_iovecs.emplace_back(s_cast< void* >(record.data.bytes()), record.data.size()); + m_oob_data_pos += record.data.size(); } ++m_nrecords; diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index 637374420..f51e29944 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -95,7 +95,7 @@ void HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) { m_records.create(req->seq_num); COUNTER_INCREMENT(m_metrics, logstore_append_count, 1); - HISTOGRAM_OBSERVE(m_metrics, logstore_record_size, req->data.size); + HISTOGRAM_OBSERVE(m_metrics, logstore_record_size, req->data.size()); m_logdev.append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req)); } diff --git a/src/lib/logstore/log_stream.cpp 
b/src/lib/logstore/log_stream.cpp index d7c0ce8e2..0be849d8f 100644 --- a/src/lib/logstore/log_stream.cpp +++ b/src/lib/logstore/log_stream.cpp @@ -22,7 +22,6 @@ #include "log_dev.hpp" #include "device/journal_vdev.hpp" - namespace homestore { SISL_LOGGING_DECL(logstore) @@ -46,7 +45,7 @@ sisl::byte_view log_stream_reader::next_group(off_t* out_dev_offset) { } HS_REL_ASSERT_GE(m_cur_log_buf.size(), m_read_size_multiple); - const auto* header = r_cast< log_group_header* >(m_cur_log_buf.bytes()); + const auto* header = r_cast< log_group_header const* >(m_cur_log_buf.bytes()); if (header->magic_word() != LOG_GROUP_HDR_MAGIC) { LOGINFOMOD(logstore, "Logdev data not seeing magic at pos {}, must have come to end of logdev", m_vdev->dev_offset(m_cur_read_bytes)); @@ -135,18 +134,13 @@ sisl::byte_view log_stream_reader::group_in_next_page() { sisl::byte_view log_stream_reader::read_next_bytes(uint64_t nbytes) { // TO DO: Might need to address alignment based on data or fast type auto out_buf = - hs_utils::create_byte_view(nbytes + m_cur_log_buf.size(), true, sisl::buftag::logread, m_vdev->align_size()); - auto ret_buf = out_buf; - if (m_cur_log_buf.size()) { - memcpy(out_buf.bytes(), m_cur_log_buf.bytes(), m_cur_log_buf.size()); - out_buf.move_forward(m_cur_log_buf.size()); - } + hs_utils::make_byte_array(nbytes + m_cur_log_buf.size(), true, sisl::buftag::logread, m_vdev->align_size()); + if (m_cur_log_buf.size()) { memcpy(out_buf->bytes(), m_cur_log_buf.bytes(), m_cur_log_buf.size()); } const auto prev_pos = m_vdev->seeked_pos(); - m_vdev->sync_next_read(out_buf.bytes(), nbytes); + m_vdev->sync_next_read(out_buf->bytes() + m_cur_log_buf.size(), nbytes); LOGINFOMOD(logstore, "LogStream read {} bytes from vdev offset {} and vdev cur offset {}", nbytes, prev_pos, m_vdev->seeked_pos()); - ret_buf.set_size(nbytes + m_cur_log_buf.size()); - return ret_buf; + return sisl::byte_view{out_buf}; } } // namespace homestore diff --git a/src/lib/meta/meta_blk_service.cpp 
b/src/lib/meta/meta_blk_service.cpp index 03cfd1ae7..5c8e1df53 100644 --- a/src/lib/meta/meta_blk_service.cpp +++ b/src/lib/meta/meta_blk_service.cpp @@ -637,16 +637,16 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont // TO DO: Might need to differentiate based on data or fast type const uint64_t max_dst_size = sisl::round_up(sisl::Compress::max_compress_len(sz), align_size()); if (max_dst_size <= max_compress_memory_size()) { - if (max_dst_size > m_compress_info.size) { + if (max_dst_size > m_compress_info.size()) { free_compress_buf(); alloc_compress_buf(max_dst_size); } - std::memset(voidptr_cast(m_compress_info.bytes), 0, max_dst_size); + std::memset(voidptr_cast(m_compress_info.bytes()), 0, max_dst_size); size_t compressed_size = max_dst_size; const auto ret = sisl::Compress::compress(r_cast< const char* >(context_data), - r_cast< char* >(m_compress_info.bytes), sz, &compressed_size); + r_cast< char* >(m_compress_info.bytes()), sz, &compressed_size); if (ret != 0) { LOGERROR("hs_compress_default indicates a failure trying to compress the data, ret: {}", ret); HS_REL_ASSERT(false, "failed to compress"); @@ -670,7 +670,7 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont HS_REL_ASSERT_GE(max_dst_size, uint64_cast(mblk->hdr.h.context_sz)); // point context_data to compressed data; - context_data = m_compress_info.bytes; + context_data = m_compress_info.cbytes(); data_sz = mblk->hdr.h.context_sz; } else { // back off compression if compress ratio doesn't meet criteria. 
@@ -1019,7 +1019,7 @@ sisl::byte_array MetaBlkService::read_sub_sb_internal(const meta_blk* mblk) cons hs_utils::make_byte_array(mblk->hdr.h.context_sz, false /* aligned */, sisl::buftag::metablk, align_size()); HS_DBG_ASSERT_EQ(mblk->hdr.h.ovf_bid.is_valid(), false, "[type={}], unexpected ovf_bid: {}", mblk->hdr.h.type, mblk->hdr.h.ovf_bid.to_string()); - std::memcpy(buf->bytes, mblk->get_context_data(), mblk->hdr.h.context_sz); + std::memcpy(buf->bytes(), mblk->get_context_data(), mblk->hdr.h.context_sz); } else { // // read through the ovf blk chain to get the buffer; @@ -1053,7 +1053,7 @@ sisl::byte_array MetaBlkService::read_sub_sb_internal(const meta_blk* mblk) cons } // TO DO: Might need to differentiate based on data or fast type - read(data_bid[i], buf->bytes + read_offset, sisl::round_up(read_sz_per_db, align_size())); + read(data_bid[i], buf->bytes() + read_offset, sisl::round_up(read_sz_per_db, align_size())); read_offset_in_this_ovf += read_sz_per_db; read_offset += read_sz_per_db; @@ -1120,7 +1120,7 @@ void MetaBlkService::recover_meta_block(meta_blk* mblk) { if (itr != std::end(m_sub_info)) { // if subsystem registered crc protection, verify crc before sending to subsystem; if (itr->second.do_crc) { - const auto crc = crc32_ieee(init_crc32, s_cast< const uint8_t* >(buf->bytes), mblk->hdr.h.context_sz); + const auto crc = crc32_ieee(init_crc32, buf->cbytes(), mblk->hdr.h.context_sz); HS_REL_ASSERT_EQ(crc, uint32_cast(mblk->hdr.h.crc), "[type={}], CRC mismatch: {}/{}, on mblk bid: {}, context_sz: {}", mblk->hdr.h.type, crc, @@ -1140,8 +1140,8 @@ void MetaBlkService::recover_meta_block(meta_blk* mblk) { auto decompressed_buf{hs_utils::make_byte_array(mblk->hdr.h.src_context_sz, true /* aligned */, sisl::buftag::compression, align_size())}; size_t decompressed_size = mblk->hdr.h.src_context_sz; - const auto ret{sisl::Compress::decompress(r_cast< const char* >(buf->bytes), - r_cast< char* >(decompressed_buf->bytes), + const auto 
ret{sisl::Compress::decompress(r_cast< const char* >(buf->cbytes()), + r_cast< char* >(decompressed_buf->bytes()), mblk->hdr.h.compressed_sz, &decompressed_size)}; if (ret != 0) { LOGERROR("[type={}], negative result: {} from decompress trying to decompress the " @@ -1256,13 +1256,12 @@ bool MetaBlkService::is_aligned_buf_needed(size_t size) const { return (size <= bool MetaBlkService::s_self_recover{false}; -void MetaBlkService::free_compress_buf() { hs_utils::iobuf_free(m_compress_info.bytes, sisl::buftag::compression); } +void MetaBlkService::free_compress_buf() { hs_utils::iobuf_free(m_compress_info.bytes(), sisl::buftag::compression); } void MetaBlkService::alloc_compress_buf(size_t size) { - m_compress_info.size = size; - m_compress_info.bytes = hs_utils::iobuf_alloc(size, sisl::buftag::compression, align_size()); - - HS_REL_ASSERT_NOTNULL(m_compress_info.bytes, "fail to allocate iobuf for compression of size: {}", size); + m_compress_info = + sisl::blob{hs_utils::iobuf_alloc(size, sisl::buftag::compression, align_size()), uint32_cast(size)}; + HS_REL_ASSERT_NOTNULL(m_compress_info.cbytes(), "fail to allocate iobuf for compression of size: {}", size); } uint64_t MetaBlkService::meta_blk_context_sz() const { return block_size() - META_BLK_HDR_MAX_SZ; } @@ -1507,24 +1506,24 @@ nlohmann::json MetaBlkService::populate_json(int log_level, meta_blk_map_t& meta } sisl::byte_array buf = read_sub_sb_internal(it->second); - if (free_space < buf->size) { + if (free_space < buf->size()) { j[x.first]["meta_bids"][std::to_string(bid_cnt)] = "Not_able_to_dump_to_file_exceeding_allowed_space"; HS_LOG_EVERY_N(WARN, metablk, 100, "[type={}] Skip dumping to file, exceeding allowed space: {}, " "requested_size: {}, " "total_free: {}, free_fs_percent: {}", - x.first, free_space, buf->size, total_free, + x.first, free_space, buf->size(), total_free, HS_DYNAMIC_CONFIG(metablk.percent_of_free_space)); continue; } const std::string file_path = fmt::format("{}/{}_{}", dump_dir, 
x.first, bid_cnt); std::ofstream f{file_path}; - f.write(r_cast< const char* >(buf->bytes), buf->size); + f.write(r_cast< const char* >(buf->bytes()), buf->size()); j[x.first]["meta_bids"][std::to_string(bid_cnt)] = file_path; - free_space -= buf->size; + free_space -= buf->size(); } ++bid_cnt; diff --git a/src/lib/replication/CMakeLists.txt b/src/lib/replication/CMakeLists.txt index c71bb4516..7d3587e5e 100644 --- a/src/lib/replication/CMakeLists.txt +++ b/src/lib/replication/CMakeLists.txt @@ -3,9 +3,30 @@ include (${CMAKE_SOURCE_DIR}/cmake/test_mode.cmake) include_directories (BEFORE ..) include_directories (BEFORE .) +flatbuffers_generate_headers( + TARGET hs_replication_fb + SCHEMAS rpc/push_data_rpc.fbs + FLAGS "--cpp" +) +#build_flatbuffers( +# rpc/push_data_rpc.fbs +# "" +# hs_replication_fb +# "" +# "${CMAKE_CURRENT_BINARY_DIR}/generated/" +# "" +# "" +# ) +#target_link_libraries(hs_replication_fb ${COMMON_DEPS}) + add_library(hs_replication OBJECT) target_sources(hs_replication PRIVATE - service/repl_service_impl.cpp + service/generic_repl_svc.cpp repl_dev/solo_repl_dev.cpp + repl_dev/common.cpp ) -target_link_libraries(hs_replication ${COMMON_DEPS}) +target_link_libraries(hs_replication PRIVATE ${COMMON_DEPS} hs_replication_fb) + +#set(FLATBUFFERS_FLATC_EXECUTABLE ${flatbuffers_LIB_DIRS}/../bin/flatc) +#flatbuffer_gen_cpp(${FLATBUFFERS_FLATC_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/generated/ hs_replication rpc/push_data_rpc.fbs rpc/fetch_data_rpc.fbs) + diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 7620c0b98..5cf2bd8eb 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -3,10 +3,11 @@ #include #include "common/homestore_assert.hpp" #include "replication/repl_dev/solo_repl_dev.h" +#include "replication/repl_dev/common.h" namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk > const& rd_sb, bool 
load_existing) : - m_rd_sb{rd_sb}, m_group_id{m_rd_sb->gid} { + m_rd_sb{rd_sb}, m_group_id{m_rd_sb->group_id} { if (load_existing) { logstore_service().open_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, m_rd_sb->data_journal_id, true, bind_this(SoloReplDev::on_data_journal_created, 1)); @@ -24,8 +25,8 @@ void SoloReplDev::on_data_journal_created(shared< HomeLogStore > log_store) { } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - intrusive< repl_req_ctx > rreq) { - if (!rreq) { auto rreq = intrusive< repl_req_ctx >(new repl_req_ctx{}); } + repl_req_ptr_t rreq) { + if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } rreq->header = header; rreq->key = key; rreq->value = std::move(value); @@ -49,32 +50,32 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } -void SoloReplDev::write_journal(intrusive< repl_req_ctx > rreq) { - uint32_t entry_size = sizeof(repl_journal_entry) + rreq->header.size + rreq->key.size + +void SoloReplDev::write_journal(repl_req_ptr_t rreq) { + uint32_t entry_size = sizeof(repl_journal_entry) + rreq->header.size() + rreq->key.size() + (rreq->value.size ? 
rreq->local_blkid.serialized_size() : 0); - rreq->alloc_journal_entry(entry_size); - rreq->journal_entry->code = journal_type_t::HS_DATA; - rreq->journal_entry->user_header_size = rreq->header.size; - rreq->journal_entry->key_size = rreq->key.size; + rreq->alloc_journal_entry(entry_size, false /* is_raft_buf */); + rreq->journal_entry->code = journal_type_t::HS_LARGE_DATA; + rreq->journal_entry->user_header_size = rreq->header.size(); + rreq->journal_entry->key_size = rreq->key.size(); uint8_t* raw_ptr = uintptr_cast(rreq->journal_entry) + sizeof(repl_journal_entry); - if (rreq->header.size) { - std::memcpy(raw_ptr, rreq->header.bytes, rreq->header.size); - raw_ptr += rreq->header.size; + if (rreq->header.size()) { + std::memcpy(raw_ptr, rreq->header.cbytes(), rreq->header.size()); + raw_ptr += rreq->header.size(); } - if (rreq->key.size) { - std::memcpy(raw_ptr, rreq->key.bytes, rreq->key.size); - raw_ptr += rreq->key.size; + if (rreq->key.size()) { + std::memcpy(raw_ptr, rreq->key.cbytes(), rreq->key.size()); + raw_ptr += rreq->key.size(); } if (rreq->value.size) { - auto b = rreq->local_blkid.serialize(); - std::memcpy(raw_ptr, b.bytes, b.size); - raw_ptr += b.size; + auto const b = rreq->local_blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); } - m_data_journal->append_async(sisl::io_blob{rreq->journal_buf.get(), entry_size, false /* is_aligned */}, + m_data_journal->append_async(sisl::io_blob{rreq->raw_journal_buf(), entry_size, false /* is_aligned */}, nullptr /* cookie */, [this, rreq](int64_t lsn, sisl::io_blob&, homestore::logdev_key, void*) mutable { rreq->lsn = lsn; @@ -89,13 +90,13 @@ void SoloReplDev::write_journal(intrusive< repl_req_ctx > rreq) { } void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { - repl_journal_entry* entry = r_cast< repl_journal_entry* >(buf.bytes()); + repl_journal_entry const* entry = r_cast< repl_journal_entry const* >(buf.bytes()); uint32_t remain_size = 
buf.size() - sizeof(repl_journal_entry); HS_REL_ASSERT_EQ(entry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry found"); - HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_DATA, "Found a journal entry which is not data"); + HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_LARGE_DATA, "Found a journal entry which is not data"); - uint8_t* raw_ptr = r_cast< uint8_t* >(entry) + sizeof(repl_journal_entry); + uint8_t const* raw_ptr = r_cast< uint8_t const* >(entry) + sizeof(repl_journal_entry); sisl::blob header{raw_ptr, entry->user_header_size}; HS_REL_ASSERT_GE(remain_size, entry->user_header_size, "Invalid journal entry, header_size mismatch"); raw_ptr += entry->user_header_size; @@ -135,13 +136,4 @@ void SoloReplDev::cp_flush(CP*) { } void SoloReplDev::cp_cleanup(CP*) { m_data_journal->truncate(m_rd_sb->checkpoint_lsn); } - -void repl_req_ctx::alloc_journal_entry(uint32_t size) { - journal_buf = std::unique_ptr< uint8_t[] >(new uint8_t[size]); - journal_entry = new (journal_buf.get()) repl_journal_entry(); -} - -repl_req_ctx::~repl_req_ctx() { - if (journal_entry) { journal_entry->~repl_journal_entry(); } -} } // namespace homestore \ No newline at end of file diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 684378a13..51204a07a 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -21,45 +21,9 @@ #include #include -namespace homestore { -#pragma pack(1) -struct repl_dev_superblk { - static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; - static constexpr uint32_t REPL_DEV_SB_VERSION = 1; - - uint64_t magic{REPL_DEV_SB_MAGIC}; - uint32_t version{REPL_DEV_SB_VERSION}; - uuid_t gid; // gid of this replica set - logstore_id_t data_journal_id; // Logstore id for the data journal - int64_t commit_lsn; // LSN upto which this replica has committed - int64_t checkpoint_lsn; // LSN upto which this replica 
have checkpointed the data - -#if 0 - logstore_id_t free_pba_store_id; // Logstore id for storing free pba records -#endif - - uint64_t get_magic() const { return magic; } - uint32_t get_version() const { return version; } -}; -#pragma pack() - -VENUM(journal_type_t, uint16_t, HS_DATA = 0) -struct repl_journal_entry { - static constexpr uint16_t JOURNAL_ENTRY_MAJOR = 1; - static constexpr uint16_t JOURNAL_ENTRY_MINOR = 1; - - // Major and minor version. For each major version underlying structures could change. Minor versions can only add - // fields, not change any existing fields. - uint16_t major_version{JOURNAL_ENTRY_MAJOR}; - uint16_t minor_version{JOURNAL_ENTRY_MINOR}; - - journal_type_t code; - uint32_t replica_id; - uint32_t user_header_size; - uint32_t key_size; - // Followed by user_header, then key, then MultiBlkId -}; +#include "replication/repl_dev/common.h" +namespace homestore { class CP; class SoloReplDev : public ReplDev { @@ -74,7 +38,7 @@ class SoloReplDev : public ReplDev { virtual ~SoloReplDev() = default; void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - intrusive< repl_req_ctx > ctx) override; + repl_req_ptr_t ctx) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false) override; @@ -92,7 +56,7 @@ class SoloReplDev : public ReplDev { private: void on_data_journal_created(shared< HomeLogStore > log_store); - void write_journal(intrusive< repl_req_ctx > rreq); + void write_journal(repl_req_ptr_t rreq); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); }; diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp new file mode 100644 index 000000000..0340a018d --- /dev/null +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -0,0 +1,174 @@ +/********************************************************************************* + * 
Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include +#include +#include +#include "common/homestore_assert.hpp" +#include "replication/service/generic_repl_svc.h" +#include "replication/repl_dev/solo_repl_dev.h" + +namespace homestore { +ReplicationService& repl_service() { return hs()->repl_service(); } + +std::unique_ptr< GenericReplService > GenericReplService::create(cshared< ReplApplication >& repl_app) { + auto impl_type = repl_app->get_impl_type(); + if (impl_type == repl_impl_type::solo) { + return std::make_unique< SoloReplService >(repl_app); + //} else if (impl_type == repl_impl_type::server_side) { + // return std::make_unique< RaftReplService >(repl_app); + } else { + return nullptr; + } +} + +GenericReplService::GenericReplService(cshared< ReplApplication >& repl_app) : m_repl_app{repl_app} { + meta_service().register_handler( + get_meta_blk_name(), + [this](meta_blk* mblk, sisl::byte_view buf, size_t) { rd_super_blk_found(std::move(buf), voidptr_cast(mblk)); }, + nullptr); +} + +void GenericReplService::stop() { + std::unique_lock lg{m_rd_map_mtx}; + m_rd_map.clear(); +} + +AsyncReplResult< shared< ReplDev > > GenericReplService::create_repl_dev(uuid_t group_id, + std::set< uuid_t, std::less<> >&& members) { + // Ensure idempotency of the repl_dev creation + auto it = m_rd_map.end(); + bool happened = false; + { 
+ std::unique_lock lg(m_rd_map_mtx); + std::tie(it, happened) = m_rd_map.emplace(std::make_pair(group_id, nullptr)); + + if (!happened) { + if (it == m_rd_map.end()) { + // We should never reach here, as we have failed to emplace in map, but couldn't find entry + DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); + return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); + } else if (it->second == nullptr) { + // There is a duplicate create_repl_dev request while one is being done. + return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_IS_JOINING); + } else { + return make_async_success(it->second); + } + } + } + + // Create whatever underlying implementation of repl_dev needs to be for fresh creation of repl_dev + auto const result = create_replica_set(group_id, std::move(members)).get(); + if (!bool(result)) { return make_async_error< shared< ReplDev > >(result.error()); } + + // Now we need to create local repl dev instance which is actually creates entry, state machine etc + // according the the underlying implementation + superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; + rd_sb.create(rd_super_blk_size()); + rd_sb->group_id = group_id; + + auto repl_dev = create_local_repl_dev_instance(rd_sb, false /* load_existing */); + auto listener = m_repl_app->create_repl_dev_listener(group_id); + listener->set_repl_dev(repl_dev.get()); + repl_dev->attach_listener(std::move(listener)); + rd_sb.write(); + + { + std::unique_lock lg(m_rd_map_mtx); + it->second = repl_dev; + } + return make_async_success(repl_dev); +} + +ReplResult< shared< ReplDev > > GenericReplService::get_repl_dev(uuid_t group_id) const { + std::shared_lock lg(m_rd_map_mtx); + if (auto it = m_rd_map.find(group_id); it != m_rd_map.end()) { return it->second; } + return folly::makeUnexpected(ReplServiceError::SERVER_NOT_FOUND); +} + +void GenericReplService::iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) { + 
std::shared_lock lg(m_rd_map_mtx); + for (const auto& [uuid, rd] : m_rd_map) { + cb(rd); + } +} + +void GenericReplService::rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie) { + superblk< repl_dev_superblk > rd_sb; + rd_sb.load(buf, meta_cookie); + HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); + HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk"); + + auto repl_dev = create_local_repl_dev_instance(rd_sb, true /* load_existing */); + auto listener = m_repl_app->create_repl_dev_listener(repl_dev->group_id()); + listener->set_repl_dev(repl_dev.get()); + repl_dev->attach_listener(std::move(listener)); + + join_replica_set(rd_sb->group_id, repl_dev); + { + std::unique_lock lg(m_rd_map_mtx); + m_rd_map.emplace(std::make_pair(rd_sb->group_id, repl_dev)); + } +} + +hs_stats GenericReplService::get_cap_stats() const { + hs_stats stats; + stats.total_capacity = data_service().get_total_capacity(); + stats.used_capacity = data_service().get_used_capacity(); + return stats; +} + +///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// +SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} + +void SoloReplService::start() { + // Register to CP to flush the super blk and truncate the logstore + hs()->cp_mgr().register_consumer(cp_consumer_t::REPLICATION_SVC, std::make_unique< SoloReplServiceCPHandler >()); +} + +AsyncReplResult<> SoloReplService::replace_member(uuid_t group_id, uuid_t member_out, uuid_t member_in) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} + +AsyncReplResult<> SoloReplService::create_replica_set(uuid_t group_id, std::set< uuid_t, std::less<> >&& members) { + return make_async_success<>(); +} + +AsyncReplResult<> SoloReplService::join_replica_set(uuid_t group_id, cshared< ReplDev >& repl_dev) 
{ + return make_async_success<>(); +} + +shared< ReplDev > SoloReplService::create_local_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, + bool load_existing) { + return std::make_shared< SoloReplDev >(rd_sb, load_existing); +} + +uint32_t SoloReplService::rd_super_blk_size() const { return sizeof(repl_dev_superblk); } + +std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } + +folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { + repl_service().iterate_repl_devs( + [cp](cshared< ReplDev >& repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); }); + return folly::makeFuture< bool >(true); +} + +void SoloReplServiceCPHandler::cp_cleanup(CP* cp) { + repl_service().iterate_repl_devs( + [cp](cshared< ReplDev >& repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); }); +} + +int SoloReplServiceCPHandler::cp_progress_percent() { return 100; } + +} // namespace homestore diff --git a/src/lib/replication/service/repl_service_impl.h b/src/lib/replication/service/generic_repl_svc.h similarity index 54% rename from src/lib/replication/service/repl_service_impl.h rename to src/lib/replication/service/generic_repl_svc.h index bcee4488f..95bc52bb4 100644 --- a/src/lib/replication/service/repl_service_impl.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -32,57 +32,57 @@ namespace homestore { struct repl_dev_superblk; -class ReplicationServiceImpl : public ReplicationService { - struct listener_info { - folly::Promise< folly::Expected< shared< ReplDev >, ReplServiceError > > dev_promise{}; - std::unique_ptr< ReplDevListener > listener; - }; - - template < class V > - auto make_async_error(ReplServiceError err) { - return folly::makeFuture< ReplResult< V > >(folly::makeUnexpected(err)); - } - - template < class V > - auto make_async_success(V&& v) { - return folly::makeFuture< ReplResult< V > >(std::move(v)); - } - +class 
GenericReplService : public ReplicationService { protected: - repl_impl_type m_repl_type; + shared< ReplApplication > m_repl_app; std::shared_mutex m_rd_map_mtx; std::map< uuid_t, shared< ReplDev > > m_rd_map; - std::map< uuid_t, listener_info > m_pending_open; - std::atomic< bool > m_rd_map_loaded{false}; public: - ReplicationServiceImpl(repl_impl_type impl_type); - void start(); - void stop(); + static std::unique_ptr< GenericReplService > create(cshared< ReplApplication >& repl_app); + + GenericReplService(cshared< ReplApplication >& repl_app); + virtual void start() = 0; + virtual void stop(); + meta_sub_type get_meta_blk_name() const override { return "repl_dev"; } + AsyncReplResult< shared< ReplDev > > create_repl_dev(uuid_t group_id, - std::set< std::string, std::less<> >&& members, - std::unique_ptr< ReplDevListener > listener) override; - AsyncReplResult< shared< ReplDev > > open_repl_dev(uuid_t group_id, - std::unique_ptr< ReplDevListener > listener) override; + std::set< uuid_t, std::less<> >&& members) override; ReplResult< shared< ReplDev > > get_repl_dev(uuid_t group_id) const override; void iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) override; - - folly::Future< ReplServiceError > replace_member(uuid_t group_id, std::string const& member_out, - std::string const& member_in) const override; hs_stats get_cap_stats() const override; +protected: + virtual AsyncReplResult<> create_replica_set(uuid_t group_id, std::set< uuid_t, std::less<> >&& members) = 0; + virtual AsyncReplResult<> join_replica_set(uuid_t group_id, cshared< ReplDev >& repl_dev) = 0; + virtual shared< ReplDev > create_local_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, + bool load_existing) = 0; + virtual uint32_t rd_super_blk_size() const = 0; private: - shared< ReplDev > create_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, bool load_existing); void rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie); }; -class 
ReplServiceCPHandler : public CPCallbacks { +class SoloReplService : public GenericReplService { public: - ReplServiceCPHandler(); - virtual ~ReplServiceCPHandler() = default; + SoloReplService(cshared< ReplApplication >& repl_app); + void start() override; + + AsyncReplResult<> replace_member(uuid_t group_id, uuid_t member_out, uuid_t member_in) const override; + +private: + AsyncReplResult<> create_replica_set(uuid_t group_id, std::set< uuid_t, std::less<> >&& members) override; + AsyncReplResult<> join_replica_set(uuid_t group_id, cshared< ReplDev >& repl_dev) override; + shared< ReplDev > create_local_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, + bool load_existing) override; + uint32_t rd_super_blk_size() const override; +}; +class SoloReplServiceCPHandler : public CPCallbacks { public: + SoloReplServiceCPHandler() = default; + virtual ~SoloReplServiceCPHandler() = default; + std::unique_ptr< CPContext > on_switchover_cp(CP* cur_cp, CP* new_cp) override; folly::Future< bool > cp_flush(CP* cp) override; void cp_cleanup(CP* cp) override; diff --git a/src/test_common/bits_generator.hpp b/src/test_common/bits_generator.hpp deleted file mode 100644 index 97fb035d5..000000000 --- a/src/test_common/bits_generator.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. 
- * - *********************************************************************************/ -#pragma once -#include -#include -#include -#include - -namespace homestore { - -class BitsGenerator { -public: - static void gen_random_bits(size_t size, uint8_t* buf) { - std::random_device rd; - std::default_random_engine g(rd()); - std::uniform_int_distribution< unsigned long long > dis(std::numeric_limits< std::uint8_t >::min(), - std::numeric_limits< std::uint8_t >::max()); - for (size_t i = 0; i < size; ++i) { - buf[i] = dis(g); - } - } - - static void gen_random_bits(sisl::blob& b) { gen_random_bits(b.size, b.bytes); } -}; - -}; // namespace homestore diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 42844d22a..f40a66311 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.13) include (${CMAKE_SOURCE_DIR}/cmake/test_mode.cmake) include_directories (BEFORE ../include/) include_directories (BEFORE ../lib/) +include_directories (BEFORE ../tests/) include_directories (BEFORE .) 
add_subdirectory(test_scripts) diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp index a5dada646..3177b2b27 100644 --- a/src/tests/btree_helpers/btree_test_kvs.hpp +++ b/src/tests/btree_helpers/btree_test_kvs.hpp @@ -67,7 +67,7 @@ class TestFixedKey : public BtreeKey { TestFixedKey(uint64_t k) : m_key{k} {} TestFixedKey(const TestFixedKey& other) : TestFixedKey(other.serialize(), true) {} TestFixedKey(const BtreeKey& other) : TestFixedKey(other.serialize(), true) {} - TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.bytes))} {} + TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.cbytes()))} {} TestFixedKey& operator=(const TestFixedKey& other) = default; TestFixedKey& operator=(BtreeKey const& other) { m_key = s_cast< TestFixedKey const& >(other).m_key; @@ -109,7 +109,7 @@ class TestFixedKey : public BtreeKey { static uint32_t get_fixed_size() { return (sizeof(uint64_t)); } std::string to_string() const { return fmt::format("{}", m_key); } - void deserialize(const sisl::blob& b, bool copy) override { m_key = *(r_cast< const uint64_t* >(b.bytes)); } + void deserialize(const sisl::blob& b, bool copy) override { m_key = *(r_cast< const uint64_t* >(b.cbytes())); } static uint32_t get_max_size() { return get_fixed_size(); } friend std::ostream& operator<<(std::ostream& os, const TestFixedKey& k) { @@ -177,7 +177,7 @@ class TestVarLenKey : public BtreeKey { } void deserialize(const sisl::blob& b, bool copy) { - std::string data{r_cast< const char* >(b.bytes), b.size}; + std::string data{r_cast< const char* >(b.cbytes()), b.size()}; std::stringstream ss; ss << std::hex << data.substr(0, 8); ss >> m_key; @@ -250,7 +250,7 @@ class TestIntervalKey : public BtreeIntervalKey { TestIntervalKey(const TestIntervalKey& other) = default; TestIntervalKey(const BtreeKey& other) : TestIntervalKey(other.serialize(), true) {} TestIntervalKey(const 
sisl::blob& b, bool copy) : BtreeIntervalKey() { - TestIntervalKey* other = r_cast< TestIntervalKey* >(b.bytes); + TestIntervalKey const* other = r_cast< TestIntervalKey const* >(b.cbytes()); m_base = other->m_base; m_offset = other->m_offset; } @@ -285,8 +285,8 @@ class TestIntervalKey : public BtreeIntervalKey { uint32_t serialized_size() const override { return sizeof(TestIntervalKey); } void deserialize(sisl::blob const& b, bool copy) override { - assert(b.size == sizeof(TestIntervalKey)); - TestIntervalKey* other = r_cast< TestIntervalKey* >(b.bytes); + assert(b.size() == sizeof(TestIntervalKey)); + TestIntervalKey const* other = r_cast< TestIntervalKey const* >(b.cbytes()); m_base = other->m_base; m_offset = other->m_offset; } @@ -324,12 +324,12 @@ class TestIntervalKey : public BtreeIntervalKey { uint32_t serialized_suffix_size() const override { return uint32_cast(sizeof(uint32_t)); }; void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) { - DEBUG_ASSERT_EQ(prefix.size, sizeof(uint32_t), "Invalid prefix size on deserialize"); - DEBUG_ASSERT_EQ(suffix.size, sizeof(uint32_t), "Invalid suffix size on deserialize"); - uint32_t* other_p = r_cast< uint32_t* >(prefix.bytes); + DEBUG_ASSERT_EQ(prefix.size(), sizeof(uint32_t), "Invalid prefix size on deserialize"); + DEBUG_ASSERT_EQ(suffix.size(), sizeof(uint32_t), "Invalid suffix size on deserialize"); + uint32_t const* other_p = r_cast< uint32_t const* >(prefix.cbytes()); m_base = *other_p; - uint32_t* other_s = r_cast< uint32_t* >(suffix.bytes); + uint32_t const* other_s = r_cast< uint32_t const* >(suffix.cbytes()); m_offset = *other_s; } @@ -359,7 +359,7 @@ class TestFixedValue : public BtreeValue { TestFixedValue(uint32_t val) : BtreeValue() { m_val = val; } TestFixedValue() : TestFixedValue((uint32_t)-1) {} TestFixedValue(const TestFixedValue& other) : BtreeValue() { m_val = other.m_val; }; - TestFixedValue(const sisl::blob& b, bool copy) : BtreeValue() { m_val = *(r_cast< uint32_t* 
>(b.bytes)); } + TestFixedValue(const sisl::blob& b, bool copy) : BtreeValue() { m_val = *(r_cast< uint32_t const* >(b.cbytes())); } virtual ~TestFixedValue() = default; static TestFixedValue generate_rand() { return TestFixedValue{g_randval_generator(g_re)}; } @@ -370,15 +370,13 @@ class TestFixedValue : public BtreeValue { } sisl::blob serialize() const override { - sisl::blob b; - b.bytes = uintptr_cast(const_cast< uint32_t* >(&m_val)); - b.size = sizeof(m_val); + sisl::blob b{r_cast< uint8_t const* >(&m_val), uint32_cast(sizeof(m_val))}; return b; } uint32_t serialized_size() const override { return sizeof(m_val); } static uint32_t get_fixed_size() { return sizeof(m_val); } - void deserialize(const sisl::blob& b, bool copy) { m_val = *(r_cast< uint32_t* >(b.bytes)); } + void deserialize(const sisl::blob& b, bool copy) { m_val = *(r_cast< uint32_t const* >(b.cbytes())); } std::string to_string() const override { return fmt::format("{}", m_val); } @@ -407,7 +405,8 @@ class TestVarLenValue : public BtreeValue { TestVarLenValue(const std::string& val) : BtreeValue(), m_val{val} {} TestVarLenValue() = default; TestVarLenValue(const TestVarLenValue& other) : BtreeValue() { m_val = other.m_val; }; - TestVarLenValue(const sisl::blob& b, bool copy) : BtreeValue(), m_val{std::string((const char*)b.bytes, b.size)} {} + TestVarLenValue(const sisl::blob& b, bool copy) : + BtreeValue(), m_val{std::string((const char*)b.cbytes(), b.size())} {} virtual ~TestVarLenValue() = default; TestVarLenValue& operator=(const TestVarLenValue& other) { @@ -418,16 +417,14 @@ class TestVarLenValue : public BtreeValue { static TestVarLenValue generate_rand() { return TestVarLenValue{gen_random_string(rand_val_size())}; } sisl::blob serialize() const override { - sisl::blob b; - b.bytes = uintptr_cast(const_cast< char* >(m_val.c_str())); - b.size = m_val.size(); + sisl::blob b{r_cast< const uint8_t* >(m_val.c_str()), uint32_cast(m_val.size())}; return b; } uint32_t serialized_size() const 
override { return (uint32_t)m_val.size(); } static uint32_t get_fixed_size() { return 0; } - void deserialize(const sisl::blob& b, bool copy) { m_val = std::string((const char*)b.bytes, b.size); } + void deserialize(const sisl::blob& b, bool copy) { m_val = std::string((const char*)b.cbytes(), b.size()); } std::string to_string() const override { return fmt::format("{}", m_val); } @@ -468,16 +465,14 @@ class TestIntervalValue : public BtreeIntervalValue { ///////////////////////////// Overriding methods of BtreeValue ////////////////////////// TestIntervalValue& operator=(const TestIntervalValue& other) = default; sisl::blob serialize() const override { - sisl::blob b; - b.bytes = uintptr_cast(const_cast< TestIntervalValue* >(this)); - b.size = sizeof(TestIntervalValue); + sisl::blob b{r_cast< uint8_t const* >(this), sizeof(TestIntervalValue)}; return b; } uint32_t serialized_size() const override { return sizeof(TestIntervalValue); } static uint32_t get_fixed_size() { return sizeof(TestIntervalValue); } void deserialize(const sisl::blob& b, bool) { - TestIntervalValue const* other = r_cast< TestIntervalValue const* >(b.bytes); + TestIntervalValue const* other = r_cast< TestIntervalValue const* >(b.cbytes()); m_base_val = other->m_base_val; m_offset = other->m_offset; } @@ -502,10 +497,10 @@ class TestIntervalValue : public BtreeIntervalValue { uint32_t serialized_suffix_size() const override { return uint32_cast(sizeof(uint16_t)); } void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) override { - DEBUG_ASSERT_EQ(prefix.size, sizeof(uint32_t), "Invalid prefix size on deserialize"); - DEBUG_ASSERT_EQ(suffix.size, sizeof(uint16_t), "Invalid suffix size on deserialize"); - m_base_val = *(r_cast< uint32_t* >(prefix.bytes)); - m_offset = *(r_cast< uint16_t* >(suffix.bytes)); + DEBUG_ASSERT_EQ(prefix.size(), sizeof(uint32_t), "Invalid prefix size on deserialize"); + DEBUG_ASSERT_EQ(suffix.size(), sizeof(uint16_t), "Invalid suffix size on 
deserialize"); + m_base_val = *(r_cast< uint32_t const* >(prefix.cbytes())); + m_offset = *(r_cast< uint16_t const* >(suffix.cbytes())); } bool operator==(TestIntervalValue const& other) const { diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp index 435e41784..c312f94ce 100644 --- a/src/tests/test_blkid.cpp +++ b/src/tests/test_blkid.cpp @@ -29,7 +29,7 @@ TEST(BlkIdTest, SingleBlkIdBasic) { ASSERT_EQ(b2.is_multi(), false); sisl::blob buf = b2.serialize(); - ASSERT_EQ(buf.size, sizeof(uint64_t)); + ASSERT_EQ(buf.size(), sizeof(uint64_t)); BlkId b3; b3.deserialize(buf, true); diff --git a/src/tests/test_common/bits_generator.hpp b/src/tests/test_common/bits_generator.hpp index 97fb035d5..e5d0dac7b 100644 --- a/src/tests/test_common/bits_generator.hpp +++ b/src/tests/test_common/bits_generator.hpp @@ -33,7 +33,7 @@ class BitsGenerator { } } - static void gen_random_bits(sisl::blob& b) { gen_random_bits(b.size, b.bytes); } + static void gen_random_bits(sisl::blob& b) { gen_random_bits(b.size(), b.bytes()); } }; }; // namespace homestore diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index bdac774eb..4729fc634 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -97,7 +97,7 @@ class HSTestHelper { uint32_t blk_size{0}; shared< ChunkSelector > custom_chunk_selector{nullptr}; IndexServiceCallbacks* index_svc_cbs{nullptr}; - repl_impl_type repl_impl{repl_impl_type::solo}; + shared< ReplApplication > repl_app{nullptr}; chunk_num_t num_chunks{1}; }; @@ -170,7 +170,7 @@ class HSTestHelper { } else if ((svc == HS_SERVICE::LOG_REPLICATED) || (svc == HS_SERVICE::LOG_LOCAL)) { hsi->with_log_service(); } else if (svc == HS_SERVICE::REPLICATION) { - hsi->with_repl_data_service(tp.repl_impl, tp.custom_chunk_selector); + hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector); } } bool need_format = @@ -214,8 +214,8 @@ class 
HSTestHelper { } } - static void validate_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { - uint64_t* ptr = r_cast< uint64_t* >(buf); + static void validate_data_buf(uint8_t const* buf, uint64_t size, uint64_t pattern = 0) { + uint64_t const* ptr = r_cast< uint64_t const* >(buf); for (uint64_t i = 0ul; i < size / sizeof(uint64_t); ++i) { HS_REL_ASSERT_EQ(ptr[i], ((pattern == 0) ? i : pattern), "data_buf mismatch at offset={}", i); } diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index d7ad85012..656092105 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -68,8 +68,12 @@ struct test_log_data { uint32_t size; uint8_t* get_data() { return uintptr_cast(this) + sizeof(test_log_data); }; + uint8_t const* get_data_const() const { return r_cast< uint8_t const* >(this) + sizeof(test_log_data); } const uint8_t* get_data() const { return r_cast< const uint8_t* >(this) + sizeof(test_log_data); } uint32_t total_size() const { return sizeof(test_log_data) + size; } + std::string get_data_str() const { + return std::string(r_cast< const char* >(get_data_const()), static_cast< size_t >(size)); + } }; typedef std::function< void(logstore_family_id_t, logstore_seq_num_t, logdev_key) > test_log_store_comp_cb_t; @@ -187,7 +191,7 @@ class SampleLogStoreClient { if ((hole_entry != hole_end) && hole_entry->second) { // Hole entry exists, but filled EXPECT_EQ(b.size(), 0ul); } else { - auto* tl = r_cast< test_log_data* >(b.bytes()); + auto const* tl = r_cast< test_log_data const* >(b.bytes()); EXPECT_EQ(tl->total_size(), b.size()); validate_data(tl, seq_num); } @@ -245,15 +249,15 @@ class SampleLogStoreClient { ASSERT_EQ(b.size(), 0ul) << "Expected null entry for lsn=" << m_log_store->get_store_id() << ":" << i; } else { - auto* tl = r_cast< test_log_data* >(b.bytes()); + auto* tl = r_cast< test_log_data const* >(b.bytes()); ASSERT_EQ(tl->total_size(), b.size()) << "Size Mismatch for lsn=" << m_log_store->get_store_id() 
<< ":" << i; validate_data(tl, i); } } catch (const std::exception& e) { if (!expect_all_completed) { - // In case we run truncation in parallel to read, it is possible truncate moved, so adjust the - // truncated_upto accordingly. + // In case we run truncation in parallel to read, it is possible truncate moved, so adjust + // the truncated_upto accordingly. const auto trunc_upto = m_log_store->truncated_upto(); if (i <= trunc_upto) { i = trunc_upto; @@ -328,7 +332,7 @@ class SampleLogStoreClient { LOGDEBUG("Recovered lsn {}:{} with log data of size {}", m_log_store->get_store_id(), lsn, buf.size()) EXPECT_LE(lsn, m_cur_lsn.load()) << "Recovered incorrect lsn " << m_log_store->get_store_id() << ":" << lsn << "Expected less than cur_lsn " << m_cur_lsn.load(); - auto* tl = r_cast< test_log_data* >(buf.bytes()); + auto* tl = r_cast< test_log_data const* >(buf.bytes()); validate_data(tl, lsn); // Count only the ones which are after truncated, because recovery could receive even truncated lsns @@ -378,7 +382,7 @@ class SampleLogStoreClient { private: void validate_data(const test_log_data* d, const logstore_seq_num_t lsn) { const char c = static_cast< char >((lsn % 94) + 33); - const std::string actual{r_cast< const char* >(d->get_data()), static_cast< size_t >(d->size)}; + const std::string actual = d->get_data_str(); const std::string expected(static_cast< size_t >(d->size), c); // needs to be () because of same reason as vector ASSERT_EQ(actual, expected) << "Data mismatch for LSN=" << m_log_store->get_store_id() << ":" << lsn @@ -688,10 +692,11 @@ class LogStoreTest : public ::testing::Test { if (lsc->has_all_lsns_truncated()) ++n_fully_truncated; } - // While inserts are going on, truncation can guaranteed to be forward progressed if none of the log - // stores are fully truncated. 
If all stores are fully truncated, its obvious no progress, but even - // if one of the store is fully truncated, then it might be possible that logstore is holding lowest - // logdev location and waiting for next flush to finish to move the safe logdev location. + // While inserts are going on, truncation can guaranteed to be forward progressed if none of the + // log stores are fully truncated. If all stores are fully truncated, its obvious no progress, + // but even if one of the store is fully truncated, then it might be possible that logstore is + // holding lowest logdev location and waiting for next flush to finish to move the safe logdev + // location. expect_forward_progress = (n_fully_truncated == 0); } @@ -954,10 +959,10 @@ TEST_F(LogStoreTest, VarRateInsertThenTruncate) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - LOGINFO( - "Step 1: Reinit the num records={} and insert them as batch of 10 with qdepth=500 and wait for all records " - "to be inserted and then validate them", - nrecords); + LOGINFO("Step 1: Reinit the num records={} and insert them as batch of 10 with qdepth=500 and wait for all " + "records " + "to be inserted and then validate them", + nrecords); this->init(nrecords); this->kickstart_inserts(10, 500); this->wait_for_inserts(); @@ -980,10 +985,10 @@ TEST_F(LogStoreTest, VarRateInsertThenTruncate) { this->truncate_validate(); } - LOGINFO( - "Step 3: Change data rate on stores 0,1 but still slower than other stores, write num_records={} wait for " - "their completion, validate it is readable, then truncate - all in a loop for 3 times", - nrecords); + LOGINFO("Step 3: Change data rate on stores 0,1 but still slower than other stores, write num_records={} " + "wait for " + "their completion, validate it is readable, then truncate - all in a loop for 3 times", + nrecords); for (auto i{0u}; i < 3u; ++i) { LOGINFO("Step 3.{}.1: Write and wait for {}", i + 1, nrecords); 
this->init(nrecords, {{0, 5}, {1, 20}}); @@ -1248,10 +1253,10 @@ TEST_F(LogStoreTest, WriteSyncThenRead) { } auto b = tmp_log_store->read_sync(i); - auto* tl = r_cast< test_log_data* >(b.bytes()); + auto* tl = r_cast< test_log_data const* >(b.bytes()); ASSERT_EQ(tl->total_size(), b.size()) << "Size Mismatch for lsn=" << store_id << ":" << i; const char c = static_cast< char >((i % 94) + 33); - const std::string actual{r_cast< const char* >(tl->get_data()), static_cast< size_t >(tl->size)}; + const std::string actual = tl->get_data_str(); const std::string expected(static_cast< size_t >(tl->size), c); // needs to be () because of same reason as vector ASSERT_EQ(actual, expected) << "Data mismatch for LSN=" << store_id << ":" << i << " size=" << tl->size; diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index 61f782d4c..f0875562e 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -367,7 +367,7 @@ class VMetaBlkMgrTest : public ::testing::Test { iomanager.iobuf_free(buf); } else { if (unaligned_addr) { - delete[](buf - unaligned_shift); + delete[] (buf - unaligned_shift); } else { delete[] buf; } diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index b0527648a..86b1dca7d 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -35,7 +35,7 @@ #include "common/homestore_assert.hpp" #include "common/homestore_utils.hpp" #include "test_common/homestore_test_common.hpp" -#include "replication/service/repl_service_impl.h" +#include "replication/service/generic_repl_svc.h" #include "replication/repl_dev/solo_repl_dev.h" //////////////////////////////////////////////////////////////////////////// @@ -181,6 +181,23 @@ class SoloReplDevTest : public testing::Test { void on_replica_stop() override {} }; + class Application : public ReplApplication { + private: + SoloReplDevTest& m_test; + + public: + Application(SoloReplDevTest& test) : m_test{test} {} 
+ virtual ~Application() = default; + + repl_impl_type get_impl_type() const override { return repl_impl_type::solo; } + bool need_timeline_consistency() const { return true; } + std::unique_ptr< ReplDevListener > create_repl_dev_listener(uuid_t) override { + return std::make_unique< Listener >(m_test); + } + std::string lookup_peer(uuid_t uuid) const override { return std::string(""); } + uint16_t lookup_port() const override { return 0; } + }; + protected: Runner m_io_runner; Waiter m_task_waiter; @@ -194,15 +211,13 @@ class SoloReplDevTest : public testing::Test { test_common::HSTestHelper::start_homestore( "test_solo_repl_dev", {{HS_SERVICE::META, {.size_pct = 5.0}}, - {HS_SERVICE::REPLICATION, {.size_pct = 60.0, .repl_impl = repl_impl_type::solo}}, + {HS_SERVICE::REPLICATION, {.size_pct = 60.0, .repl_app = std::make_unique< Application >(*this)}}, {HS_SERVICE::LOG_REPLICATED, {.size_pct = 20.0}}, {HS_SERVICE::LOG_LOCAL, {.size_pct = 2.0}}}); m_uuid1 = hs_utils::gen_random_uuid(); m_uuid2 = hs_utils::gen_random_uuid(); - m_repl_dev1 = - hs()->repl_service().create_repl_dev(m_uuid1, {}, std::make_unique< Listener >(*this)).get().value(); - m_repl_dev2 = - hs()->repl_service().create_repl_dev(m_uuid2, {}, std::make_unique< Listener >(*this)).get().value(); + m_repl_dev1 = hs()->repl_service().create_repl_dev(m_uuid1, {}).get().value(); + m_repl_dev2 = hs()->repl_service().create_repl_dev(m_uuid2, {}).get().value(); } virtual void TearDown() override { @@ -217,14 +232,10 @@ class SoloReplDevTest : public testing::Test { test_common::HSTestHelper::start_homestore( "test_solo_repl_dev", - {{HS_SERVICE::REPLICATION, {.repl_impl = repl_impl_type::solo}}, + {{HS_SERVICE::REPLICATION, {.repl_app = std::make_unique< Application >(*this)}}, {HS_SERVICE::LOG_REPLICATED, {}}, {HS_SERVICE::LOG_LOCAL, {}}}, - [this]() { - hs()->repl_service().open_repl_dev(m_uuid1, std::make_unique< Listener >(*this)); - hs()->repl_service().open_repl_dev(m_uuid2, std::make_unique< Listener 
>(*this)); - }, - true /* restart */); + nullptr, true /* restart */); m_repl_dev1 = hs()->repl_service().get_repl_dev(m_uuid1).value(); m_repl_dev2 = hs()->repl_service().get_repl_dev(m_uuid2).value(); @@ -233,7 +244,7 @@ class SoloReplDevTest : public testing::Test { void write_io(uint32_t key_size, uint64_t data_size, uint32_t max_size_per_iov) { auto req = intrusive< test_repl_req >(new test_repl_req()); req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header)); - auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes); + auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); hdr->key_size = key_size; hdr->key_pattern = ((long long)rand() << 32) | rand(); hdr->data_size = data_size; @@ -241,7 +252,7 @@ class SoloReplDevTest : public testing::Test { if (key_size != 0) { req->key = sisl::make_byte_array(key_size); - HSTestHelper::fill_data_buf(req->key->bytes, key_size, hdr->key_pattern); + HSTestHelper::fill_data_buf(req->key->bytes(), key_size, hdr->key_pattern); } if (data_size != 0) { @@ -258,8 +269,8 @@ class SoloReplDevTest : public testing::Test { void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids) { - auto jhdr = r_cast< test_repl_req::journal_header* >(header.bytes); - HSTestHelper::validate_data_buf(key.bytes, key.size, jhdr->key_pattern); + auto const jhdr = r_cast< test_repl_req::journal_header const* >(header.cbytes()); + HSTestHelper::validate_data_buf(key.cbytes(), key.size(), jhdr->key_pattern); uint32_t size = blkids.blk_count() * g_block_size; if (size) { @@ -299,7 +310,7 @@ class SoloReplDevTest : public testing::Test { LOGDEBUG("[{}] Write complete with lsn={} for size={} blkids={}", boost::uuids::to_string(rdev.group_id()), req->get_lsn(), req->write_sgs.size, req->written_blkids.to_string()); - auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes); + auto hdr = r_cast< test_repl_req::journal_header* 
>(req->header->bytes()); HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size, "journal hdr data size mismatch with actual size"); From a3e421214cb095517e78dd64ada6952e023b5969 Mon Sep 17 00:00:00 2001 From: Sanal Date: Fri, 10 Nov 2023 16:27:59 -0800 Subject: [PATCH 2/9] Index write back cache fixes. (#207) Add atomic for state for indexbuffer to avoid concurrency issues with cp flush and insert threads. Create per CP indexbuffer. Create NodeBuffer which points to actual data buffer. Several indexbuffer an can point to same node buffer. Add locks in test for index btree shadow map as there are concurrent requests. Use shared ptr's in wb cache. --- conanfile.py | 2 +- .../homestore/btree/detail/btree_node_mgr.ipp | 2 +- .../homestore/index/index_internal.hpp | 61 +++++++++--- src/include/homestore/index/index_table.hpp | 33 ++++--- src/include/homestore/index/wb_cache_base.hpp | 6 +- src/include/homestore/index_service.hpp | 2 +- src/lib/checkpoint/cp.hpp | 4 +- src/lib/checkpoint/cp_mgr.cpp | 9 +- src/lib/index/index_cp.hpp | 98 +++++++++++++++++-- src/lib/index/index_service.cpp | 15 ++- src/lib/index/wb_cache.cpp | 91 ++++++++++------- src/lib/index/wb_cache.hpp | 14 +-- src/tests/btree_helpers/btree_test_helper.hpp | 70 ++++++++----- src/tests/btree_helpers/shadow_map.hpp | 67 ++++++++++++- src/tests/test_index_btree.cpp | 66 ++++++++----- src/tests/test_mem_btree.cpp | 15 +-- 16 files changed, 412 insertions(+), 143 deletions(-) diff --git a/conanfile.py b/conanfile.py index 19e28962b..783fbbd23 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.8.1" + version = "4.8.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/include/homestore/btree/detail/btree_node_mgr.ipp b/src/include/homestore/btree/detail/btree_node_mgr.ipp index e81da107f..826c29789 100644 --- a/src/include/homestore/btree/detail/btree_node_mgr.ipp +++ 
b/src/include/homestore/btree/detail/btree_node_mgr.ipp @@ -305,7 +305,7 @@ BtreeNode* Btree< K, V >::init_node(uint8_t* node_buf, uint32_t node_ctx_size, b /* Note:- This function assumes that access of this node is thread safe. */ template < typename K, typename V > void Btree< K, V >::free_node(const BtreeNodePtr& node, locktype_t cur_lock, void* context) { - BT_NODE_LOG(DEBUG, node, "Freeing node"); + BT_NODE_LOG(TRACE, node, "Freeing node"); COUNTER_DECREMENT_IF_ELSE(m_metrics, node->is_leaf(), btree_leaf_node_count, btree_int_node_count, 1); if (cur_lock != locktype_t::NONE) { diff --git a/src/include/homestore/index/index_internal.hpp b/src/include/homestore/index/index_internal.hpp index 2c8a09849..756f47bbd 100644 --- a/src/include/homestore/index/index_internal.hpp +++ b/src/include/homestore/index/index_internal.hpp @@ -64,29 +64,68 @@ enum class index_buf_state_t : uint8_t { }; ///////////////////////// Btree Node and Buffer Portion ////////////////////////// + + +// Multiple IndexBuffer could point to the same NodeBuffer if its clean. +struct NodeBuffer; +typedef std::shared_ptr< NodeBuffer > NodeBufferPtr; +struct NodeBuffer { + uint8_t* m_bytes{nullptr}; // Actual data buffer + std::atomic< index_buf_state_t > m_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist? + NodeBuffer(uint32_t buf_size, uint32_t align_size); + ~NodeBuffer(); +}; + +// IndexBuffer is for each CP. The dependent index buffers are chained using +// m_next_buffer and each buffer is flushed only its wait_for_leaders reaches 0 +// which means all its dependent buffers are flushed. struct IndexBuffer; typedef std::shared_ptr< IndexBuffer > IndexBufferPtr; - struct IndexBuffer { - uint8_t* m_node_buf{nullptr}; // Actual buffer - index_buf_state_t m_buf_state{index_buf_state_t::CLEAN}; // Is buffer yet to persist? 
- BlkId m_blkid; // BlkId where this needs to be persisted - std::weak_ptr< IndexBuffer > m_next_buffer; // Next buffer in the chain + NodeBufferPtr m_node_buf; + BlkId m_blkid; // BlkId where this needs to be persisted + std::weak_ptr< IndexBuffer > m_next_buffer; // Next buffer in the chain // Number of leader buffers we are waiting for before we write this buffer sisl::atomic_counter< int > m_wait_for_leaders{0}; IndexBuffer(BlkId blkid, uint32_t buf_size, uint32_t align_size); + IndexBuffer(NodeBufferPtr node_buf, BlkId blkid); ~IndexBuffer(); BlkId blkid() const { return m_blkid; } - uint8_t* raw_buffer() { return m_node_buf; } + uint8_t* raw_buffer() { + RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); + return m_node_buf->m_bytes; + } + + bool is_clean() const { + RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); + return (m_node_buf->m_state.load() == index_buf_state_t::CLEAN); + } + + index_buf_state_t state() const { + RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); + return m_node_buf->m_state; + } + + void set_state(index_buf_state_t state) { + RELEASE_ASSERT(m_node_buf, "Node buffer null blkid {}", m_blkid.to_integer()); + m_node_buf->m_state = state; + } - bool is_clean() const { return (m_buf_state == index_buf_state_t::CLEAN); } std::string to_string() const { - return fmt::format("IndexBuffer {} blkid={} state={} node_buf={} next_buffer={} wait_for={}", - reinterpret_cast< void* >(const_cast< IndexBuffer* >(this)), m_blkid.to_integer(), - static_cast< int >(m_buf_state), static_cast< void* >(m_node_buf), - voidptr_cast(m_next_buffer.lock().get()), m_wait_for_leaders.get()); + auto str = fmt::format("IndexBuffer {} blkid={}", reinterpret_cast< void* >(const_cast< IndexBuffer* >(this)), + m_blkid.to_integer()); + if (m_node_buf == nullptr) { + fmt::format_to(std::back_inserter(str), " node_buf=nullptr"); + } else { + fmt::format_to(std::back_inserter(str), " 
state={} node_buf={}", + static_cast< int >(m_node_buf->m_state.load()), static_cast< void* >(m_node_buf->m_bytes)); + } + fmt::format_to(std::back_inserter(str), " next_buffer={} wait_for={}", + m_next_buffer.lock() ? reinterpret_cast< void* >(m_next_buffer.lock().get()) : 0, + m_wait_for_leaders.get()); + return str; } }; diff --git a/src/include/homestore/index/index_table.hpp b/src/include/homestore/index/index_table.hpp index 246557a62..bce7e8f36 100644 --- a/src/include/homestore/index/index_table.hpp +++ b/src/include/homestore/index/index_table.hpp @@ -114,7 +114,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // Need to put it in wb cache wb_cache().write_buf(node, idx_node->m_idx_buf, cp_ctx); idx_node->m_last_mod_cp_id = cp_ctx->id(); - LOGTRACEMOD(wbcache, "{}", idx_node->m_idx_buf->to_string()); + LOGTRACEMOD(wbcache, "add to dirty list cp {} {}", cp_ctx->id(), idx_node->m_idx_buf->to_string()); } node->set_checksum(this->m_bt_cfg); return btree_status_t::success; @@ -129,17 +129,28 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto& left_child_buf = left_child_idx_node->m_idx_buf; auto& parent_buf = parent_idx_node->m_idx_buf; - LOGTRACEMOD(wbcache, "left {} parent {} ", left_child_buf->to_string(), parent_buf->to_string()); - // Write new nodes in the list as standalone outside transacted pairs. // Write the new right child nodes, left node and parent in order. + // Create the relationship of right child to the left node via prepend_to_chain below. 
+ // Parent and left node are linked in the prepare_node_txn for (const auto& right_child_node : new_nodes) { auto right_child = IndexBtreeNode::convert(right_child_node.get()); write_node_impl(right_child_node, context); wb_cache().prepend_to_chain(right_child->m_idx_buf, left_child_buf); - LOGTRACEMOD(wbcache, "right {} left {} ", right_child->m_idx_buf->to_string(), left_child_buf->to_string()); } + auto trace_index_bufs = [&]() { + std::string str; + str = fmt::format("cp {} left {} parent {}", cp_ctx->id(), left_child_buf->to_string(), + parent_buf->to_string()); + for (const auto& right_child_node : new_nodes) { + auto right_child = IndexBtreeNode::convert(right_child_node.get()); + fmt::format_to(std::back_inserter(str), " right {}", right_child->m_idx_buf->to_string()); + } + return str; + }; + + LOGTRACEMOD(wbcache, "{}", trace_index_bufs()); write_node_impl(left_child_node, context); write_node_impl(parent_node, context); @@ -181,18 +192,17 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { // realloc_node(node); // } - // If the backing buffer is already in a clean state, we don't need to make a copy of it - if (idx_node->m_idx_buf->is_clean()) { return btree_status_t::success; } - - // Make a new btree buffer and copy the contents and swap it to make it the current node's buffer. The + // We create IndexBuffer for each CP. But if the backing buffer is already in a clean state + // we dont copy the node buffer. Copy buffer will handle it. If the node buffer is dirty, + // make a new btree buffer and copy the contents and swap it to make it the current node's buffer. The // buffer prior to this copy, would have been written and already added into the dirty buffer list. 
- idx_node->m_idx_buf = wb_cache().copy_buffer(idx_node->m_idx_buf); + idx_node->m_idx_buf = wb_cache().copy_buffer(idx_node->m_idx_buf, cp_ctx); idx_node->m_last_mod_cp_id = -1; node->m_phys_node_buf = idx_node->m_idx_buf->raw_buffer(); node->set_checksum(this->m_bt_cfg); - LOGTRACEMOD(wbcache, "buf {} ", idx_node->m_idx_buf->to_string()); + LOGTRACEMOD(wbcache, "cp {} {} ", cp_ctx->id(), idx_node->m_idx_buf->to_string()); #ifndef NO_CHECKSUM if (!node->verify_node(this->m_bt_cfg)) { @@ -221,6 +231,8 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { auto& child_buf = child_idx_node->m_idx_buf; auto& parent_buf = parent_idx_node->m_idx_buf; + LOGTRACEMOD(wbcache, "cp {} left {} parent {} ", cp_ctx->id(), child_buf->to_string(), parent_buf->to_string()); + auto [child_copied, parent_copied] = wb_cache().create_chain(child_buf, parent_buf, cp_ctx); if (child_copied) { child_node->m_phys_node_buf = child_buf->raw_buffer(); @@ -231,7 +243,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > { parent_idx_node->m_last_mod_cp_id = -1; } - LOGTRACEMOD(wbcache, "child {} parent {} ", child_buf->to_string(), parent_buf->to_string()); return btree_status_t::success; } diff --git a/src/include/homestore/index/wb_cache_base.hpp b/src/include/homestore/index/wb_cache_base.hpp index f59d4b1ce..caee6c557 100644 --- a/src/include/homestore/index/wb_cache_base.hpp +++ b/src/include/homestore/index/wb_cache_base.hpp @@ -52,13 +52,13 @@ class IndexWBCacheBase { /// @brief Start a chain of related btree buffers. Typically a chain is creating from second and third pairs and /// then first is prepended to the chain. In case the second buffer is already with the WB cache, it will create a - /// new buffer for both second and third. + /// new buffer for both second and third. We append the buffers to a list in dependency chain. /// @param second Second btree buffer in the chain. 
It will be updated to copy of second buffer if buffer already /// has dependencies. /// @param third Thrid btree buffer in the chain. It will be updated to copy of third buffer if buffer already /// has dependencies. /// @return Returns if the buffer had to be copied - virtual std::tuple< bool, bool > create_chain(IndexBufferPtr& second, IndexBufferPtr& third, CPContext* cp_ctx) = 0; + virtual std::pair< bool, bool > create_chain(IndexBufferPtr& second, IndexBufferPtr& third, CPContext* cp_ctx) = 0; /// @brief Prepend to the chain that was already created with second /// @param first @@ -73,7 +73,7 @@ class IndexWBCacheBase { /// @brief Copy buffer /// @param cur_buf /// @return - virtual IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf) const = 0; + virtual IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* context) const = 0; }; } // namespace homestore diff --git a/src/include/homestore/index_service.hpp b/src/include/homestore/index_service.hpp index 9952e3852..09543a5fb 100644 --- a/src/include/homestore/index_service.hpp +++ b/src/include/homestore/index_service.hpp @@ -22,7 +22,7 @@ #include #include #include - +#include namespace homestore { class IndexWBCacheBase; diff --git a/src/lib/checkpoint/cp.hpp b/src/lib/checkpoint/cp.hpp index 5644adac7..05bda5be7 100644 --- a/src/lib/checkpoint/cp.hpp +++ b/src/lib/checkpoint/cp.hpp @@ -21,7 +21,7 @@ #include #include - +#include #include "common/homestore_assert.hpp" /* @@ -73,7 +73,7 @@ struct CP { bool m_cp_waiting_to_trigger{false}; // it is waiting for previous cp to complete cp_id_t m_cp_id; std::array< std::unique_ptr< CPContext >, (size_t)cp_consumer_t::SENTINEL > m_contexts; - folly::Promise< bool > m_comp_promise; + folly::SharedPromise< bool > m_comp_promise; public: CP(CPManager* mgr) : m_cp_mgr{mgr} {} diff --git a/src/lib/checkpoint/cp_mgr.cpp b/src/lib/checkpoint/cp_mgr.cpp index 89921620a..d26424248 100644 --- a/src/lib/checkpoint/cp_mgr.cpp +++ 
b/src/lib/checkpoint/cp_mgr.cpp @@ -142,8 +142,11 @@ folly::Future< bool > CPManager::trigger_cp_flush(bool force) { std::unique_lock< std::mutex > lk(trigger_cp_mtx); auto cur_cp = cp_guard(); HS_DBG_ASSERT_NE(cur_cp->m_cp_status, cp_status_t::cp_flush_prepare); - cur_cp->m_comp_promise = std::move(folly::Promise< bool >{}); - cur_cp->m_cp_waiting_to_trigger = true; + // If multiple threads call trigger, they all get the future from the same promise. + if (!cur_cp->m_cp_waiting_to_trigger) { + cur_cp->m_comp_promise = std::move(folly::SharedPromise< bool >{}); + cur_cp->m_cp_waiting_to_trigger = true; + } return cur_cp->m_comp_promise.getFuture(); } else { return folly::makeFuture< bool >(false); @@ -177,7 +180,7 @@ folly::Future< bool > CPManager::trigger_cp_flush(bool force) { // originally by the caller will be untouched and completed upto CP completion/ ret_fut = folly::makeFuture< bool >(true); } else { - cur_cp->m_comp_promise = std::move(folly::Promise< bool >{}); + cur_cp->m_comp_promise = std::move(folly::SharedPromise< bool >{}); ret_fut = cur_cp->m_comp_promise.getFuture(); } cur_cp->m_cp_status = cp_status_t::cp_flush_prepare; diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index f06f53091..d7aad88dc 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "checkpoint/cp.hpp" @@ -32,20 +33,18 @@ struct IndexCPContext : public VDevCPContext { std::atomic< uint64_t > m_num_nodes_removed{0}; sisl::ConcurrentInsertVector< IndexBufferPtr > m_dirty_buf_list; sisl::atomic_counter< int64_t > m_dirty_buf_count{0}; - IndexBufferPtr m_last_in_chain; std::mutex m_flush_buffer_mtx; sisl::ConcurrentInsertVector< IndexBufferPtr >::iterator m_dirty_buf_it; public: + IndexCPContext(CP* cp) : VDevCPContext(cp) {} virtual ~IndexCPContext() = default; void add_to_dirty_list(const IndexBufferPtr& buf) { - buf->m_buf_state = index_buf_state_t::DIRTY; 
m_dirty_buf_list.push_back(buf); + buf->set_state(index_buf_state_t::DIRTY); m_dirty_buf_count.increment(1); - m_last_in_chain = buf; - LOGTRACEMOD(wbcache, "{}", buf->to_string()); } bool any_dirty_buffers() const { return !m_dirty_buf_count.testz(); } @@ -59,15 +58,102 @@ struct IndexCPContext : public VDevCPContext { return ret; } - std::string to_string() const { + std::string to_string() { std::string str{fmt::format("IndexCPContext cpid={} dirty_buf_count={} dirty_buf_list_size={}", m_cp->id(), m_dirty_buf_count.get(), m_dirty_buf_list.size())}; - // TODO dump all index buffers. + // Mapping from a node to all its parents in the graph. + // Display all buffers and its dependencies and state. + std::unordered_map< IndexBuffer*, std::vector< IndexBuffer* > > parents; + + auto it = m_dirty_buf_list.begin(); + while (it != m_dirty_buf_list.end()) { + // Add this buf to his children. + IndexBufferPtr buf = *it; + parents[buf->m_next_buffer.lock().get()].emplace_back(buf.get()); + ++it; + } + + it = m_dirty_buf_list.begin(); + while (it != m_dirty_buf_list.end()) { + IndexBufferPtr buf = *it; + fmt::format_to(std::back_inserter(str), "{}", buf->to_string()); + auto first = true; + for (const auto& p : parents[buf.get()]) { + if (first) { + fmt::format_to(std::back_inserter(str), "\nDepends:"); + first = false; + } + fmt::format_to(std::back_inserter(str), " {}({})", r_cast< void* >(p), s_cast< int >(p->state())); + } + fmt::format_to(std::back_inserter(str), "\n"); + ++it; + } + return str; } + + void check_cycle() { + // Use dfs to find if the graph is cycle + auto it = m_dirty_buf_list.begin(); + while (it != m_dirty_buf_list.end()) { + IndexBufferPtr buf = *it;; + std::set< IndexBuffer* > visited; + check_cycle_recurse(buf, visited); + ++it; + } + } + + void check_cycle_recurse(IndexBufferPtr buf, std::set< IndexBuffer* >& visited) const { + if (visited.count(buf.get()) != 0) { + LOGERROR("Cycle found for {}", buf->to_string()); + for (auto& x : visited) { + 
LOGERROR("Path : {}", x->to_string()); + } + return; + } + + visited.insert(buf.get()); + if (buf->m_next_buffer.lock()) { check_cycle_recurse(buf->m_next_buffer.lock(), visited); } + } + + void check_wait_for_leaders() { + // Use the next buffer as indegree to find if wait_for_leaders is invalid. + std::unordered_map< IndexBuffer*, int > wait_for_leaders; + IndexBufferPtr buf; + + // Store the wait for leader count for each buffer. + auto it = m_dirty_buf_list.begin(); + while (it != m_dirty_buf_list.end()) { + buf = *it; + wait_for_leaders[buf.get()] = buf->m_wait_for_leaders.get(); + ++it; + } + + // Decrement the count using the next buffer. + it = m_dirty_buf_list.begin(); + while (it != m_dirty_buf_list.end()) { + buf = *it; + auto next_buf = buf->m_next_buffer.lock(); + if (next_buf.get() == nullptr) continue; + wait_for_leaders[next_buf.get()]--; + ++it; + } + + bool issue = false; + for (const auto& [buf, waits] : wait_for_leaders) { + // Any value other than zero means the dependency graph is invalid. 
+ if (waits != 0) { + issue = true; + LOGERROR("Leaders wait not zero cp {} buf {} waits {}", id(), buf->to_string(), waits); + } + } + + RELEASE_ASSERT_EQ(issue, false, "Found issue with wait_for_leaders"); + } }; + class IndexWBCache; class IndexCPCallbacks : public CPCallbacks { public: diff --git a/src/lib/index/index_service.cpp b/src/lib/index/index_service.cpp index d3d0984b5..bfa96f8bc 100644 --- a/src/lib/index/index_service.cpp +++ b/src/lib/index/index_service.cpp @@ -82,7 +82,9 @@ void IndexService::stop() { HS_REL_ASSERT_EQ(success, true, "CP Flush failed"); LOGINFO("CP Flush completed"); - for (auto [id, tbl] : m_index_map) { tbl->destroy(); } + for (auto [id, tbl] : m_index_map) { + tbl->destroy(); + } } void IndexService::add_index_table(const std::shared_ptr< IndexTableBase >& tbl) { std::unique_lock lg(m_index_map_mtx); @@ -107,9 +109,16 @@ uint64_t IndexService::used_size() const { return size; } +NodeBuffer::NodeBuffer(uint32_t buf_size, uint32_t align_size) : + m_bytes{hs_utils::iobuf_alloc(buf_size, sisl::buftag::btree_node, align_size)} {} + +NodeBuffer::~NodeBuffer() { hs_utils::iobuf_free(m_bytes, sisl::buftag::btree_node); } + IndexBuffer::IndexBuffer(BlkId blkid, uint32_t buf_size, uint32_t align_size) : - m_node_buf{hs_utils::iobuf_alloc(buf_size, sisl::buftag::btree_node, align_size)}, m_blkid{blkid} {} + m_node_buf{std::make_shared< NodeBuffer >(buf_size, align_size)}, m_blkid{blkid} {} + +IndexBuffer::IndexBuffer(NodeBufferPtr node_buf, BlkId blkid) : m_node_buf(node_buf), m_blkid(blkid) {} -IndexBuffer::~IndexBuffer() { hs_utils::iobuf_free(m_node_buf, sisl::buftag::btree_node); } +IndexBuffer::~IndexBuffer() { m_node_buf.reset(); } } // namespace homestore diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index a00a46dfa..12cbbfc31 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -84,7 +84,6 @@ BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { // Alloc buffer 
and initialize the node auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size()); auto node = node_initializer(idx_buf); - LOGTRACEMOD(wbcache, "idx_buf {} blkid {}", static_cast< void* >(idx_buf.get()), blkid.to_integer()); // Add the node to the cache bool done = m_cache.insert(node); @@ -101,16 +100,31 @@ void IndexWBCache::realloc_buf(const IndexBufferPtr& buf) { } void IndexWBCache::write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) { + // TODO upsert always returns false even if it succeeds. m_cache.upsert(node); r_cast< IndexCPContext* >(cp_ctx)->add_to_dirty_list(buf); resource_mgr().inc_dirty_buf_size(m_node_size); } -IndexBufferPtr IndexWBCache::copy_buffer(const IndexBufferPtr& cur_buf) const { - auto new_buf = std::make_shared< IndexBuffer >(cur_buf->m_blkid, m_node_size, m_vdev->align_size()); - std::memcpy(new_buf->raw_buffer(), cur_buf->raw_buffer(), m_node_size); - LOGTRACEMOD(wbcache, "new_buf {} cur_buf {} cur_buf_blkid {}", static_cast< void* >(new_buf.get()), - static_cast< void* >(cur_buf.get()), cur_buf->m_blkid.to_integer()); +IndexBufferPtr IndexWBCache::copy_buffer(const IndexBufferPtr& cur_buf, const CPContext* cp_ctx) const { + IndexBufferPtr new_buf = nullptr; + bool copied = false; + + // When we copy the buffer we check if the node buffer is clean or not. If its clean + // we could reuse it otherwise create a copy. + if (cur_buf->is_clean()) { + // Refer to the same node buffer. + new_buf = std::make_shared< IndexBuffer >(cur_buf->m_node_buf, cur_buf->m_blkid); + } else { + // If its not clean, we do deep copy. 
+ new_buf = std::make_shared< IndexBuffer >(cur_buf->m_blkid, m_node_size, m_vdev->align_size()); + std::memcpy(new_buf->raw_buffer(), cur_buf->raw_buffer(), m_node_size); + copied = true; + } + + LOGTRACEMOD(wbcache, "cp {} new_buf {} cur_buf {} cur_buf_blkid {} copied {}", cp_ctx->id(), + static_cast< void* >(new_buf.get()), static_cast< void* >(cur_buf.get()), cur_buf->m_blkid.to_integer(), + copied); return new_buf; } @@ -138,32 +152,34 @@ void IndexWBCache::read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t } } -std::tuple< bool, bool > IndexWBCache::create_chain(IndexBufferPtr& second, IndexBufferPtr& third, CPContext* cp_ctx) { +std::pair< bool, bool > IndexWBCache::create_chain(IndexBufferPtr& second, IndexBufferPtr& third, CPContext* cp_ctx) { bool second_copied{false}, third_copied{false}; - + auto chain = second; + auto old_third = third; if (!second->is_clean()) { - auto new_second = copy_buffer(second); - LOGTRACEMOD(wbcache, "second copied blkid {} {} new_second {}", second->m_blkid.to_integer(), - static_cast< void* >(second.get()), static_cast< void* >(new_second.get())); + auto new_second = copy_buffer(second, cp_ctx); second = new_second; second_copied = true; } + if (!third->is_clean()) { - auto new_third = copy_buffer(third); - LOGTRACEMOD(wbcache, "third copied blkid {} {} new_third {}", third->m_blkid.to_integer(), - static_cast< void* >(third.get()), static_cast< void* >(new_third.get())); + auto new_third = copy_buffer(third, cp_ctx); third = new_third; third_copied = true; } // Append parent(third) to the left child(second). - prepend_to_chain(second, third); + second->m_next_buffer = third; + third->m_wait_for_leaders.increment(1); + if (chain != second) { + // We want buffers to be append to the end of the chain which are related. + // If we split a node multiple times in same or different CP's, each dirty buffer will be + // added to the end of that chain. 
+ while (chain->m_next_buffer.lock() != nullptr) { + chain = chain->m_next_buffer.lock(); + } - // TODO the index buffer are added to end of the chain, instead add to the dependency. - auto& last_in_chain = r_cast< IndexCPContext* >(cp_ctx)->m_last_in_chain; - if (last_in_chain) { - // Add this to the end of the chain. - last_in_chain->m_next_buffer = second; + chain->m_next_buffer = second; second->m_wait_for_leaders.increment(1); } @@ -171,10 +187,10 @@ std::tuple< bool, bool > IndexWBCache::create_chain(IndexBufferPtr& second, Inde } void IndexWBCache::prepend_to_chain(const IndexBufferPtr& first, const IndexBufferPtr& second) { + assert(first->m_next_buffer.lock() != second); assert(first->m_next_buffer.lock() == nullptr); first->m_next_buffer = second; second->m_wait_for_leaders.increment(1); - LOGTRACEMOD(wbcache, "first {} second {}", first->to_string(), second->to_string()); } void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { @@ -187,6 +203,7 @@ void IndexWBCache::free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) { } //////////////////// CP Related API section ///////////////////////////////// + folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { LOGTRACEMOD(wbcache, "cp_ctx {}", cp_ctx->to_string()); if (!cp_ctx->any_dirty_buffers()) { @@ -194,6 +211,13 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { return folly::makeFuture< bool >(true); // nothing to flush } +#ifndef NDEBUG + // Check no cycles or invalid wait_for_leader count in the dirty buffer + // dependency graph. 
+ // cp_ctx->check_wait_for_leaders(); + // cp_ctx->check_cycle(); +#endif + cp_ctx->prepare_flush_iteration(); for (auto& fiber : m_cp_flush_fibers) { @@ -211,22 +235,22 @@ folly::Future< bool > IndexWBCache::async_cp_flush(IndexCPContext* cp_ctx) { return std::move(cp_ctx->get_future()); } -void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, const IndexBufferPtr& buf, bool part_of_batch) { - LOGTRACEMOD(wbcache, "buf {}", buf->to_string()); - buf->m_buf_state = index_buf_state_t::FLUSHING; +void IndexWBCache::do_flush_one_buf(IndexCPContext* cp_ctx, IndexBufferPtr buf, bool part_of_batch) { + LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); + buf->set_state(index_buf_state_t::FLUSHING); m_vdev->async_write(r_cast< const char* >(buf->raw_buffer()), m_node_size, buf->m_blkid, part_of_batch) - .thenValue([pbuf = buf.get(), cp_ctx](auto) { + .thenValue([buf, cp_ctx](auto) { auto& pthis = s_cast< IndexWBCache& >(wb_cache()); // Avoiding more than 16 bytes capture - pthis.process_write_completion(cp_ctx, pbuf); + pthis.process_write_completion(cp_ctx, buf); }); if (!part_of_batch) { m_vdev->submit_batch(); } } -void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBuffer* pbuf) { - LOGTRACEMOD(wbcache, "buf {}", pbuf->to_string()); +void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr buf) { + LOGTRACEMOD(wbcache, "cp {} buf {}", cp_ctx->id(), buf->to_string()); resource_mgr().dec_dirty_buf_size(m_node_size); - auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, pbuf); + auto [next_buf, has_more] = on_buf_flush_done(cp_ctx, buf); if (next_buf) { do_flush_one_buf(cp_ctx, next_buf, false); } else if (!has_more) { @@ -240,7 +264,7 @@ void IndexWBCache::process_write_completion(IndexCPContext* cp_ctx, IndexBuffer* } } -std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done(IndexCPContext* cp_ctx, IndexBuffer* buf) { +std::pair< IndexBufferPtr, bool > 
IndexWBCache::on_buf_flush_done(IndexCPContext* cp_ctx, IndexBufferPtr& buf) { if (m_cp_flush_fibers.size() > 1) { std::unique_lock lg(m_flush_mtx); return on_buf_flush_done_internal(cp_ctx, buf); @@ -249,9 +273,10 @@ std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done(IndexCPContext } } -std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(IndexCPContext* cp_ctx, IndexBuffer* buf) { +std::pair< IndexBufferPtr, bool > IndexWBCache::on_buf_flush_done_internal(IndexCPContext* cp_ctx, + IndexBufferPtr& buf) { static thread_local std::vector< IndexBufferPtr > t_buf_list; - buf->m_buf_state = index_buf_state_t::CLEAN; + buf->set_state(index_buf_state_t::CLEAN); t_buf_list.clear(); @@ -272,7 +297,7 @@ void IndexWBCache::get_next_bufs(IndexCPContext* cp_ctx, uint32_t max_count, std } } -void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBuffer* prev_flushed_buf, +void IndexWBCache::get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtr prev_flushed_buf, std::vector< IndexBufferPtr >& bufs) { uint32_t count{0}; diff --git a/src/lib/index/wb_cache.hpp b/src/lib/index/wb_cache.hpp index 7639d0714..9a652ce2a 100644 --- a/src/lib/index/wb_cache.hpp +++ b/src/lib/index/wb_cache.hpp @@ -49,23 +49,23 @@ class IndexWBCache : public IndexWBCacheBase { void realloc_buf(const IndexBufferPtr& buf) override; void write_buf(const BtreeNodePtr& node, const IndexBufferPtr& buf, CPContext* cp_ctx) override; void read_buf(bnodeid_t id, BtreeNodePtr& node, node_initializer_t&& node_initializer) override; - std::tuple< bool, bool > create_chain(IndexBufferPtr& second, IndexBufferPtr& third, CPContext* cp_ctx) override; + std::pair< bool, bool > create_chain(IndexBufferPtr& second, IndexBufferPtr& third, CPContext* cp_ctx) override; void prepend_to_chain(const IndexBufferPtr& first, const IndexBufferPtr& second) override; void free_buf(const IndexBufferPtr& buf, CPContext* cp_ctx) override; 
//////////////////// CP Related API section ///////////////////////////////// folly::Future< bool > async_cp_flush(IndexCPContext* context); - IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf) const; + IndexBufferPtr copy_buffer(const IndexBufferPtr& cur_buf, const CPContext *cp_ctx) const; private: void start_flush_threads(); - void process_write_completion(IndexCPContext* cp_ctx, IndexBuffer* pbuf); - void do_flush_one_buf(IndexCPContext* cp_ctx, const IndexBufferPtr& buf, bool part_of_batch); - std::pair< IndexBufferPtr, bool > on_buf_flush_done(IndexCPContext* cp_ctx, IndexBuffer* buf); - std::pair< IndexBufferPtr, bool > on_buf_flush_done_internal(IndexCPContext* cp_ctx, IndexBuffer* buf); + void process_write_completion(IndexCPContext* cp_ctx, IndexBufferPtr pbuf); + void do_flush_one_buf(IndexCPContext* cp_ctx, const IndexBufferPtr buf, bool part_of_batch); + std::pair< IndexBufferPtr, bool > on_buf_flush_done(IndexCPContext* cp_ctx, IndexBufferPtr& buf); + std::pair< IndexBufferPtr, bool > on_buf_flush_done_internal(IndexCPContext* cp_ctx, IndexBufferPtr& buf); void get_next_bufs(IndexCPContext* cp_ctx, uint32_t max_count, std::vector< IndexBufferPtr >& bufs); - void get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBuffer* prev_flushed_buf, + void get_next_bufs_internal(IndexCPContext* cp_ctx, uint32_t max_count, IndexBufferPtr prev_flushed_buf, std::vector< IndexBufferPtr >& bufs); }; } // namespace homestore diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 3bc943fc0..1072ef981 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -41,7 +41,7 @@ struct BtreeTestHelper : public testing::Test { using mutex = iomgr::FiberManagerLib::shared_mutex; using op_func_t = std::function< void(void) >; - BtreeTestHelper() : testing::Test(), m_range_scheduler{SISL_OPTIONS["num_entries"].as< uint32_t >()} {} + 
BtreeTestHelper() : testing::Test(), m_shadow_map{SISL_OPTIONS["num_entries"].as< uint32_t >()} {} void SetUp() override { m_cfg.m_leaf_node_type = T::leaf_node_type; @@ -71,7 +71,6 @@ struct BtreeTestHelper : public testing::Test { std::shared_ptr< typename T::BtreeType > m_bt; ShadowMap< K, V > m_shadow_map; BtreeConfig m_cfg{g_node_size}; - RangeScheduler m_range_scheduler; uint32_t m_max_range_input{1000}; bool m_is_multi_threaded{false}; @@ -94,7 +93,6 @@ struct BtreeTestHelper : public testing::Test { iomanager.run_on_forget(m_fibers[i], [this, start_range, end_range, &test_count]() { for (uint32_t i = start_range; i < end_range; i++) { put(i, btree_put_type::INSERT); - m_range_scheduler.put_key(i); } { std::unique_lock lg(m_test_done_mtx); @@ -114,7 +112,7 @@ struct BtreeTestHelper : public testing::Test { void put(uint64_t k, btree_put_type put_type) { do_put(k, put_type, V::generate_rand()); } void put_random() { - auto [start_k, end_k] = m_range_scheduler.pick_random_non_existing_keys(1); + auto [start_k, end_k] = m_shadow_map.pick_random_non_existing_keys(1); RELEASE_ASSERT_EQ(start_k, end_k, "Range scheduler pick_random_non_existing_keys issue"); do_put(start_k, btree_put_type::INSERT, V::generate_rand()); @@ -132,10 +130,8 @@ struct BtreeTestHelper : public testing::Test { if (update) { m_shadow_map.range_update(start_key, nkeys, value); - m_range_scheduler.remove_keys_from_working(start_k, end_k); } else { m_shadow_map.range_upsert(start_k, nkeys, value); - m_range_scheduler.put_keys(start_k, end_k); } } @@ -146,8 +142,8 @@ struct BtreeTestHelper : public testing::Test { static thread_local std::uniform_int_distribution< uint32_t > s_rand_range_generator{1, 50}; auto const [start_k, end_k] = is_update - ? m_range_scheduler.pick_random_existing_keys(s_rand_range_generator(m_re)) - : m_range_scheduler.pick_random_non_working_keys(s_rand_range_generator(m_re)); + ? 
m_shadow_map.pick_random_existing_keys(s_rand_range_generator(m_re)) + : m_shadow_map.pick_random_non_working_keys(s_rand_range_generator(m_re)); range_put(start_k, end_k, V::generate_rand(), is_update); } @@ -167,15 +163,13 @@ struct BtreeTestHelper : public testing::Test { m_shadow_map.validate_data(rreq.key(), (const V&)rreq.value()); m_shadow_map.erase(rreq.key()); } - m_range_scheduler.remove_key(k); } void remove_random() { - auto const [start_k, end_k] = m_range_scheduler.pick_random_existing_keys(1); + auto const [start_k, end_k] = m_shadow_map.pick_random_existing_keys(1); RELEASE_ASSERT_EQ(start_k, end_k, "Range scheduler pick_random_existing_keys issue"); remove_one(start_k); - m_range_scheduler.remove_key(start_k); } void range_remove_existing(uint32_t start_k, uint32_t count) { @@ -186,7 +180,7 @@ struct BtreeTestHelper : public testing::Test { void range_remove_existing_random() { static std::uniform_int_distribution< uint32_t > s_rand_range_generator{2, 5}; - auto const [start_k, end_k] = m_range_scheduler.pick_random_existing_keys(s_rand_range_generator(m_re)); + auto const [start_k, end_k] = m_shadow_map.pick_random_existing_keys(s_rand_range_generator(m_re)); do_range_remove(start_k, end_k, true /* only_existing */); } @@ -203,6 +197,7 @@ struct BtreeTestHelper : public testing::Test { void do_query(uint32_t start_k, uint32_t end_k, uint32_t batch_size) { std::vector< std::pair< K, V > > out_vector; + m_shadow_map.guard().lock(); uint32_t remaining = m_shadow_map.num_elems_in_range(start_k, end_k); auto it = m_shadow_map.map_const().lower_bound(K{start_k}); @@ -234,21 +229,23 @@ struct BtreeTestHelper : public testing::Test { ASSERT_EQ(ret, btree_status_t::success) << "Expected success on query"; ASSERT_EQ(out_vector.size(), 0) << "Received incorrect value on empty query pagination"; + m_shadow_map.guard().unlock(); + if (start_k < m_max_range_input) { - m_range_scheduler.remove_keys_from_working(start_k, std::min(end_k, m_max_range_input - 1)); + 
m_shadow_map.remove_keys_from_working(start_k, std::min(end_k, m_max_range_input - 1)); } } void query_random() { static thread_local std::uniform_int_distribution< uint32_t > s_rand_range_generator{1, 100}; - auto const [start_k, end_k] = m_range_scheduler.pick_random_non_working_keys(s_rand_range_generator(m_re)); + auto const [start_k, end_k] = m_shadow_map.pick_random_non_working_keys(s_rand_range_generator(m_re)); do_query(start_k, end_k, 79); } ////////////////////// All get operation variants /////////////////////////////// void get_all() const { - for (const auto& [key, value] : m_shadow_map.map_const()) { + m_shadow_map.foreach ([this](K key, V value) { auto copy_key = std::make_unique< K >(); *copy_key = key; auto out_v = std::make_unique< V >(); @@ -258,7 +255,7 @@ struct BtreeTestHelper : public testing::Test { ASSERT_EQ(ret, btree_status_t::success) << "Missing key " << key << " in btree but present in shadow map"; ASSERT_EQ((const V&)req.value(), value) << "Found value in btree doesn't return correct data for key=" << key; - } + }); } void get_specific(uint32_t k) const { @@ -280,6 +277,7 @@ struct BtreeTestHelper : public testing::Test { auto req = BtreeGetAnyRequest< K >{BtreeKeyRange< K >{K{start_k}, true, K{end_k}, true}, out_k.get(), out_v.get()}; const auto status = m_bt->get(req); + if (status == btree_status_t::success) { ASSERT_EQ(m_shadow_map.exists_in_range(*(K*)req.m_outkey, start_k, end_k), true) << "Get Any returned key=" << *(K*)req.m_outkey << " which is not in range " << start_k << "-" << end_k @@ -303,14 +301,32 @@ struct BtreeTestHelper : public testing::Test { void print_keys() const { m_bt->print_tree_keys(); } void compare_files(const std::string& before, const std::string& after) { - std::ifstream b(before); - std::ifstream a(after); - std::ostringstream ss_before, ss_after; - ss_before << b.rdbuf(); - ss_after << a.rdbuf(); - std::string s1 = ss_before.str(); - std::string s2 = ss_after.str(); - ASSERT_EQ(s1, s2) << "Mismatch 
in btree structure"; + std::ifstream b(before, std::ifstream::ate); + std::ifstream a(after, std::ifstream::ate); + if (a.fail() || b.fail()) { + LOGINFO("Failed to open file"); + assert(false); + } + if (a.tellg() != b.tellg()) { + LOGINFO("Mismatch in btree files"); + assert(false); + } + + int64_t pending = a.tellg(); + const int64_t batch_size = 4096; + a.seekg(0, ifstream::beg); + b.seekg(0, ifstream::beg); + char a_buffer[batch_size], b_buffer[batch_size]; + while (pending > 0) { + auto count = std::min(pending, batch_size); + a.read(a_buffer, count); + b.read(b_buffer, count); + if (std::memcmp(a_buffer, b_buffer, count) != 0) { + LOGINFO("Mismatch in btree files"); + assert(false); + } + pending -= count; + } } private: @@ -327,7 +343,6 @@ struct BtreeTestHelper : public testing::Test { } m_shadow_map.put_and_check(key, value, *existing_v, done); - m_range_scheduler.put_key(k); } void do_range_remove(uint64_t start_k, uint64_t end_k, bool all_existing) { @@ -336,6 +351,7 @@ struct BtreeTestHelper : public testing::Test { auto rreq = BtreeRangeRemoveRequest< K >{BtreeKeyRange< K >{start_key, true, end_key, true}}; auto const ret = m_bt->remove(rreq); + m_shadow_map.range_erase(start_key, end_key); if (all_existing) { @@ -344,7 +360,7 @@ struct BtreeTestHelper : public testing::Test { } if (start_k < m_max_range_input) { - m_range_scheduler.remove_keys(start_k, std::min(end_k, uint64_cast(m_max_range_input - 1))); + m_shadow_map.remove_keys(start_k, std::min(end_k, uint64_cast(m_max_range_input - 1))); } } @@ -380,4 +396,4 @@ struct BtreeTestHelper : public testing::Test { } LOGINFO("ALL parallel jobs joined"); } -}; \ No newline at end of file +}; diff --git a/src/tests/btree_helpers/shadow_map.hpp b/src/tests/btree_helpers/shadow_map.hpp index 1e7418122..f8c40e140 100644 --- a/src/tests/btree_helpers/shadow_map.hpp +++ b/src/tests/btree_helpers/shadow_map.hpp @@ -7,26 +7,36 @@ template < typename K, typename V > class ShadowMap { private: std::map< K, V > 
m_map; + RangeScheduler m_range_scheduler; + using mutex = iomgr::FiberManagerLib::shared_mutex; + mutex m_mutex; public: + ShadowMap(uint32_t num_keys) : m_range_scheduler(num_keys) {} + void put_and_check(const K& key, const V& val, const V& old_val, bool expected_success) { + std::lock_guard lock{m_mutex}; auto const [it, happened] = m_map.insert(std::make_pair(key, val)); ASSERT_EQ(happened, expected_success) << "Testcase issue, expected inserted slots to be in shadow map"; if (!happened) { ASSERT_EQ(old_val, it->second) << "Put: Existing value doesn't return correct data for key: " << it->first; } + m_range_scheduler.put_key(key.key()); } void range_upsert(uint64_t start_k, uint32_t count, const V& val) { + std::lock_guard lock{m_mutex}; for (uint32_t i{0}; i < count; ++i) { K key{start_k + i}; V range_value{val}; if constexpr (std::is_same_v< V, TestIntervalValue >) { range_value.shift(i); } m_map.insert_or_assign(key, range_value); } + m_range_scheduler.put_keys(start_k, start_k + count - 1); } void range_update(const K& start_key, uint32_t count, const V& new_val) { + std::lock_guard lock{m_mutex}; auto const start_it = m_map.lower_bound(start_key); auto it = start_it; uint32_t c = 0; @@ -34,9 +44,11 @@ class ShadowMap { it->second = new_val; ++it; } + m_range_scheduler.remove_keys_from_working(start_key.key(), start_key.key() + count - 1); } std::pair< K, K > pick_existing_range(const K& start_key, uint32_t max_count) const { + std::shared_lock lock{m_mutex}; auto const start_it = m_map.lower_bound(start_key); auto it = start_it; uint32_t count = 0; @@ -46,9 +58,13 @@ class ShadowMap { return std::pair(start_it->first, it->first); } - bool exists(const K& key) const { return m_map.find(key) != m_map.end(); } + bool exists(const K& key) const { + std::shared_lock lock{m_mutex}; + return m_map.find(key) != m_map.end(); + } bool exists_in_range(const K& key, uint64_t start_k, uint64_t end_k) const { + std::shared_lock lock{m_mutex}; const auto itlower = 
m_map.lower_bound(K{start_k}); const auto itupper = m_map.upper_bound(K{end_k}); auto it = itlower; @@ -59,7 +75,10 @@ class ShadowMap { return false; } - uint64_t size() const { return m_map.size(); } + uint64_t size() const { + std::shared_lock lock{m_mutex}; + return m_map.size(); + } uint32_t num_elems_in_range(uint64_t start_k, uint64_t end_k) const { const auto itlower = m_map.lower_bound(K{start_k}); @@ -68,29 +87,71 @@ class ShadowMap { } void validate_data(const K& key, const V& btree_val) const { + std::shared_lock lock{m_mutex}; const auto r = m_map.find(key); ASSERT_NE(r, m_map.end()) << "Key " << key.to_string() << " is not present in shadow map"; ASSERT_EQ(btree_val, r->second) << "Found value in btree doesn't return correct data for key=" << r->first; } - void erase(const K& key) { m_map.erase(key); } + void erase(const K& key) { + std::lock_guard lock{m_mutex}; + m_map.erase(key); + m_range_scheduler.remove_key(key.key()); + } void range_erase(const K& start_key, uint32_t count) { + std::lock_guard lock{m_mutex}; auto const it = m_map.lower_bound(start_key); uint32_t i{0}; while ((it != m_map.cend()) && (i++ < count)) { it = m_map.erase(it); } + m_range_scheduler.remove_keys(start_key.key(), start_key.key() + count); } void range_erase(const K& start_key, const K& end_key) { + std::lock_guard lock{m_mutex}; auto it = m_map.lower_bound(start_key); auto const end_it = m_map.upper_bound(end_key); while ((it != m_map.cend()) && (it != end_it)) { it = m_map.erase(it); } + m_range_scheduler.remove_keys(start_key.key(), end_key.key()); } + mutex& guard() { return m_mutex; } std::map< K, V >& map() { return m_map; } const std::map< K, V >& map_const() const { return m_map; } + + void foreach (std::function< void(K, V) > func) const { + std::shared_lock lock{m_mutex}; + for (const auto& [key, value] : m_map) { + func(key, value); + } + } + + std::pair< uint32_t, uint32_t > pick_random_non_existing_keys(uint32_t max_keys) { + std::shared_lock lock{m_mutex}; + 
return m_range_scheduler.pick_random_non_existing_keys(max_keys); + } + + std::pair< uint32_t, uint32_t > pick_random_existing_keys(uint32_t max_keys) { + std::shared_lock lock{m_mutex}; + return m_range_scheduler.pick_random_existing_keys(max_keys); + } + + std::pair< uint32_t, uint32_t > pick_random_non_working_keys(uint32_t max_keys) { + std::shared_lock lock{m_mutex}; + return m_range_scheduler.pick_random_non_working_keys(max_keys); + } + + void remove_keys_from_working(uint32_t s, uint32_t e) { + std::lock_guard lock{m_mutex}; + m_range_scheduler.remove_keys_from_working(s, e); + } + + void remove_keys(uint32_t start_key, uint32_t end_key) { + std::lock_guard lock{m_mutex}; + m_range_scheduler.remove_keys(start_key, end_key); + } }; diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 14062cd1f..4bab73ad0 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -42,13 +42,15 @@ SISL_OPTIONS_ENABLE(logging, test_index_btree, iomgr, test_common_setup) SISL_LOGGING_DECL(test_index_btree) std::vector< std::string > test_common::HSTestHelper::s_dev_names; -// TODO increase num_entries to 65k as io mgr page size is 512 and its slow. + +// TODO Add tests to do write,remove after recovery. +// TODO Test with var len key with io mgr page size is 512. 
SISL_OPTION_GROUP(test_index_btree, (num_iters, "", "num_iters", "number of iterations for rand ops", - ::cxxopts::value< uint32_t >()->default_value("1500"), "number"), + ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", - ::cxxopts::value< uint32_t >()->default_value("15000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) @@ -159,12 +161,7 @@ struct BtreeTest : public BtreeTestHelper< TestType > { } }; -// TODO sanal fix the varkey issue. -// using BtreeTypes = testing::Types< FixedLenBtreeTest, VarKeySizeBtreeTest, VarValueSizeBtreeTest, -// VarObjSizeBtreeTest -// >; - -using BtreeTypes = testing::Types< FixedLenBtreeTest >; +using BtreeTypes = testing::Types< FixedLenBtreeTest, VarKeySizeBtreeTest, VarValueSizeBtreeTest, VarObjSizeBtreeTest >; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); @@ -225,7 +222,6 @@ TYPED_TEST(BtreeTest, RandomInsert) { this->get_all(); } -#if 0 TYPED_TEST(BtreeTest, SequentialRemove) { LOGINFO("SequentialRemove test start"); // Forward sequential insert @@ -280,7 +276,6 @@ TYPED_TEST(BtreeTest, RandomRemove) { } this->get_all(); } -#endif TYPED_TEST(BtreeTest, RangeUpdate) { LOGINFO("RangeUpdate test start"); @@ -309,23 +304,29 @@ TYPED_TEST(BtreeTest, CpFlush) { for (uint32_t i = 0; i < num_entries; ++i) { this->put(i, btree_put_type::INSERT); } + + // Remove some of the entries. 
+ for (uint32_t i = 0; i < num_entries; i += 10) { + this->remove_one(i); + } + LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries / 2); this->do_query(0, num_entries / 2 - 1, 75); - this->print(std::string("before.txt")); - LOGINFO("Trigger checkpoint flush."); test_common::HSTestHelper::trigger_cp(true /* wait */); LOGINFO("Query {} entries and validate with pagination of 75 entries", num_entries); this->do_query(0, num_entries - 1, 75); + this->print(std::string("before.txt")); + this->destroy_btree(); // Restart homestore. m_bt is updated by the TestIndexServiceCallback. this->restart_homestore(); - std::this_thread::sleep_for(std::chrono::seconds{3}); + std::this_thread::sleep_for(std::chrono::seconds{1}); LOGINFO("Restarted homestore with index recovered"); this->print(std::string("after.txt")); @@ -373,7 +374,7 @@ TYPED_TEST(BtreeTest, MultipleCpFlush) { // Restart homestore. m_bt is updated by the TestIndexServiceCallback. this->restart_homestore(); - std::this_thread::sleep_for(std::chrono::seconds{3}); + std::this_thread::sleep_for(std::chrono::seconds{1}); LOGINFO(" Restarted homestore with index recovered"); this->print(std::string("after.txt")); @@ -388,24 +389,41 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { LOGINFO("ThreadedCpFlush test start"); const auto num_entries = SISL_OPTIONS["num_entries"].as< uint32_t >(); - bool stop_cp_flush = false; - auto io_thread = std::thread([this, num_entries] { + bool stop = false; + std::atomic< uint32_t > last_index{0}; + auto insert_io_thread = std::thread([this, num_entries, &last_index] { LOGINFO("Do Forward sequential insert for {} entries", num_entries); + uint32_t j = 0; for (uint32_t i = 0; i < num_entries; ++i) { this->put(i, btree_put_type::INSERT); + last_index = i; } }); - auto cp_flush_thread = std::thread([this, &stop_cp_flush] { - while (!stop_cp_flush) { - LOGINFO("Trigger checkpoint flush wait=false."); - test_common::HSTestHelper::trigger_cp(false /* wait */); + auto 
remove_io_thread = std::thread([this, &stop, num_entries, &last_index] { + LOGINFO("Do random removes for {} entries", num_entries); + while (!stop) { + std::this_thread::sleep_for(std::chrono::milliseconds{10}); + // Remove a random entry. + std::uniform_int_distribution< uint32_t > rand{0, last_index.load()}; + auto rm_idx = rand(g_re); + LOGINFO("Removing entry {}", rm_idx); + this->remove_one(rm_idx); + } + }); + + auto cp_flush_thread = std::thread([this, &stop] { + while (!stop) { std::this_thread::sleep_for(std::chrono::seconds{1}); + LOGINFO("Trigger checkpoint flush wait=true."); + test_common::HSTestHelper::trigger_cp(false /* wait */); + LOGINFO("Trigger checkpoint flush wait=true done."); } }); - io_thread.join(); - stop_cp_flush = true; + insert_io_thread.join(); + stop = true; + remove_io_thread.join(); cp_flush_thread.join(); LOGINFO("Trigger checkpoint flush wait=true."); @@ -420,7 +438,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { // Restart homestore. m_bt is updated by the TestIndexServiceCallback. 
this->restart_homestore(); - std::this_thread::sleep_for(std::chrono::seconds{3}); + std::this_thread::sleep_for(std::chrono::seconds{1}); LOGINFO(" Restarted homestore with index recovered"); this->print(std::string("after.txt")); diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 68bf4003d..bb794fca3 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -41,14 +41,15 @@ SISL_OPTION_GROUP( (num_entries, "", "num_entries", "number of entries to test with", ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), - (n_threads, "", "n_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), - (n_fibers, "", "n_fibers", "number of fibers", ::cxxopts::value< uint32_t >()->default_value("10"), "number"), + (n_threads, "", "num_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), + (n_fibers, "", "num_fibers", "number of fibers", ::cxxopts::value< uint32_t >()->default_value("10"), "number"), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree with", ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), (seed, "", "seed", "random engine seed, use random if not defined", - ::cxxopts::value< uint64_t >()->default_value("0"), "number")) + ::cxxopts::value< uint64_t >()->default_value("0"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds")) struct FixedLenBtreeTest { using BtreeType = MemBtree< TestFixedKey, TestFixedValue >; @@ -265,7 +266,7 @@ TYPED_TEST(BtreeTest, RandomRemoveRange) { this->put(i, btree_put_type::INSERT); } // generate keys 
including out of bound - static thread_local std::uniform_int_distribution< uint32_t > s_rand_key_generator{0, 2 * num_entries}; + static thread_local std::uniform_int_distribution< uint32_t > s_rand_key_generator{0, num_entries}; // this->print_keys(); LOGINFO("Step 2: Do range remove for maximum of {} iterations", num_iters); for (uint32_t i{0}; (i < num_iters) && this->m_shadow_map.size(); ++i) { @@ -289,10 +290,10 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType > { BtreeConcurrentTest() { this->m_is_multi_threaded = true; } void SetUp() override { - LOGINFO("Starting iomgr with {} threads", SISL_OPTIONS["n_threads"].as< uint32_t >()); - ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = SISL_OPTIONS["n_threads"].as< uint32_t >(), + LOGINFO("Starting iomgr with {} threads", SISL_OPTIONS["num_threads"].as< uint32_t >()); + ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = SISL_OPTIONS["num_threads"].as< uint32_t >(), .is_spdk = false, - .num_fibers = 1 + SISL_OPTIONS["n_fibers"].as< uint32_t >(), + .num_fibers = 1 + SISL_OPTIONS["num_fibers"].as< uint32_t >(), .app_mem_size_mb = 0, .hugepage_size_mb = 0}); From e44aa46b8955f2786c471dc0fa5322ababb3fd96 Mon Sep 17 00:00:00 2001 From: Mehdi Hosseini <116847813+shosseinimotlagh@users.noreply.github.com> Date: Mon, 13 Nov 2023 12:56:31 -0800 Subject: [PATCH 3/9] Long running for index service. 
(#219) * Main changes add concurrent UT for index tree change btree_test.py (add device list and run time) introduce run_time --- conanfile.py | 2 +- src/tests/btree_helpers/btree_test_helper.hpp | 7 +- .../test_common/homestore_test_common.hpp | 11 ++- src/tests/test_index_btree.cpp | 97 +++++++++++++++++++ src/tests/test_mem_btree.cpp | 4 +- src/tests/test_scripts/btree_test.py | 35 ++++--- 6 files changed, 136 insertions(+), 20 deletions(-) diff --git a/conanfile.py b/conanfile.py index 783fbbd23..a25e48964 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.8.2" + version = "4.8.3" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp index 1072ef981..606aceba0 100644 --- a/src/tests/btree_helpers/btree_test_helper.hpp +++ b/src/tests/btree_helpers/btree_test_helper.hpp @@ -51,6 +51,7 @@ struct BtreeTestHelper : public testing::Test { if (m_is_multi_threaded) { std::mutex mtx; + m_run_time = SISL_OPTIONS["run_time"].as< uint32_t >(); iomanager.run_on_wait(iomgr::reactor_regex::all_io, [this, &mtx]() { auto fv = iomanager.sync_io_capable_fibers(); std::unique_lock lg(mtx); @@ -73,6 +74,7 @@ struct BtreeTestHelper : public testing::Test { BtreeConfig m_cfg{g_node_size}; uint32_t m_max_range_input{1000}; bool m_is_multi_threaded{false}; + uint32_t m_run_time{0}; std::map< std::string, op_func_t > m_operations; std::vector< iomgr::io_fiber_t > m_fibers; @@ -378,8 +380,9 @@ struct BtreeTestHelper : public testing::Test { // Construct a weighted distribution based on the input frequencies std::discrete_distribution< uint32_t > s_rand_op_generator(weights.begin(), weights.end()); - - for (uint32_t i = 0; i < num_iters_per_thread; i++) { + auto m_start_time = Clock::now(); + auto time_to_stop = [this, m_start_time]() {return 
(get_elapsed_time_sec(m_start_time) > m_run_time);}; + for (uint32_t i = 0; i < num_iters_per_thread && !time_to_stop(); i++) { uint32_t op_idx = s_rand_op_generator(re); (this->m_operations[op_list[op_idx].first])(); } diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 4729fc634..b1208feb5 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -39,6 +39,8 @@ const std::string USER_WANT_DIRECT_IO{"USER_WANT_DIRECT_IO"}; // u SISL_OPTION_GROUP(test_common_setup, (num_threads, "", "num_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), + (num_fibers, "", "num_fibers", "number of fibers per thread", + ::cxxopts::value< uint32_t >()->default_value("2"), "number"), (num_devs, "", "num_devs", "number of devices to create", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), (dev_size_mb, "", "dev_size_mb", "size of each device in MB", @@ -111,7 +113,8 @@ class HSTestHelper { hs_before_services_starting_cb_t cb = nullptr, bool restart = false) { auto const ndevices = SISL_OPTIONS["num_devs"].as< uint32_t >(); auto const dev_size = SISL_OPTIONS["dev_size_mb"].as< uint64_t >() * 1024 * 1024; - auto nthreads = SISL_OPTIONS["num_threads"].as< uint32_t >(); + auto num_threads = SISL_OPTIONS["num_threads"].as< uint32_t >(); + auto num_fibers = SISL_OPTIONS["num_fibers"].as< uint32_t >(); auto is_spdk = SISL_OPTIONS["spdk"].as< bool >(); if (restart) { @@ -145,11 +148,11 @@ class HSTestHelper { if (is_spdk) { LOGINFO("Spdk with more than 2 threads will cause overburden test systems, changing nthreads to 2"); - nthreads = 2; + num_threads = 2; } - LOGINFO("Starting iomgr with {} threads, spdk: {}", nthreads, is_spdk); - ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = nthreads, .is_spdk = is_spdk}); + LOGINFO("Starting iomgr with {} threads, spdk: {}", num_threads, is_spdk); + 
ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = num_threads, .is_spdk = is_spdk, .num_fibers = num_fibers}); auto const http_port = SISL_OPTIONS["http_port"].as< int >(); if (http_port != 0) { diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 4bab73ad0..e110a931d 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -51,6 +51,12 @@ SISL_OPTION_GROUP(test_index_btree, ::cxxopts::value< uint32_t >()->default_value("500"), "number"), (num_entries, "", "num_entries", "number of entries to test with", ::cxxopts::value< uint32_t >()->default_value("5000"), "number"), + (run_time, "", "run_time", "run time for io", ::cxxopts::value< uint32_t >()->default_value("360000"), "seconds"), + (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), + (operation_list, "", "operation_list", "operation list instead of default created following by percentage", + ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), + (preload_size, "", "preload_size", "number of entries to preload tree with", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), (seed, "", "seed", "random engine seed, use random if not defined", ::cxxopts::value< uint64_t >()->default_value("0"), "number")) @@ -449,6 +455,97 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { LOGINFO("ThreadedCpFlush test end"); } +template < typename TestType > +struct BtreeConcurrentTest : public BtreeTestHelper< TestType > { + + using T = TestType; + using K = typename TestType::KeyType; + using V = typename TestType::ValueType; + class TestIndexServiceCallbacks : public IndexServiceCallbacks { + public: + TestIndexServiceCallbacks(BtreeConcurrentTest* test) : m_test(test) {} + std::shared_ptr< IndexTableBase > on_index_table_found(const superblk< index_table_sb >& sb) override { + LOGINFO("Index table recovered"); + LOGINFO("Root bnode_id {} version {}", sb->root_node, 
sb->link_version); + m_test->m_bt = std::make_shared< typename T::BtreeType >(sb, m_test->m_cfg); + return m_test->m_bt; + } + + private: + BtreeConcurrentTest* m_test; + }; + + BtreeConcurrentTest() { this->m_is_multi_threaded = true; } + + void SetUp() override { + test_common::HSTestHelper::start_homestore( + "test_index_btree", + {{HS_SERVICE::META, {.size_pct = 10.0}}, + {HS_SERVICE::INDEX, {.size_pct = 70.0, .index_svc_cbs = new TestIndexServiceCallbacks(this)}}}); + + LOGINFO("Node size {} ", hs()->index_service().node_size()); + this->m_cfg = BtreeConfig(hs()->index_service().node_size()); + + auto uuid = boost::uuids::random_generator()(); + auto parent_uuid = boost::uuids::random_generator()(); + + // Test cp flush of write back. + HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { + s.generic.cache_max_throttle_cnt = 10000; + HS_SETTINGS_FACTORY().save(); + }); + homestore::hs()->resource_mgr().reset_dirty_buf_qd(); + + // Create index table and attach to index service. + BtreeTestHelper< TestType >::SetUp(); + this->m_bt = std::make_shared< typename T::BtreeType >(uuid, parent_uuid, 0, this->m_cfg); + hs()->index_service().add_index_table(this->m_bt); + LOGINFO("Added index table to index service"); + } + + void TearDown() override { + BtreeTestHelper< TestType >::TearDown(); + test_common::HSTestHelper::shutdown_homestore(); + } +}; + +TYPED_TEST_SUITE(BtreeConcurrentTest, BtreeTypes); +TYPED_TEST(BtreeConcurrentTest, ConcurrentAllOps) { + // range put is not supported for non-extent keys + std::vector< std::string > input_ops = {"put:20", "remove:20", "range_put:20", "range_remove:20", "query:20"}; + std::vector< std::pair< std::string, int > > ops; + if (SISL_OPTIONS.count("operation_list")) { + input_ops = SISL_OPTIONS["operation_list"].as< std::vector< std::string > >(); + } + int total = std::accumulate(input_ops.begin(), input_ops.end(), 0, [](int sum, const auto& str) { + std::vector< std::string > tokens; + boost::split(tokens, str, 
boost::is_any_of(":")); + if (tokens.size() == 2) { + try { + return sum + std::stoi(tokens[1]); + } catch (const std::exception&) { + // Invalid frequency, ignore this element + } + } + return sum; // Ignore malformed strings + }); + + std::transform(input_ops.begin(), input_ops.end(), std::back_inserter(ops), [total](const auto& str) { + std::vector< std::string > tokens; + boost::split(tokens, str, boost::is_any_of(":")); + if (tokens.size() == 2) { + try { + return std::make_pair(tokens[0], (int)(100.0 * std::stoi(tokens[1]) / total)); + } catch (const std::exception&) { + // Invalid frequency, ignore this element + } + } + return std::make_pair(std::string(), 0); + }); + + this->multi_op_execute(ops); +} + int main(int argc, char* argv[]) { int parsed_argc{argc}; ::testing::InitGoogleTest(&parsed_argc, argv); diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index bb794fca3..94e78d53c 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -41,8 +41,8 @@ SISL_OPTION_GROUP( (num_entries, "", "num_entries", "number of entries to test with", ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), (disable_merge, "", "disable_merge", "disable_merge", ::cxxopts::value< bool >()->default_value("0"), ""), - (n_threads, "", "num_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), - (n_fibers, "", "num_fibers", "number of fibers", ::cxxopts::value< uint32_t >()->default_value("10"), "number"), + (num_threads, "", "num_threads", "number of threads", ::cxxopts::value< uint32_t >()->default_value("2"), "number"), + (num_fibers, "", "num_fibers", "number of fibers", ::cxxopts::value< uint32_t >()->default_value("10"), "number"), (operation_list, "", "operation_list", "operation list instead of default created following by percentage", ::cxxopts::value< std::vector< std::string > >(), "operations [...]"), (preload_size, "", "preload_size", "number of entries to preload tree 
with", diff --git a/src/tests/test_scripts/btree_test.py b/src/tests/test_scripts/btree_test.py index e87059e12..ed0389f4b 100644 --- a/src/tests/test_scripts/btree_test.py +++ b/src/tests/test_scripts/btree_test.py @@ -12,16 +12,18 @@ opts, args = getopt.getopt(sys.argv[1:], 'tdlme:', ['test_suits=', 'dirpath=', 'op_list=', 'log_mods=', 'threads=', 'fibers=', 'preload_size=', - 'op_list=', 'num_entries=', 'num_iters=']) + 'op_list=', 'num_entries=', 'num_iters=', 'dev_list=', 'run_time=']) test_suits = "" dirpath = "./" op_list = "" log_mods = "" -threads = " --n_threads=10" -fibers = " --n_fibers=10" -preload_size = " --preload_size=2000" -num_entries = " --num_entries=10000" -num_iters = " --num_iters=1000000" +threads = " --num_threads=10" +fibers = " --num_fibers=10" +preload_size = " --preload_size=16384" +num_entries = " --num_entries=65536" +num_iters = " --num_iters=10000000" +run_time = " --run_time=36000" +dev_list = "" for opt, arg in opts: if opt in ('-t', '--test_suits'): @@ -38,13 +40,13 @@ log_mods = arg print("log_mods (%s)" % arg) if opt in ('-f', '--fibers'): - fibers = " --n_fibers=" + arg + fibers = " --num_fibers=" + arg print("number of fibers per thread (%s)" % arg) if opt in ('-p', '--preload_size'): preload_size = " --preload_size=" + arg print("preload_size = (%s)" % arg) if opt in ('-t', '--threads'): - threads = " --n_threads=" + arg + threads = " --num_threads=" + arg print("number of threads (%s)" % arg) if opt in ('-n', '--num_entries'): num_entries = " --num_entries=" + arg @@ -52,19 +54,30 @@ if opt in ('-i', '--num_iters'): num_iters = " --num_iters=" + arg print("number of iterations (%s)" % arg) + if opt in ('-r', '--run_time'): + run_time = " --run_time=" + arg + print("total run time (%s)" % arg) + if opt in ('-v', '--dev_list'): + dev_list = arg + print(("device list (%s)") % (arg)) operations = "" if bool(op_list and op_list.strip()): operations = ''.join([f' --operation_list={op}' for op in op_list.split()]) -btree_options 
= num_entries + num_iters + preload_size + fibers + threads + operations +addln_opts = ' ' +if bool(dev_list and dev_list.strip()): + addln_opts += ' --device_list ' + addln_opts += dev_list + +btree_options = num_entries + num_iters + preload_size + fibers + threads + operations + run_time + addln_opts def normal(): print("normal test started with (%s)" % btree_options) # " --operation_list=query:20 --operation_list=put:20 --operation_list=remove:20" - cmd_opts = " --gtest_filter=BtreeConcurrentTest/*.AllTree" + btree_options + " "+log_mods - subprocess.check_call(dirpath + "test_mem_btree " + cmd_opts, stderr=subprocess.STDOUT, shell=True) + cmd_opts = " --gtest_filter=BtreeConcurrentTest/*.ConcurrentAllOps" + btree_options + " "+log_mods + subprocess.check_call(dirpath + "test_index_btree " + cmd_opts, stderr=subprocess.STDOUT, shell=True) print("normal test completed") From 74b8e78e76a5a66b4eba950fa4886bb11d5f4e0c Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Wed, 15 Nov 2023 02:07:15 +0800 Subject: [PATCH 4/9] fix NDEBUG (#228) --- src/include/homestore/btree/detail/prefix_node.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index 8f2f3b2fd..c0ee8844e 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -770,7 +770,7 @@ class FixedPrefixNode : public VariantNode< K, V > { phdr->tail_slot = phdr->used_slots; } -#ifdef _DEBUG +#ifndef NDEBUG void validate_sanity() { uint32_t i{0}; // validate if keys are in ascending order From 115c000fcfb0ffaf225d48156eaeec0bd081a654 Mon Sep 17 00:00:00 2001 From: Harihara Kadayam Date: Thu, 21 Dec 2023 17:30:50 -0800 Subject: [PATCH 5/9] Next set of code to get the raft based replication. 
(#230) Major changes in this PR i * Foundational code and framework for Raft replication with RaftReplService override and RaftReplDev/RaftStateMachine * It does have barebone code and no testing related to the Raft code, but at this time it compiles and existing test cases run successfully. * Added home raft logstore test case * Homestore replication code next set of changes * Introduces the test case framework which creates multiple members and run the unit test and ensure success of replication * Fixed folly init RAII preventing from globalCPUExecutor execution --- conanfile.py | 2 +- src/CMakeLists.txt | 3 + src/include/homestore/blkdata_service.hpp | 9 +- src/include/homestore/homestore.hpp | 2 +- src/include/homestore/homestore_decl.hpp | 3 +- .../homestore/replication/repl_decls.h | 9 +- src/include/homestore/replication/repl_dev.h | 58 ++- src/include/homestore/replication_service.hpp | 17 +- src/include/homestore/superblk_handler.hpp | 2 +- src/lib/blkdata_svc/blkdata_service.cpp | 2 + src/lib/common/homestore_config.fbs | 31 ++ src/lib/homestore.cpp | 9 +- src/lib/logstore/log_dev.cpp | 52 +- src/lib/replication/CMakeLists.txt | 23 +- src/lib/replication/fetch_data_rpc.fbs | 34 ++ .../log_store/home_raft_log_store.cpp | 267 +++++++++++ .../log_store/home_raft_log_store.h | 180 +++++++ .../replication/log_store/repl_log_store.cpp | 70 +++ .../replication/log_store/repl_log_store.h | 33 ++ .../log_store/storage_engine_buffer.h | 251 ++++++++++ src/lib/replication/push_data_rpc.fbs | 13 + src/lib/replication/repl_dev/common.cpp | 51 ++ src/lib/replication/repl_dev/common.h | 89 ++++ .../replication/repl_dev/raft_repl_dev.cpp | 451 ++++++++++++++++++ src/lib/replication/repl_dev/raft_repl_dev.h | 118 +++++ .../repl_dev/raft_state_machine.cpp | 184 +++++++ .../replication/repl_dev/raft_state_machine.h | 123 +++++ .../replication/repl_dev/solo_repl_dev.cpp | 8 +- src/lib/replication/repl_dev/solo_repl_dev.h | 1 + .../replication/service/generic_repl_svc.cpp | 
146 +++--- .../replication/service/generic_repl_svc.h | 37 +- .../replication/service/raft_repl_service.cpp | 243 ++++++++++ .../replication/service/raft_repl_service.h | 77 +++ .../replication/service/repl_service_impl.cpp | 180 ------- src/tests/CMakeLists.txt | 32 +- src/tests/log_dev_benchmark.cpp | 2 +- src/tests/log_store_benchmark.cpp | 2 +- src/tests/test_append_blkalloc.cpp | 6 +- src/tests/test_blk_cache_queue.cpp | 2 +- src/tests/test_blk_read_tracker.cpp | 2 +- src/tests/test_blkalloc.cpp | 2 +- src/tests/test_blkid.cpp | 2 +- src/tests/test_btree_node.cpp | 2 +- .../test_common/homestore_test_common.hpp | 73 ++- src/tests/test_common/hs_repl_test_common.hpp | 252 ++++++++++ src/tests/test_cp_mgr.cpp | 2 +- src/tests/test_data_service.cpp | 16 +- src/tests/test_device_manager.cpp | 2 +- src/tests/test_home_raft_logstore.cpp | 275 +++++++++++ src/tests/test_index_btree.cpp | 2 +- src/tests/test_journal_vdev.cpp | 4 +- src/tests/test_log_dev.cpp | 2 +- src/tests/test_log_store.cpp | 2 +- src/tests/test_mem_btree.cpp | 2 +- src/tests/test_meta_blk_mgr.cpp | 3 +- src/tests/test_pdev.cpp | 2 +- src/tests/test_raft_repl_dev.cpp | 265 ++++++++++ src/tests/test_solo_repl_dev.cpp | 81 +--- 58 files changed, 3320 insertions(+), 493 deletions(-) create mode 100644 src/lib/replication/fetch_data_rpc.fbs create mode 100644 src/lib/replication/log_store/home_raft_log_store.cpp create mode 100644 src/lib/replication/log_store/home_raft_log_store.h create mode 100644 src/lib/replication/log_store/repl_log_store.cpp create mode 100644 src/lib/replication/log_store/repl_log_store.h create mode 100644 src/lib/replication/log_store/storage_engine_buffer.h create mode 100644 src/lib/replication/push_data_rpc.fbs create mode 100644 src/lib/replication/repl_dev/common.cpp create mode 100644 src/lib/replication/repl_dev/common.h create mode 100644 src/lib/replication/repl_dev/raft_repl_dev.cpp create mode 100644 src/lib/replication/repl_dev/raft_repl_dev.h create mode 100644 
src/lib/replication/repl_dev/raft_state_machine.cpp create mode 100644 src/lib/replication/repl_dev/raft_state_machine.h create mode 100644 src/lib/replication/service/raft_repl_service.cpp create mode 100644 src/lib/replication/service/raft_repl_service.h delete mode 100644 src/lib/replication/service/repl_service_impl.cpp create mode 100644 src/tests/test_common/hs_repl_test_common.hpp create mode 100644 src/tests/test_home_raft_logstore.cpp create mode 100644 src/tests/test_raft_repl_dev.cpp diff --git a/conanfile.py b/conanfile.py index a25e48964..5f27aa413 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.8.3" + version = "4.9.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 30329c54e..35d2c94eb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,12 +8,15 @@ find_package(isa-l REQUIRED) find_package(iomgr REQUIRED) find_package(farmhash REQUIRED) find_package(GTest REQUIRED) +find_package(NuraftMesg REQUIRED) set (COMMON_DEPS iomgr::iomgr farmhash::farmhash isa-l::isa-l sisl::sisl + nuraft::nuraft + NuraftMesg::proto ) set(COMMON_TEST_DEPS diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index da0d5d403..1c5692642 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -99,7 +99,7 @@ class BlkDataService { * @return A Future that will resolve to an error code indicating the result of the write operation. 
*/ folly::Future< std::error_code > async_write(const char* buf, uint32_t size, MultiBlkId const& bid, - bool part_of_batch); + bool part_of_batch = false); /** * @brief : asynchronous write with input block ids; * @@ -171,6 +171,13 @@ class BlkDataService { */ uint32_t get_blk_size() const { return m_blk_size; } + /** + * @brief : get the alignment size of this data service; + * + * @return : align size + */ + uint32_t get_align_size() const; + /** * @brief : get the read block tracker handle; * diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 2c9d51d05..263986639 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -114,7 +114,7 @@ class HomeStore { std::unique_ptr< MetaBlkService > m_meta_service; std::unique_ptr< LogStoreService > m_log_service; std::unique_ptr< IndexService > m_index_service; - std::unique_ptr< ReplicationService > m_repl_service; + std::shared_ptr< ReplicationService > m_repl_service; std::unique_ptr< DeviceManager > m_dev_mgr; shared< sisl::logging::logger_t > m_periodic_logger; diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 99c6f234e..26e863c14 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -198,7 +198,6 @@ struct cap_attrs { ////////////// Misc /////////////////// #define HOMESTORE_LOG_MODS \ - btree_structures, btree_nodes, btree_generics, btree, cache, device, blkalloc, vol_io_wd, volume, flip, cp, \ - metablk, indx_mgr, wbcache, logstore, replay, transient, IOMGR_LOG_MODS + btree, device, blkalloc, cp, metablk, wbcache, logstore, transient, replication, nuraft_mesg, nuraft, IOMGR_LOG_MODS } // namespace homestore diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index c496c37e2..9f9fee69f 100644 --- a/src/include/homestore/replication/repl_decls.h +++
b/src/include/homestore/replication/repl_decls.h @@ -18,8 +18,8 @@ using blkid_list_t = folly::small_vector< BlkId, 4 >; // Fully qualified domain pba, unique pba id across replica set struct RemoteBlkId { RemoteBlkId() = default; - RemoteBlkId(uint32_t s, const BlkId& b) : server_id{s}, blkid{b} {} - uint32_t server_id{0}; + RemoteBlkId(int32_t s, const MultiBlkId& b) : server_id{s}, blkid{b} {} + int32_t server_id{0}; MultiBlkId blkid; bool operator==(RemoteBlkId const& o) const { return (server_id == o.server_id) && (blkid == o.blkid); } @@ -27,9 +27,8 @@ struct RemoteBlkId { using remote_blkid_list_t = folly::small_vector< RemoteBlkId, 4 >; -// data service api names -static std::string const SEND_DATA{"send_data"}; -static std::string const FETCH_DATA{"fetch_data"}; +using replica_id_t = uuid_t; +using group_id_t = uuid_t; } // namespace homestore diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index 4ebbd1438..6fe221a84 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -14,14 +14,15 @@ namespace nuraft { template < typename T > using ptr = std::shared_ptr< T >; -// class buffer; -class buffer { -public: - static ptr< buffer > alloc(uint32_t size) { return std::make_shared< buffer >(); } -}; // Temporary till we get nuraft included by homestore impl - +class buffer; } // namespace nuraft +namespace sisl { +class GenericRpcData; +} + +void intrusive_ptr_release(sisl::GenericRpcData*); + namespace homestore { class ReplDev; struct repl_req_ctx; @@ -30,6 +31,7 @@ using repl_req_ptr_t = boost::intrusive_ptr< repl_req_ctx >; VENUM(repl_req_state_t, uint32_t, INIT = 0, // Initial state + BLK_ALLOCATED = 1 << 0, // Local block is allocated DATA_RECEIVED = 1 << 1, // Data has been received and being written to the storage DATA_WRITTEN = 1 << 2, // Data has been written to the storage LOG_RECEIVED = 1 << 3, // Log is received and waiting for data @@ 
-47,6 +49,9 @@ struct repl_key { std::hash< uint64_t >()(rk.dsn); } }; + + bool operator==(repl_key const& other) const = default; + std::string to_string() const { return fmt::format("server={}, term={}, dsn={}", server_id, term, dsn); } }; struct repl_journal_entry; @@ -63,11 +68,15 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: raft_buf_ptr_t& raft_journal_buf(); uint8_t* raw_journal_buf(); + std::string to_string() const; + std::string to_compact_string() const; + public: - repl_key rkey; // Unique key for the request - sisl::blob header; // User header - sisl::blob key; // User supplied key for this req - int64_t lsn{0}; // Lsn for this replication req + repl_key rkey; // Unique key for the request + sisl::blob header; // User header + sisl::blob key; // User supplied key for this req + int64_t lsn{0}; // Lsn for this replication req + bool is_proposer{false}; // Is the repl_req proposed by this node //////////////// Value related section ///////////////// sisl::sg_list value; // Raw value - applicable only to leader req @@ -80,12 +89,14 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: //////////////// Replication state related section ///////////////// std::mutex state_mtx; - std::atomic< repl_req_state_t > state{repl_req_state_t::INIT}; // State of the replication request - folly::Promise< folly::Unit > data_written_promise; // Promise to be fulfilled when data is written + std::atomic< uint32_t > state{uint32_cast(repl_req_state_t::INIT)}; // State of the replication request + folly::Promise< folly::Unit > data_written_promise; // Promise to be fulfilled when data is written //////////////// Communication packet/builder section ///////////////// sisl::io_blob_list_t pkts; flatbuffers::FlatBufferBuilder fb_builder; + sisl::io_blob_safe buf_for_unaligned_data; + intrusive< sisl::GenericRpcData > rpc_data; }; // @@ -95,7 +106,7 @@ class ReplDevListener { public: virtual ~ReplDevListener() 
= default; - void set_repl_dev(ReplDev* rdev) { m_repl_dev = std::move(rdev); } + void set_repl_dev(ReplDev* rdev) { m_repl_dev = rdev; } virtual ReplDev* repl_dev() { return m_repl_dev; } /// @brief Called when the log entry has been committed in the replica set. @@ -156,9 +167,9 @@ class ReplDevListener { /// write. In cases where caller don't care about the hints can return default blk_alloc_hints. /// /// @param header Header originally passed with repl_dev::async_alloc_write() api on the leader - /// @param Original context passed as part of repl_dev::async_alloc_write + /// @param data_size Size that needs to be allocated for this write /// @return Expected to return blk_alloc_hints for this write - virtual blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, cintrusive< repl_req_ctx >& ctx) = 0; + virtual blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) = 0; /// @brief Called when the replica set is being stopped virtual void on_replica_stop() = 0; @@ -217,14 +228,23 @@ class ReplDev { /// @brief Gets the group_id this repldev is working for /// @return group_id - virtual uuid_t group_id() const = 0; - - virtual void attach_listener(std::unique_ptr< ReplDevListener > listener) { m_listener = std::move(listener); } + virtual group_id_t group_id() const = 0; + /// @brief Gets the block size with which IO will happen on this device + /// @return Block size virtual uint32_t get_blk_size() const = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } + protected: - std::unique_ptr< ReplDevListener > m_listener; + shared< ReplDevListener > m_listener; }; } // namespace homestore + +template <> +struct fmt::formatter< homestore::repl_key > : fmt::formatter< std::string > { + auto format(const homestore::repl_key& a, format_context& ctx) const { + return fmt::formatter< std::string >::format(a.to_string(), ctx); + } +}; diff --git a/src/include/homestore/replication_service.hpp
b/src/include/homestore/replication_service.hpp index fe1469e5d..72d24d626 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -64,15 +64,16 @@ class ReplicationService { /// @param members List of members to form this group /// @param listener state machine listener of all the events happening on the repl_dev (commit, precommit etc) /// @return A Future ReplDev on success or Future ReplServiceError upon error - virtual AsyncReplResult< shared< ReplDev > > create_repl_dev(uuid_t group_id, - std::set< uuid_t, std::less<> >&& members) = 0; + virtual AsyncReplResult< shared< ReplDev > > create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) = 0; - virtual AsyncReplResult<> replace_member(uuid_t group_id, uuid_t member_out, uuid_t member_in) const = 0; + virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, + replica_id_t member_in) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist - virtual ReplResult< shared< ReplDev > > get_repl_dev(uuid_t group_id) const = 0; + virtual ReplResult< shared< ReplDev > > get_repl_dev(group_id_t group_id) const = 0; /// @brief Iterate over all repl devs and then call the callback provided /// @param cb Callback with repl dev @@ -97,11 +98,13 @@ class ReplApplication { // Called when the repl dev is found upon restart of the homestore instance. The caller should return an instance of // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. 
- virtual std::unique_ptr< ReplDevListener > create_repl_dev_listener(uuid_t group_id) = 0; + virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; - virtual std::string lookup_peer(uuid_t uuid) const = 0; + // Given the uuid of the peer, get their address and port + virtual std::pair< std::string, uint16_t > lookup_peer(replica_id_t uuid) const = 0; - virtual uint16_t lookup_port() const = 0; + // Get the current application/server repl uuid + virtual replica_id_t get_my_repl_id() const = 0; }; } // namespace homestore diff --git a/src/include/homestore/superblk_handler.hpp b/src/include/homestore/superblk_handler.hpp index f76262ef5..25041dbc4 100644 --- a/src/include/homestore/superblk_handler.hpp +++ b/src/include/homestore/superblk_handler.hpp @@ -49,7 +49,7 @@ class superblk { return m_sb; } - T* create(uint32_t size) { + T* create(uint32_t size = sizeof(T)) { if (meta_service().is_aligned_buf_needed(size)) { auto al_sz = meta_service().align_size(); m_raw_buf = sisl::make_byte_array(uint32_cast(sisl::round_up(size, al_sz)), al_sz, sisl::buftag::metablk); diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 9b899e97b..822ca6566 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -231,4 +231,6 @@ uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); } +uint32_t BlkDataService::get_align_size() const { return m_vdev->align_size(); } + } // namespace homestore diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index e299ee8f9..8ea23aba6 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -189,6 +189,36 @@ table MetaBlkStore { sanity_check_interval: uint32 = 10 (hotswap); } +table Consensus { + // Backoff for any rpc failure + rpc_backoff_ms: uint32 = 250; 
+ + // Frequency of Raft heartbeat + heartbeat_period_ms: uint32 = 250; + + // Re-election timeout low and high mark + elect_to_low_ms: uint32 = 900; + elect_to_high_ms: uint32 = 1400; + + // When a new member is being synced, the number of logs to be shipped per batch + log_sync_batch_size: int32 = 100; + + // Log distance with which snapshot/compact needs to happen. 0 means snapshot is disabled + snapshot_freq_distance: int32 = 0; + + // Max append batch size + max_append_batch_size: int32 = 64; + + // Threshold of log gap from leader to consider a replica as stale + stale_log_gap_hi_threshold: int32 = 200; + + // Threshold of log gap from leader to consider a replica as having come out of stale and become fresh + stale_log_gap_lo_threshold: int32 = 30; + + // Minimum log gap a replica has to be from leader before joining the replica set. + min_log_gap_to_join: int32 = 30; +} + table HomeStoreSettings { version: uint32 = 1; generic: Generic; @@ -199,6 +229,7 @@ table HomeStoreSettings { logstore: LogStore; resource_limits: ResourceLimits; metablk: MetaBlkStore; + consensus: Consensus; } root_type HomeStoreSettings; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index bf91f1a79..fd6d21570 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -231,6 +231,11 @@ void HomeStore::shutdown() { LOGINFO("Homestore shutdown is started"); + if (has_repl_data_service()) { + s_cast< GenericReplService* >(m_repl_service.get())->stop(); + m_repl_service.reset(); + } + if (has_index_service()) { m_index_service->stop(); // m_index_service.reset(); @@ -248,10 +253,6 @@ void HomeStore::shutdown() { if (has_data_service()) { m_data_service.reset(); } - if (has_repl_data_service()) { - s_cast< GenericReplService* >(m_repl_service.get())->stop(); - m_repl_service.reset(); - } m_dev_mgr->close_devices(); m_dev_mgr.reset(); m_cp_mgr->shutdown(); diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index 39e8ff162..d27e53501 100644 ---
a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -236,18 +236,10 @@ int64_t LogDev::append_async(const logstore_id_t store_id, const logstore_seq_nu } log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_record_header) { - static thread_local sisl::aligned_unique_ptr< uint8_t, sisl::buftag::logread > read_buf; + auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); + m_vdev->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); - // First read the offset and read the log_group. Then locate the log_idx within that and get the actual data - // Read about 4K of buffer - if (!read_buf) { - read_buf = sisl::aligned_unique_ptr< uint8_t, sisl::buftag::logread >::make_sized(m_flush_size_multiple, - initial_read_size); - } - auto rbuf = read_buf.get(); - m_vdev->sync_pread(rbuf, initial_read_size, key.dev_offset); - - auto* header = r_cast< const log_group_header* >(rbuf); + auto* header = r_cast< const log_group_header* >(buf->cbytes()); HS_REL_ASSERT_EQ(header->magic_word(), LOG_GROUP_HDR_MAGIC, "Log header corrupted with magic mismatch!"); HS_REL_ASSERT_EQ(header->get_version(), log_group_header::header_version, "Log header version mismatch!"); HS_REL_ASSERT_LE(header->start_idx(), key.idx, "log key offset does not match with log_idx"); @@ -257,44 +249,30 @@ log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_rec // We can only do crc match in read if we have read all the blocks. We don't want to aggressively read more data // than we need to just to compare CRC for read operation. It can be done during recovery. 
if (header->total_size() <= initial_read_size) { - crc32_t const crc = crc32_ieee(init_crc32, reinterpret_cast< const uint8_t* >(rbuf) + sizeof(log_group_header), + crc32_t const crc = crc32_ieee(init_crc32, (buf->cbytes() + sizeof(log_group_header)), header->total_size() - sizeof(log_group_header)); HS_REL_ASSERT_EQ(header->this_group_crc(), crc, "CRC mismatch on read data"); } - auto record_header = header->nth_record(key.idx - header->start_log_idx); uint32_t const data_offset = (record_header->offset + (record_header->get_inlined() ? 0 : header->oob_data_offset)); - sisl::byte_array b = sisl::make_byte_array(uint32_cast(record_header->size)); - if ((data_offset + b->size()) < initial_read_size) { - std::memcpy(static_cast< void* >(b->bytes()), static_cast< const void* >(rbuf + data_offset), - b->size()); // Already read them enough, copy the data + sisl::byte_view ret_view; + if ((data_offset + record_header->size) < initial_read_size) { + ret_view = sisl::byte_view{buf, data_offset, record_header->size}; } else { - // Round them data offset to dma boundary in-order to make sure pread on direct io succeed. 
We need to skip - // the rounded portion while copying to user buffer auto const rounded_data_offset = sisl::round_down(data_offset, m_vdev->align_size()); - auto const rounded_size = sisl::round_up(b->size() + data_offset - rounded_data_offset, m_vdev->align_size()); - - // Allocate a fresh aligned buffer, if size cannot fit standard size - if (rounded_size > initial_read_size) { - rbuf = hs_utils::iobuf_alloc(rounded_size, sisl::buftag::logread, m_vdev->align_size()); - } - - /* THIS_LOGDEV_LOG(TRACE, - "Addln read as data resides outside initial_read_size={} key.idx={} - key.group_dev_offset={} " "data_offset={} size={} rounded_data_offset={} rounded_size={}", initial_read_size, - key.idx, key.dev_offset, data_offset, b.size(), rounded_data_offset, rounded_size); */ - m_vdev->sync_pread(rbuf, rounded_size, key.dev_offset + rounded_data_offset); - std::memcpy(static_cast< void* >(b->bytes()), - static_cast< const void* >(rbuf + data_offset - rounded_data_offset), b->size()); - - // Free the buffer in case we allocated above - if (rounded_size > initial_read_size) { hs_utils::iobuf_free(rbuf, sisl::buftag::logread); } + auto const rounded_size = + sisl::round_up(record_header->size + data_offset - rounded_data_offset, m_vdev->align_size()); + auto new_buf = sisl::make_byte_array(rounded_size, m_vdev->align_size(), sisl::buftag::logread); + m_vdev->sync_pread(new_buf->bytes(), rounded_size, key.dev_offset + rounded_data_offset); + ret_view = sisl::byte_view{new_buf, s_cast< uint32_t >(data_offset - rounded_data_offset), record_header->size}; } + return_record_header = serialized_log_record(record_header->size, record_header->offset, record_header->get_inlined(), record_header->store_seq_num, record_header->store_id); - return log_buffer{b}; + + return ret_view; } logstore_id_t LogDev::reserve_store_id() { diff --git a/src/lib/replication/CMakeLists.txt b/src/lib/replication/CMakeLists.txt index 7d3587e5e..448d14adc 100644 --- a/src/lib/replication/CMakeLists.txt 
+++ b/src/lib/replication/CMakeLists.txt @@ -3,29 +3,26 @@ include (${CMAKE_SOURCE_DIR}/cmake/test_mode.cmake) include_directories (BEFORE ..) include_directories (BEFORE .) +list(APPEND SCHEMA_FLAGS "--scoped-enums" "--gen-name-strings" "--cpp-std=c++17" "--cpp-static-reflection" "--reflect-names") + flatbuffers_generate_headers( TARGET hs_replication_fb - SCHEMAS rpc/push_data_rpc.fbs - FLAGS "--cpp" + SCHEMAS push_data_rpc.fbs + FLAGS ${SCHEMA_FLAGS} ) -#build_flatbuffers( -# rpc/push_data_rpc.fbs -# "" -# hs_replication_fb -# "" -# "${CMAKE_CURRENT_BINARY_DIR}/generated/" -# "" -# "" -# ) -#target_link_libraries(hs_replication_fb ${COMMON_DEPS}) add_library(hs_replication OBJECT) target_sources(hs_replication PRIVATE service/generic_repl_svc.cpp + service/raft_repl_service.cpp repl_dev/solo_repl_dev.cpp repl_dev/common.cpp + repl_dev/raft_repl_dev.cpp + repl_dev/raft_state_machine.cpp + log_store/repl_log_store.cpp + log_store/home_raft_log_store.cpp ) -target_link_libraries(hs_replication PRIVATE ${COMMON_DEPS} hs_replication_fb) +target_link_libraries(hs_replication PRIVATE ${COMMON_DEPS} hs_common hs_replication_fb) #set(FLATBUFFERS_FLATC_EXECUTABLE ${flatbuffers_LIB_DIRS}/../bin/flatc) #flatbuffer_gen_cpp(${FLATBUFFERS_FLATC_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/generated/ hs_replication rpc/push_data_rpc.fbs rpc/fetch_data_rpc.fbs) diff --git a/src/lib/replication/fetch_data_rpc.fbs b/src/lib/replication/fetch_data_rpc.fbs new file mode 100644 index 000000000..d73d4dd1f --- /dev/null +++ b/src/lib/replication/fetch_data_rpc.fbs @@ -0,0 +1,34 @@ +namespace homestore; + +table RequestEntry { + lsn : int64; // LSN of the raft log if known + raft_term : uint64; // Raft term number + dsn : uint64; // Data Sequence number + user_header: [ubyte]; // User header bytes + user_key : [ubyte]; // User key data + blkid_originator : int32; // Originally which replica's blkid is this + remote_blkid : [ubyte]; // Serialized remote blkid +} + +table FetchDataRequest { + 
entries : [RequestEntry]; // Array of request entries +} + +table ResponseEntry { + lsn : [int64]; // LSN of the raft log if known + dsn : uint64; // Data Sequence number + raft_term : uint64; // Raft term number + data_size : uint32; // Size of the data which is sent as separate non flatbuffer +} + +table FetchDataResponse { + issuer_replica_id : int32; // Replica id of the issuer + entries : [ResponseEntry]; // Array of request entries +} + +table FetchData { + request : FetchDataRequest; + response : FetchDataResponse; +} + +root_type FetchData; \ No newline at end of file diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp new file mode 100644 index 000000000..bb3b7d2fb --- /dev/null +++ b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -0,0 +1,267 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ + +#include "home_raft_log_store.h" +#include "storage_engine_buffer.h" +#include + +using namespace homestore; + +SISL_LOGGING_DECL(replication) + +#define REPL_STORE_LOG(level, msg, ...) \ + LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... 
args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("replstore", m_logstore_id)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + msg, ##__VA_ARGS__); + +namespace homestore { +static constexpr store_lsn_t to_store_lsn(uint64_t raft_lsn) { return s_cast< store_lsn_t >(raft_lsn) - 1; } +static constexpr store_lsn_t to_store_lsn(repl_lsn_t repl_lsn) { return repl_lsn - 1; } +static constexpr repl_lsn_t to_repl_lsn(store_lsn_t store_lsn) { return store_lsn + 1; } + +static nuraft::ptr< nuraft::log_entry > to_nuraft_log_entry(const log_buffer& log_bytes) { + uint8_t const* raw_ptr = log_bytes.bytes(); + uint64_t term = *r_cast< uint64_t const* >(raw_ptr); + raw_ptr += sizeof(uint64_t); + nuraft::log_val_type type = static_cast< nuraft::log_val_type >(*raw_ptr); + raw_ptr += sizeof(nuraft::log_val_type); + + size_t data_len = log_bytes.size() - sizeof(uint64_t) - sizeof(nuraft::log_val_type); + auto nb = nuraft::buffer::alloc(data_len); + nb->put_raw(raw_ptr, data_len); + return nuraft::cs_new< nuraft::log_entry >(term, nb, type); +} + +static uint64_t extract_term(const log_buffer& log_bytes) { + uint8_t const* raw_ptr = log_bytes.bytes(); + return (*r_cast< uint64_t const* >(raw_ptr)); +} + +HomeRaftLogStore::HomeRaftLogStore(logstore_id_t logstore_id) { + m_dummy_log_entry = nuraft::cs_new< nuraft::log_entry >(0, nuraft::buffer::alloc(0), nuraft::log_val_type::app_log); + + if (logstore_id == UINT32_MAX) { + m_log_store = logstore_service().create_new_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, true); + if (!m_log_store) { throw std::runtime_error("Failed to create log store"); } + m_logstore_id = m_log_store->get_store_id(); + LOGDEBUGMOD(replication, "Opened new 
home log store id={}", m_logstore_id); + } else { + m_logstore_id = logstore_id; + LOGDEBUGMOD(replication, "Opening existing home log store id={}", logstore_id); + logstore_service().open_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, logstore_id, true, + [this](shared< HomeLogStore > log_store) { + m_log_store = std::move(log_store); + DEBUG_ASSERT_EQ(m_logstore_id, m_log_store->get_store_id(), + "Mismatch in passed and create logstore id"); + REPL_STORE_LOG(DEBUG, "Home Log store created/opened successfully"); + }); + } +} + +void HomeRaftLogStore::remove_store() { + REPL_STORE_LOG(DEBUG, "Logstore is being physically removed"); + logstore_service().remove_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, m_logstore_id); + m_log_store.reset(); +} + +ulong HomeRaftLogStore::next_slot() const { + uint64_t next_slot = to_repl_lsn(m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn)) + 1; + REPL_STORE_LOG(DEBUG, "next_slot()={}", next_slot); + return next_slot; +} + +ulong HomeRaftLogStore::start_index() const { + // start_index starts from 1. 
+ ulong start_index = std::max((repl_lsn_t)1, to_repl_lsn(m_log_store->truncated_upto()) + 1); + REPL_STORE_LOG(DEBUG, "start_index()={}", start_index); + return start_index; +} + +nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::last_entry() const { + store_lsn_t max_seq = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + REPL_STORE_LOG(DEBUG, "last_entry() store seqnum={}", max_seq); + if (max_seq < 0) { return m_dummy_log_entry; } + + nuraft::ptr< nuraft::log_entry > nle; + try { + auto log_bytes = m_log_store->read_sync(max_seq); + nle = to_nuraft_log_entry(log_bytes); + } catch (const std::exception& e) { + REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}", max_seq); + throw e; + } + + return nle; +} + +ulong HomeRaftLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { + REPL_STORE_LOG(TRACE, "append entry term={}, log_val_type={} size={}", entry->get_term(), + static_cast< uint32_t >(entry->get_val_type()), entry->get_buf().size()); + auto buf = entry->serialize(); + return append(buf); +} + +ulong HomeRaftLogStore::append(raft_buf_ptr_t& buffer) { + auto next_seq = m_log_store->append_async( + sisl::io_blob{buffer->data_begin(), uint32_cast(buffer->size()), false /* is_aligned */}, nullptr /* cookie */, + [buffer](int64_t, sisl::io_blob&, logdev_key, void*) {}); + return to_repl_lsn(next_seq); +} + +void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) { + auto buf = entry->serialize(); + write_at(index, buf); +} + +void HomeRaftLogStore::write_at(ulong index, raft_buf_ptr_t& buffer) { + m_log_store->rollback_async(to_store_lsn(index) - 1, nullptr); + // we need to reset the durable lsn, because its ok to set to lower number as it will be updated on next flush + // calls, but it is dangerous to set higher number. 
+ m_last_durable_lsn = -1; + append(buffer); +} + +void HomeRaftLogStore::end_of_append_batch(ulong start, ulong cnt) { + store_lsn_t end_lsn = to_store_lsn(start + cnt - 1); + m_log_store->flush_sync(end_lsn); + m_last_durable_lsn = end_lsn; +} + +nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore::log_entries(ulong start, ulong end) { + auto out_vec = std::make_shared< std::vector< nuraft::ptr< nuraft::log_entry > > >(); + m_log_store->foreach (to_store_lsn(start), [end, &out_vec](store_lsn_t cur, const log_buffer& entry) -> bool { + bool ret = (cur < to_store_lsn(end) - 1); + if (cur < to_store_lsn(end)) { out_vec->emplace_back(to_nuraft_log_entry(entry)); } + return ret; + }); + return out_vec; +} + +nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { + nuraft::ptr< nuraft::log_entry > nle; + try { + auto log_bytes = m_log_store->read_sync(to_store_lsn(index)); + nle = to_nuraft_log_entry(log_bytes); + } catch (const std::exception& e) { + REPL_STORE_LOG(ERROR, "entry_at({}) index out_of_range", index); + throw e; + } + return nle; +} + +ulong HomeRaftLogStore::term_at(ulong index) { + ulong term; + try { + auto log_bytes = m_log_store->read_sync(to_store_lsn(index)); + term = extract_term(log_bytes); + } catch (const std::exception& e) { + REPL_STORE_LOG(ERROR, "term_at({}) index out_of_range", index); + throw e; + } + return term; +} + +raft_buf_ptr_t HomeRaftLogStore::pack(ulong index, int32_t cnt) { + static constexpr size_t estimated_record_size = 128; + size_t estimated_size = cnt * estimated_record_size + sizeof(uint32_t); + + // << Format >> + // # records (N) 4 bytes + // +--- + // | log length (X) 4 bytes + // | log data X bytes + // +--- repeat N + raft_buf_ptr_t out_buf = nuraft::buffer::alloc(estimated_size); + out_buf->put(cnt); + + int32_t remain_cnt = cnt; + m_log_store->foreach ( + to_store_lsn(index), + [this, &out_buf, &remain_cnt]([[maybe_unused]] store_lsn_t cur, const log_buffer& entry) 
mutable -> bool { + if (remain_cnt-- > 0) { + // put(bytes, len) writes a 4-byte length prefix before the payload, so both must fit + size_t const needed = entry.size() + sizeof(uint32_t); + if ((out_buf->size() - out_buf->pos()) < needed) { + // Grow to at least double the size, and always enough to hold this record + out_buf = nuraft::buffer::expand(*out_buf, std::max(out_buf->size() * 2, out_buf->pos() + needed)); + } + REPL_STORE_LOG(TRACE, "packing lsn={} of size={}, avail_size in buffer={}", to_repl_lsn(cur), + entry.size(), out_buf->size() - out_buf->pos()); + out_buf->put(entry.bytes(), entry.size()); + } + return (remain_cnt > 0); + }); + return out_buf; +} + +// Unpacks `pack` (record count followed by length-prefixed records) and appends its records starting at raft +// `index`: rolls back first when `index` is behind the next slot, pads with dummy entries when it is ahead, then +// flushes up to the last appended record. +void HomeRaftLogStore::apply_pack(ulong index, nuraft::buffer& pack) { + pack.pos(0); + auto num_entries = pack.get_int(); + + auto slot = next_slot(); + if (index < slot) { + // We are asked to apply/insert data behind next slot, so we must rollback before index and then append + m_log_store->rollback_async(to_store_lsn(index) - 1, nullptr); + } else if (index > slot) { + // We are asked to apply/insert data after next slot, so we need to fill in with dummy entries upto the slot + // before append the entries + REPL_STORE_LOG(WARN, + "RaftLogStore is asked to apply pack on lsn={}, but current lsn={} is behind, will be filling " + "with dummy data to make it functional, however, this could result in inconsistent data", + index, to_store_lsn(slot)); + // Pad the gap [slot, index) with a separate counter; `index` must stay intact for the flush below + for (auto fill_slot = slot; fill_slot < index; ++fill_slot) { + append(m_dummy_log_entry); + } + } + + for (int i{0}; i < num_entries; ++i) { + size_t entry_len; + auto* entry = const_cast< nuraft::byte* >(pack.get_bytes(entry_len)); + [[maybe_unused]] auto store_sn = + m_log_store->append_async(sisl::io_blob{entry, uint32_cast(entry_len), false}, nullptr, nullptr); + REPL_STORE_LOG(TRACE, "unpacking nth_entry={} of size={}, lsn={}", i + 1, entry_len, to_repl_lsn(store_sn)); + } + m_log_store->flush_sync(to_store_lsn(index) + num_entries - 1); +} + +bool HomeRaftLogStore::compact(ulong compact_lsn) { + auto cur_max_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + if (cur_max_lsn < to_store_lsn(compact_lsn)) { + // We need to fill the remaining
entries with dummy data. + for (auto lsn{cur_max_lsn + 1}; lsn <= to_store_lsn(compact_lsn); ++lsn) { + append(m_dummy_log_entry); + } + } + m_log_store->flush_sync(to_store_lsn(compact_lsn)); + m_log_store->truncate(to_store_lsn(compact_lsn)); + return true; +} + +bool HomeRaftLogStore::flush() { + m_log_store->flush_sync(); + return true; +} + +ulong HomeRaftLogStore::last_durable_index() { + m_last_durable_lsn = m_log_store->get_contiguous_completed_seq_num(m_last_durable_lsn); + return to_repl_lsn(m_last_durable_lsn); +} +} // namespace homestore diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h new file mode 100644 index 000000000..c49cef310 --- /dev/null +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -0,0 +1,180 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#pragma once + +#include +#include + +#if defined __clang__ or defined __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif +#include +#if defined __clang__ or defined __GNUC__ +#pragma GCC diagnostic pop +#endif +#undef auto_lock + +namespace homestore { + +using store_lsn_t = int64_t; +using repl_lsn_t = int64_t; +using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; + +class HomeRaftLogStore : public nuraft::log_store { +public: + HomeRaftLogStore(homestore::logstore_id_t logstore_id = UINT32_MAX); + virtual ~HomeRaftLogStore() = default; + + void remove_store(); + + /** + * The first available slot of the store, starts with 1. + * + * @return Last log index number + 1 + */ + virtual ulong next_slot() const override; + + /** + * The start index of the log store, at the very beginning, it must be 1. + * However, after some compact actions, this could be anything + * greater or equals to one. + * + * @return Starting log index number. + */ + virtual ulong start_index() const override; + + /** + * The last log entry in store. + * + * @return If no log entry exists: a dummy constant entry with + * value set to null and term set to zero. + */ + virtual nuraft::ptr< nuraft::log_entry > last_entry() const override; + + /** + * Append a log entry to store + * + * @param entry Log entry + * @return Log index number. + */ + virtual ulong append(nuraft::ptr< nuraft::log_entry >& entry) override; + + // An alternate method on entries already serialized into the raft buffer + ulong append(raft_buf_ptr_t& buffer); + + /** + * Overwrite a log entry at the given `index`. + * + * @param index Log index number to overwrite. + * @param entry New log entry to overwrite. 
+ */ + virtual void write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) override; + + // An alternate method on entries already serialized into the raft buffer + void write_at(ulong index, raft_buf_ptr_t& buffer); + + /** + * Invoked after a batch of logs is written as a part of + * a single append_entries request. + * + * @param start The start log index number (inclusive) + * @param cnt The number of log entries written. + */ + virtual void end_of_append_batch(ulong start, ulong cnt) override; + + /** + * Get log entries with index [start, end). + * + * @param start The start log index number (inclusive). + * @param end The end log index number (exclusive). + * @return The log entries between [start, end). + */ + virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > log_entries(ulong start, ulong end) override; + + /** + * Get the log entry at the specified log index number. + * + * @param index Should be equal to or greater than 1. + * @return The log entry or null if index >= this->next_slot(). + */ + virtual nuraft::ptr< nuraft::log_entry > entry_at(ulong index) override; + + /** + * Get the term for the log entry at the specified index + * Suggest to stop the system if the index >= this->next_slot() + * + * @param index Should be equal to or greater than 1. + * @return The term for the specified log entry, or + * 0 if index < this->start_index(). + */ + virtual ulong term_at(ulong index) override; + + /** + * Pack cnt log items starts from index + * + * @param index The start log index number (inclusive). + * @param cnt The number of logs to pack. + * @return log pack + */ + virtual raft_buf_ptr_t pack(ulong index, int32_t cnt) override; + + /** + * Apply the log pack to current log store, starting from index. + * + * @param index The start log index number (inclusive). 
+ * @param pack Log pack to apply, in the format produced by pack(). + */ + virtual void apply_pack(ulong index, nuraft::buffer& pack) override; + + /** + * Compact the log store by purging all log entries, + * including the log at the last_log_index. + * + * If current max log idx is smaller than given `last_log_index`, + * set start log idx to `last_log_index + 1`. + * + * @param last_log_index Log index number that will be purged up to (inclusive). + * @return True on success. + */ + virtual bool compact(ulong last_log_index) override; + + /** + * Synchronously flush all log entries in this log store to the backing storage + * so that all log entries are guaranteed to be durable upon process crash. + * + * @return `true` on success. + */ + virtual bool flush() override; + + /** + * This API is used only when `raft_params::parallel_log_appending_` flag is set. + * Please refer to the comment of the flag. + * + * NOTE: In homestore replication use cases, we use this even when the parallel_log_appending_ flag is not set. + * + * @return The last durable log index.
+ */ + virtual ulong last_durable_index() override; + + logstore_id_t logstore_id() const { return m_logstore_id; } + +private: + logstore_id_t m_logstore_id; + shared< HomeLogStore > m_log_store; + nuraft::ptr< nuraft::log_entry > m_dummy_log_entry; + store_lsn_t m_last_durable_lsn{-1}; +}; +} // namespace homestore \ No newline at end of file diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp new file mode 100644 index 000000000..84a12925d --- /dev/null +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -0,0 +1,70 @@ +#include +#include "replication/log_store/repl_log_store.h" +#include "replication/repl_dev/raft_state_machine.h" +#include "replication/repl_dev/raft_repl_dev.h" +#include "replication/repl_dev/common.h" + +namespace homestore { + +uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { + repl_req_ptr_t rreq = m_sm.transform_journal_entry(entry); + ulong lsn; + if (rreq) { + lsn = HomeRaftLogStore::append(rreq->raft_journal_buf()); + m_sm.link_lsn_to_req(rreq, int64_cast(lsn)); + RD_LOG(INFO, "Raft Channel: Received log entry rreq=[{}]", rreq->to_compact_string()); + } else { + lsn = HomeRaftLogStore::append(entry); + } + return lsn; +} + +void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) { + repl_req_ptr_t rreq = m_sm.transform_journal_entry(entry); + if (rreq) { + HomeRaftLogStore::write_at(index, rreq->raft_journal_buf()); + m_sm.link_lsn_to_req(rreq, int64_cast(index)); + RD_LOG(INFO, "Raft Channel: Received log entry rreq=[{}]", rreq->to_compact_string()); + } else { + HomeRaftLogStore::write_at(index, entry); + } +} + +void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { + // Skip this call in leader, since this method will synchronously flush the data, which is not required for + // leader. 
Leader will call the flush as part of commit after receiving quorum, upon which time, there is a high + // possibility the log entry is already flushed. + if (!m_rd.is_leader()) { + int64_t end_lsn = int64_cast(start_lsn + count - 1); + + // Start fetching the batch of data for this lsn range from remote if it's not available yet. + auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); + for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { + reqs->emplace_back(m_sm.lsn_to_req(lsn)); + } + + // Check the map if data corresponding to all of these requests has been received and written. If not, schedule + // a fetch and write. Once all requests are completed and written, these requests are popped out of the map and + // the future will be ready. + auto fut = m_rd.notify_after_data_written(reqs); + + // In the meanwhile, we can flush the journal for this lsn batch. It is ok to flush the entries in log before + // actual data is written, because, even if we have the log, it doesn't mean data is committed, until state + // machine reports that. This way the flush and fetch both can run in parallel. + HomeRaftLogStore::end_of_append_batch(start_lsn, count); + + // Wait for the fetch and write to be completed successfully.
+ std::move(fut).get(); + + // Mark all the pbas also completely written + for (auto const& rreq : *reqs) { + if (rreq) { rreq->state.fetch_or(uint32_cast(repl_req_state_t::LOG_FLUSHED)); } + } + + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + } +} + +std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } + +} // namespace homestore diff --git a/src/lib/replication/log_store/repl_log_store.h b/src/lib/replication/log_store/repl_log_store.h new file mode 100644 index 000000000..c2fb615f2 --- /dev/null +++ b/src/lib/replication/log_store/repl_log_store.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include "replication/log_store/home_raft_log_store.h" + +namespace homestore { + +class RaftReplDev; +class RaftStateMachine; + +class ReplLogStore : public HomeRaftLogStore { +private: + RaftReplDev& m_rd; + RaftStateMachine& m_sm; + std::mutex m_batch_mtx; + std::condition_variable m_batch_cv; + int64_t m_batch_lsn{0}; + +public: + template < typename... Args > + ReplLogStore(RaftReplDev& rd, RaftStateMachine& sm, Args&&... args) : + HomeRaftLogStore{std::forward< Args >(args)...}, m_rd{rd}, m_sm{sm} {} + + uint64_t append(nuraft::ptr< nuraft::log_entry >& entry) override; + void write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) override; + void end_of_append_batch(ulong start_lsn, ulong count) override; + +private: + std::string rdev_name() const; +}; + +} // namespace homestore diff --git a/src/lib/replication/log_store/storage_engine_buffer.h b/src/lib/replication/log_store/storage_engine_buffer.h new file mode 100644 index 000000000..87bd90b11 --- /dev/null +++ b/src/lib/replication/log_store/storage_engine_buffer.h @@ -0,0 +1,251 @@ +#pragma once + +#if defined(WIN32) || defined(_WIN32) +#error "Unsupported platform, POSIX only!" 
+#endif + +extern "C" { +#include +} + +#include +#include +#include +#include +#include + +// Copied from NuKV JungleDB project written by Jung-Sang Ahn and modified by Harihara Kadayam + +namespace homestore { + +struct SEBuf { + /** + * Empty buffer. + */ + SEBuf() = default; + + /** + * Reference to given address. + */ + SEBuf(size_t _len, const void* _buf) : len(_len), buf((void*)_buf) {} + + /** + * Reference to given string object. + */ + SEBuf(const std::string& str) : len(str.size()), buf((void*)str.data()) {} + + /** + * Allocate own memory. + * If given length is 0, it will return an empty buffer. + */ + static SEBuf alloc(size_t _len) { + if (!_len) return SEBuf(); + return SEBuf(_len, malloc(_len)); + } + + /** + * Free own memory. + */ + inline void free() { + ::free(buf); + clear(); + } + + /** + * Clear internal pointer without free. + * User is responsible for managing memory to avoid memory leak. + */ + inline void clear() { + buf = nullptr; + len = 0; + } + + /** + * Return `true` if this buffer is empty. + */ + inline bool empty() const { return (buf == nullptr); } + + /** + * Return the size of this buffer. + */ + inline size_t size() const { return len; } + + /** + * Return the pointer to the data of this buffer. + */ + inline void* data() const { return buf; } + + /** + * Create a std::string object that is clone of this buffer. + */ + inline std::string toString() const { return std::string((const char*)buf, len); } + + /** + * Return a string replacing non-readable character with `.`. + * The max length of string will be upto given `limit`. + */ + std::string rStr(size_t limit = 16) const; + + /** + * Move ownership of data to given buffer `dst`. + */ + inline void moveTo(SEBuf& dst) { + dst = *this; + clear(); + } + + /** + * Make a copy of data and set it to given buffer `dst`. 
+ */ + inline void copyTo(SEBuf& dst) const { + dst = alloc(len); + if (len) { memcpy(dst.buf, buf, len); } + } + + size_t len{0}; + void* buf{nullptr}; + + /** + * To easily free buffer (to avoid memory leak by mistake), + * similar to `std::lock_guard`. + */ + struct AutoFree { + AutoFree(SEBuf& buf) : bufToHold(buf) {} + ~AutoFree() { bufToHold.free(); } + SEBuf& bufToHold; + }; +}; +using SEBufHolder = SEBuf::AutoFree; + +struct SEBufSerializer { + SEBufSerializer(const SEBuf& _buf) : buf(_buf), offset(0), errHappened(false) {} + + inline bool isValid(size_t len) { + if (errHappened || len + pos() > buf.len) { + errHappened = true; + return false; + } + return true; + } + + inline bool ok() const { return !errHappened; } + + inline void pos(size_t _pos) { + assert(_pos <= buf.len); + offset = _pos; + } + + inline size_t pos() const { return offset; } + + inline void clearError() { errHappened = false; } + + inline void* data() { + uint8_t* ptr = (uint8_t*)buf.buf; + return ptr + pos(); + } + + inline void putU64(uint64_t val) { + if (!isValid(sizeof(val))) return; + uint64_t u64 = htobe64(val); + memcpy(data(), &u64, sizeof(u64)); + pos(pos() + sizeof(u64)); + } + + inline void putU32(uint32_t val) { + if (!isValid(sizeof(val))) return; + uint32_t u32 = htobe32(val); + memcpy(data(), &u32, sizeof(u32)); + pos(pos() + sizeof(u32)); + } + + inline void putU16(uint16_t val) { + if (!isValid(sizeof(val))) return; + uint16_t u16 = htobe16(val); + memcpy(data(), &u16, sizeof(u16)); + pos(pos() + sizeof(u16)); + } + + inline void putU8(uint8_t val) { + if (!isValid(sizeof(val))) return; + memcpy(data(), &val, sizeof(val)); + pos(pos() + sizeof(val)); + } + + inline void putRaw(size_t len, const void* src) { + memcpy(data(), src, len); + pos(pos() + len); + } + + inline void put(size_t len, const void* src) { + putU32(len); + if (!isValid(len)) return; + putRaw(len, src); + } + + inline void putString(const std::string& str) { put(str.size(), str.data()); } + + inline 
void putSEBuf(const SEBuf& buf) { put(buf.len, buf.buf); } + + inline uint64_t getU64() { + if (!isValid(sizeof(uint64_t))) return 0; + uint64_t u64; + memcpy(&u64, data(), sizeof(u64)); + pos(pos() + sizeof(u64)); + return be64toh(u64); + } + + inline uint32_t getU32() { + if (!isValid(sizeof(uint32_t))) return 0; + uint32_t u32; + memcpy(&u32, data(), sizeof(u32)); + pos(pos() + sizeof(u32)); + return be32toh(u32); + } + + inline uint16_t getU16() { + if (!isValid(sizeof(uint16_t))) return 0; + uint16_t u16; + memcpy(&u16, data(), sizeof(u16)); + pos(pos() + sizeof(u16)); + return be16toh(u16); + } + + inline uint8_t getU8() { + if (!isValid(sizeof(uint8_t))) return 0; + uint8_t u8; + memcpy(&u8, data(), sizeof(u8)); + pos(pos() + sizeof(u8)); + return u8; + } + + inline void* getRaw(size_t len) { + void* _data = data(); + pos(pos() + len); + return _data; + } + + inline void* get(size_t& len) { + len = getU32(); + if (!isValid(len)) return nullptr; + return getRaw(len); + } + + inline std::string getString() { + size_t _len; + void* _data = get(_len); + if (!_data) return std::string(); + return std::string((const char*)_data, _len); + } + + inline SEBuf getSEBuf() { + size_t _len; + void* _data = get(_len); + return SEBuf(_len, _data); + } + + const SEBuf& buf; + size_t offset; + bool errHappened; +}; + +} // namespace homestore diff --git a/src/lib/replication/push_data_rpc.fbs b/src/lib/replication/push_data_rpc.fbs new file mode 100644 index 000000000..0bf4ce896 --- /dev/null +++ b/src/lib/replication/push_data_rpc.fbs @@ -0,0 +1,13 @@ +native_include "boost/uuid/uuid.hpp"; +namespace homestore; + +table PushDataRequest { + issuer_replica_id : int32; // Replica id of the issuer + raft_term : uint64; // Raft term number + dsn : uint64; // Data Sequence number + user_header: [ubyte]; // User header bytes + user_key : [ubyte]; // User key data + data_size : uint32; // Data size, actual data is sent as separate blob not by flatbuffer +} + +root_type 
PushDataRequest; \ No newline at end of file diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp new file mode 100644 index 000000000..db5540d61 --- /dev/null +++ b/src/lib/replication/repl_dev/common.cpp @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include "replication/repl_dev/common.h" +#include + +namespace homestore { + +void repl_req_ctx::alloc_journal_entry(uint32_t size, bool is_raft_buf) { + if (is_raft_buf) { + journal_buf = nuraft::buffer::alloc(size); + journal_entry = new (raft_journal_buf()->data_begin()) repl_journal_entry(); + } else { + journal_buf = std::unique_ptr< uint8_t[] >(new uint8_t[size]); + journal_entry = new (raw_journal_buf()) repl_journal_entry(); + } +} + +repl_req_ctx::~repl_req_ctx() { + if (journal_entry) { journal_entry->~repl_journal_entry(); } +} + +raft_buf_ptr_t& repl_req_ctx::raft_journal_buf() { return std::get< raft_buf_ptr_t >(journal_buf); } +uint8_t* repl_req_ctx::raw_journal_buf() { return std::get< std::unique_ptr< uint8_t[] > >(journal_buf).get(); } + +// Renders the request state bitmask as the names of the set flags joined by " | " ("INIT" when it equals INIT). +static std::string req_state_name(uint32_t state) { + if (state == (uint32_t)repl_req_state_t::INIT) { return "INIT"; } + + std::string ret; + // Append `name` when `flag` is set, putting the separator only between names so none is left dangling + auto add_if_set = [&ret, state](repl_req_state_t flag, const char* name) { + if (state & static_cast< uint32_t >(flag)) { + if (!ret.empty()) { ret += " | "; } + ret += name; + } + }; + add_if_set(repl_req_state_t::BLK_ALLOCATED, "BLK_ALLOCATED"); + add_if_set(repl_req_state_t::DATA_RECEIVED, "DATA_RECEIVED"); + add_if_set(repl_req_state_t::DATA_WRITTEN, "DATA_WRITTEN"); + add_if_set(repl_req_state_t::LOG_RECEIVED, "LOG_RECEIVED"); + add_if_set(repl_req_state_t::LOG_FLUSHED, "LOG_FLUSHED"); + return ret; +} + +std::string repl_req_ctx::to_string() const { + return fmt::format( + "repl_key=[{}], lsn={} state=[{}] header_size={} key_size={} is_proposer={} local_blkid={} remote_blkid={}", + rkey.to_string(), lsn, req_state_name(state.load()), header.size(), key.size(), is_proposer, + local_blkid.to_string(), remote_blkid.blkid.to_string()); +}
+ +std::string repl_req_ctx::to_compact_string() const { + return fmt::format("dsn={} term={} lsn={} state={} ref={}", rkey.dsn, rkey.term, lsn, req_state_name(state.load()), + this->use_count()); +} + +} // namespace homestore \ No newline at end of file diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h new file mode 100644 index 000000000..aa6935581 --- /dev/null +++ b/src/lib/replication/repl_dev/common.h @@ -0,0 +1,89 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include + +namespace homestore { +VENUM(journal_type_t, uint16_t, HS_LARGE_DATA = 0, HS_HEADER_ONLY = 1) + +struct repl_journal_entry { + static constexpr uint16_t JOURNAL_ENTRY_MAJOR = 1; + static constexpr uint16_t JOURNAL_ENTRY_MINOR = 1; + + // Major and minor version. For each major version underlying structures could change. Minor versions can only add + // fields, not change any existing fields. 
+ uint16_t major_version{JOURNAL_ENTRY_MAJOR}; + uint16_t minor_version{JOURNAL_ENTRY_MINOR}; + + journal_type_t code; + int32_t server_id; + uint64_t dsn; // Data seq number + uint32_t user_header_size; + uint32_t key_size; + uint32_t value_size; + // Followed by user_header, then key, then MultiBlkId/value + + std::string to_string() const { + return fmt::format("version={}.{}, code={}, server_id={}, dsn={}, header_size={}, key_size={}, value_size={}", + major_version, minor_version, enum_name(code), server_id, dsn, user_header_size, key_size, + value_size); + } + + // Short form of to_string(): only the dsn and the three size fields. The arguments must line up one-to-one + // with the placeholders; fmt silently ignores extras, which would print the wrong fields under these labels. + std::string to_compact_string() const { + return fmt::format("dsn={}, header_size={}, key_size={}, value_size={}", dsn, user_header_size, key_size, + value_size); + } +}; + +#pragma pack(1) +struct repl_dev_superblk { + static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; + static constexpr uint32_t REPL_DEV_SB_VERSION = 1; + + uint64_t magic{REPL_DEV_SB_MAGIC}; + uint32_t version{REPL_DEV_SB_VERSION}; + uuid_t group_id; // group_id of this replica set + logstore_id_t data_journal_id; // Logstore id for the data journal + int64_t commit_lsn; // LSN upto which this replica has committed + int64_t checkpoint_lsn; // LSN upto which this replica have checkpointed the data + uint64_t group_ordinal; // Ordinal number which will be used to indicate the rdevXYZ for debugging + + uint64_t get_magic() const { return magic; } + uint32_t get_version() const { return version; } +}; +#pragma pack() + +template < class V = folly::Unit > +auto make_async_error(ReplServiceError err) { + return folly::makeSemiFuture< ReplResult< V > >(folly::makeUnexpected(err)); +} + +template < class V > +auto make_async_success(V v) { + return folly::makeSemiFuture< ReplResult< V > >(std::move(v)); +} + +template < class V = folly::Unit > +auto make_async_success() { + return folly::makeSemiFuture< ReplResult< folly::Unit > >(folly::Unit{}); +} + +} // namespace
homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp new file mode 100644 index 000000000..fc9d31e6c --- /dev/null +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -0,0 +1,451 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "common/homestore_assert.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" +#include "push_data_rpc_generated.h" + +namespace homestore { +std::atomic< uint64_t > RaftReplDev::s_next_group_ordinal{1}; + +RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >& rd_sb, bool load_existing) : + m_repl_svc{svc}, + m_msg_mgr{svc.msg_manager()}, + m_group_id{rd_sb->group_id}, + m_my_repl_id{svc.get_my_repl_uuid()}, + m_raft_server_id{nuraft_mesg::to_server_id(m_my_repl_id)}, + m_rd_sb{rd_sb} { + m_state_machine = std::make_shared< RaftStateMachine >(*this); + + if (load_existing) { + m_data_journal = std::make_shared< ReplLogStore >(*this, *m_state_machine, m_rd_sb->data_journal_id); + m_next_dsn = m_rd_sb->last_applied_dsn + 1; + m_commit_upto_lsn = m_rd_sb->commit_lsn; + m_last_flushed_commit_lsn = m_commit_upto_lsn; + m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + + // Its ok not to do compare exchange, because loading is always single threaded as of now + if (m_rd_sb->group_ordinal >= s_next_group_ordinal.load()) { + s_next_group_ordinal.store(m_rd_sb->group_ordinal + 1); + } + + if (m_rd_sb->is_timeline_consistent) { + logstore_service().open_log_store(LogStoreService::CTRL_LOG_FAMILY_IDX, m_rd_sb->free_blks_journal_id, + false, [this](shared< HomeLogStore > log_store) { + m_free_blks_journal = std::move(log_store); + m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); + }); + } + } else { + m_data_journal = std::make_shared< ReplLogStore >(*this, *m_state_machine); + m_rd_sb->data_journal_id = 
m_data_journal->logstore_id(); + m_rd_sb->last_applied_dsn = 0; + m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1); + m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + + if (m_rd_sb->is_timeline_consistent) { + m_free_blks_journal = + logstore_service().create_new_log_store(LogStoreService::CTRL_LOG_FAMILY_IDX, false /* append_mode */); + m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); + } + } + + RD_LOG(INFO, "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} commited_lsn={} next_dsn={}", + (load_existing ? "Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, + m_commit_upto_lsn.load(), m_next_dsn.load()); + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + // m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 2)); +} + +void RaftReplDev::use_config(json_superblk raft_config_sb) { m_raft_config_sb = std::move(raft_config_sb); } + +void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, + repl_req_ptr_t rreq) { + if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } + rreq->header = header; + rreq->key = key; + rreq->value = value; + + // If it is header only entry, directly propose to the raft + if (rreq->value.size) { + rreq->rkey = + repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}; + push_data_to_all_followers(rreq); + + // Step 1: Alloc Blkid + auto status = data_service().alloc_blks(uint32_cast(rreq->value.size), + m_listener->get_blk_alloc_hints(rreq->header, rreq->value.size), + rreq->local_blkid); + HS_REL_ASSERT_EQ(status, BlkAllocStatus::SUCCESS); + + // Write the data + data_service().async_write(rreq->value, rreq->local_blkid).thenValue([this, rreq](auto&& err) { + HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to 
return error to the Listener + rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_WRITTEN)); + m_state_machine->propose_to_raft(std::move(rreq)); + }); + } else { + RD_LOG(INFO, "Skipping data channel send since value size is 0"); + rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_WRITTEN)); + m_state_machine->propose_to_raft(std::move(rreq)); + } +} + +void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq) { + auto& builder = rreq->fb_builder; + + // Prepare the rpc request packet with all repl_reqs details + builder.FinishSizePrefixed(CreatePushDataRequest(builder, server_id(), rreq->rkey.term, rreq->rkey.dsn, + builder.CreateVector(rreq->header.cbytes(), rreq->header.size()), + builder.CreateVector(rreq->key.cbytes(), rreq->key.size()), + rreq->value.size)); + + rreq->pkts = sisl::io_blob::sg_list_to_ioblob_list(rreq->value); + rreq->pkts.insert(rreq->pkts.begin(), sisl::io_blob{builder.GetBufferPointer(), builder.GetSize(), false}); + + /*RD_LOG(INFO, "Data Channel: Pushing data to all followers: rreq=[{}] data=[{}]", rreq->to_string(), + flatbuffers::FlatBufferToString(builder.GetBufferPointer() + sizeof(flatbuffers::uoffset_t), + PushDataRequestTypeTable()));*/ + + RD_LOG(INFO, "Data Channel: Pushing data to all followers: rreq=[{}]", rreq->to_compact_string()); + + group_msg_service() + ->data_service_request_unidirectional(nuraft_mesg::role_regex::ALL, PUSH_DATA, rreq->pkts) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, rreq = std::move(rreq)](auto e) { + // Release the buffer which holds the packets + RD_LOG(INFO, "Data Channel: Data push completed for rreq=[{}]", rreq->to_compact_string()); + rreq->fb_builder.Release(); + rreq->pkts.clear(); + }); +} + +void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { + auto const& incoming_buf = rpc_data->request_blob(); + auto const fb_size = + flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + 
sizeof(flatbuffers::uoffset_t); + auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes()); + sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()}; + sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()}; + + RD_LOG(TRACE, "PushData received on data channel: {}", + flatbuffers::FlatBufferToString(incoming_buf.cbytes() + sizeof(flatbuffers::uoffset_t), + PushDataRequestTypeTable())); + + auto rreq = follower_create_req( + repl_key{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()}, + header, key, push_req->data_size()); + rreq->rpc_data = rpc_data; + + RD_LOG(INFO, "Data Channel: Received data rreq=[{}]", rreq->to_compact_string()); + + if (rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_RECEIVED)) & + uint32_cast(repl_req_state_t::DATA_RECEIVED)) { + // We already received the data before, just ignore this data + // TODO: Should we forcibly overwrite the data with new data? + return; + } + + // Get the data portion from the buffer + HS_DBG_ASSERT_EQ(fb_size + push_req->data_size(), incoming_buf.size(), "Size mismatch of data size vs buffer size"); + uint8_t const* data = incoming_buf.cbytes() + fb_size; + + if (((uintptr_t)data % data_service().get_align_size()) != 0) { + // Unaligned buffer, create a new buffer and copy the entire buf + rreq->buf_for_unaligned_data = + std::move(sisl::io_blob_safe(push_req->data_size(), data_service().get_align_size())); + std::memcpy(rreq->buf_for_unaligned_data.bytes(), data, push_req->data_size()); + data = rreq->buf_for_unaligned_data.cbytes(); + } + + // Schedule a write and upon completion, mark the data as written. 
+ data_service() + .async_write(r_cast< const char* >(data), push_req->data_size(), rreq->local_blkid) + .thenValue([this, rreq](auto&& err) { + RD_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener + rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_WRITTEN)); + rreq->data_written_promise.setValue(); + RD_LOG(INFO, "Data Channel: Data Write completed rreq=[{}]", rreq->to_compact_string()); + }); +} + +static bool blob_equals(sisl::blob const& a, sisl::blob const& b) { + if (a.size() != b.size()) { return false; } + return (std::memcmp(a.cbytes(), b.cbytes(), a.size()) == 0); +} + +static MultiBlkId do_alloc_blk(uint32_t size, blk_alloc_hints const& hints) { + MultiBlkId blkid; + auto const status = data_service().alloc_blks(sisl::round_up(size, data_service().get_blk_size()), hints, blkid); + RELEASE_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "alloc_blks returned null, no space left!"); + return blkid; +} + +repl_req_ptr_t RaftReplDev::follower_create_req(repl_key const& rkey, sisl::blob const& user_header, + sisl::blob const& user_key, uint32_t data_size) { + auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx())); + RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req"); + auto rreq = it->second; + + if (!happened) { + // We already have the entry in the map, check if we are already allocated the blk by previous caller, in that + // case we need to return the req. 
+ if (rreq->state.load() & uint32_cast(repl_req_state_t::BLK_ALLOCATED)) { + // Do validation if we have the correct mapping + RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}", + rkey.to_string()); + RD_REL_ASSERT(blob_equals(user_key, rreq->key), "User key mismatch for repl_key={}", rkey.to_string()); + RD_LOG(INFO, "Repl_key=[{}] already received ", rkey.to_string()); + return rreq; + } + } + + // We need to allocate the block, since entry doesn't exist or if it exist, two threads are trying to do the same + // thing. So take state mutex and allocate the blk + std::unique_lock< std::mutex > lg(rreq->state_mtx); + if (rreq->state.load() & uint32_cast(repl_req_state_t::BLK_ALLOCATED)) { return rreq; } + rreq->rkey = rkey; + rreq->header = user_header; + rreq->key = user_key; + rreq->local_blkid = do_alloc_blk(data_size, m_listener->get_blk_alloc_hints(user_header, data_size)); + rreq->state.fetch_or(uint32_cast(repl_req_state_t::BLK_ALLOCATED)); + + return rreq; +} + +AsyncNotify RaftReplDev::notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs) { + std::vector< folly::SemiFuture< folly::Unit > > futs; + futs.reserve(rreqs->size()); + + // Pop any entries that are already completed - from the entries list as well as from map + rreqs->erase(std::remove_if( + rreqs->begin(), rreqs->end(), + [this, &futs](repl_req_ptr_t const& rreq) { + if (rreq == nullptr) { return true; } + + if (rreq->state.load() & uint32_cast(repl_req_state_t::DATA_WRITTEN)) { + m_repl_key_req_map.erase(rreq->rkey); // Remove=Pop from map as well, since it is completed + RD_LOG(INFO, + "Raft Channel: Data write completed and blkid mapped, removing from map: rreq=[{}]", + rreq->to_compact_string()); + return true; // Remove from the pending list + } else { + futs.emplace_back(rreq->data_written_promise.getSemiFuture()); + return false; + } + }), + rreqs->end()); + + // All the entries are done already, no need to wait + if (rreqs->size() == 0) 
{ return folly::makeFuture< folly::Unit >(folly::Unit{}); } + +#if 0 + // We are yet to support reactive fetch from remote. + if (m_resync_mode) { + // if in resync mode, fetch data from remote immediately; + check_and_fetch_remote_data(std::move(rreqs)); + } else { + // some blkids are not in completed state, let's schedule a timer to check it again; + // we wait for data channel to fill in the data. Still if its not done we trigger a fetch from remote; + m_wait_blkid_write_timer_hdl = iomanager.schedule_thread_timer( // timer wakes up in current thread; + HS_DYNAMIC_CONFIG(repl->wait_blkid_write_timer_sec) * 1000 * 1000 * 1000, false /* recurring */, + nullptr /* cookie */, [this, std::move(rreqs)](auto) { + check_and_fetch_remote_data(std::move(rreqs)); + }); + } + return ret; +#endif + + return folly::collectAll(futs).deferValue([this, rreqs](auto&& e) { + for (auto const& rreq : *rreqs) { + HS_DBG_ASSERT(rreq->state.load() & uint32_cast(repl_req_state_t::DATA_WRITTEN), + "Data written promise raised without updating DATA_WRITTEN state for rkey={}", + rreq->rkey.to_string()); + RD_LOG(INFO, "Raft Channel: Data write completed and blkid mapped, removing from map: rreq=[{}]", + rreq->to_compact_string()); + m_repl_key_req_map.erase(rreq->rkey); // Remove from map as well, since it is completed + } + return folly::makeSemiFuture< folly::Unit >(folly::Unit{}); + }); +} + +folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch) { + return data_service().async_read(bid, sgs, size, part_of_batch); +} + +void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) { + // TODO: For timeline consistency required, we should retain the blkid that is changed and write that to another + // journal. 
+ data_service().async_free_blk(bid); +} + +bool RaftReplDev::is_leader() const { return m_repl_svc_ctx->is_raft_leader(); } + +uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); } + +nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); } +nuraft::raft_server* RaftReplDev::raft_server() { return m_repl_svc_ctx->_server; } + +/////////////////////////////////// Config Serialize/Deserialize Section //////////////////////////////////// +static nlohmann::json serialize_server_config(std::list< nuraft::ptr< nuraft::srv_config > > const& server_list) { + auto servers = nlohmann::json::array(); + for (auto const& server_conf : server_list) { + if (!server_conf) { continue; } + servers.push_back(nlohmann::json{{"id", server_conf->get_id()}, + {"dc_id", server_conf->get_dc_id()}, + {"endpoint", server_conf->get_endpoint()}, + {"aux", server_conf->get_aux()}, + {"learner", server_conf->is_learner()}, + {"priority", server_conf->get_priority()}}); + } + return servers; +} + +static nlohmann::json serialize_cluster_config(const nuraft::cluster_config& config) { + return nlohmann::json{{"log_idx", config.get_log_idx()}, + {"prev_log_idx", config.get_prev_log_idx()}, + {"eventual_consistency", config.is_async_replication()}, + {"user_ctx", config.get_user_ctx()}, + {"servers", serialize_server_config(config.get_servers())}}; +} + +static nuraft::ptr< nuraft::srv_config > deserialize_server_config(nlohmann::json const& server) { + DEBUG_ASSERT(server.contains("id"), "Missing field") + auto const id = static_cast< int32_t >(server["id"]); + DEBUG_ASSERT(server.contains("dc_id"), "Missing field") + auto const dc_id = static_cast< int32_t >(server["dc_id"]); + DEBUG_ASSERT(server.contains("endpoint"), "Missing field") + auto const endpoint = server["endpoint"]; + DEBUG_ASSERT(server.contains("aux"), "Missing field") + auto const aux = server["aux"]; + DEBUG_ASSERT(server.contains("learner"), "Missing field") + 
auto const learner = server["learner"]; + DEBUG_ASSERT(server.contains("priority"), "Missing field") + auto const prior = static_cast< int32_t >(server["priority"]); + return nuraft::cs_new< nuraft::srv_config >(id, dc_id, endpoint, aux, learner, prior); +} + +static void deserialize_server_list(nlohmann::json const& servers, + std::list< nuraft::ptr< nuraft::srv_config > >& server_list) { + for (auto const& server_conf : servers) { + server_list.push_back(deserialize_server_config(server_conf)); + } +} + +nuraft::ptr< nuraft::cluster_config > deserialize_cluster_config(nlohmann::json const& cluster_config) { + DEBUG_ASSERT(cluster_config.contains("log_idx"), "Missing field") + auto const& log_idx = cluster_config["log_idx"]; + DEBUG_ASSERT(cluster_config.contains("prev_log_idx"), "Missing field") + auto const& prev_log_idx = cluster_config["prev_log_idx"]; + DEBUG_ASSERT(cluster_config.contains("eventual_consistency"), "Missing field") + auto const& eventual = cluster_config["eventual_consistency"]; + + auto raft_config = nuraft::cs_new< nuraft::cluster_config >(log_idx, prev_log_idx, eventual); + DEBUG_ASSERT(cluster_config.contains("user_ctx"), "Missing field") + raft_config->set_user_ctx(cluster_config["user_ctx"]); + DEBUG_ASSERT(cluster_config.contains("servers"), "Missing field") + deserialize_server_list(cluster_config["servers"], raft_config->get_servers()); + return raft_config; +} + +nuraft::ptr< nuraft::cluster_config > RaftReplDev::load_config() { + std::unique_lock lg{m_config_mtx}; + auto& js = *m_raft_config_sb; + + if (!js.contains("config")) { + auto cluster_conf = nuraft::cs_new< nuraft::cluster_config >(); + cluster_conf->get_servers().push_back( + nuraft::cs_new< nuraft::srv_config >(m_raft_server_id, my_replica_id_str())); + js["config"] = serialize_cluster_config(*cluster_conf); + } + return deserialize_cluster_config(js["config"]); +} + +void RaftReplDev::save_config(const nuraft::cluster_config& config) { + std::unique_lock 
lg{m_config_mtx}; + (*m_raft_config_sb)["config"] = serialize_cluster_config(config); + m_raft_config_sb.write(); +} + +void RaftReplDev::save_state(const nuraft::srv_state& state) { + std::unique_lock lg{m_config_mtx}; + (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}}; + m_raft_config_sb.write(); +} + +nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() { + std::unique_lock lg{m_config_mtx}; + auto& js = *m_raft_config_sb; + auto state = nuraft::cs_new< nuraft::srv_state >(); + if (js["state"].empty()) { + js["state"] = nlohmann::json{{"term", state->get_term()}, {"voted_for", state->get_voted_for()}}; + } else { + try { + state->set_term(uint64_cast(js["state"]["term"])); + state->set_voted_for(static_cast< int >(js["state"]["voted_for"])); + } catch (std::out_of_range const&) { + LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id) + } + } + return state; +} + +nuraft::ptr< nuraft::log_store > RaftReplDev::load_log_store() { return m_data_journal; } + +int32_t RaftReplDev::server_id() { return m_raft_server_id; } + +/////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides //////////////////////////////////// +uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_id(); } + +std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; } + +void RaftReplDev::permanent_destroy() { + // TODO: Implement this +} +void RaftReplDev::leave() { + // TODO: Implement this +} + +/////////////////////////////////// Private methods //////////////////////////////////// +void RaftReplDev::report_committed(repl_req_ptr_t rreq) { + auto prev_lsn = m_commit_upto_lsn.exchange(rreq->lsn); + RD_DBG_ASSERT_GT(rreq->lsn, prev_lsn, "Out of order commit of lsns, it is not expected in RaftReplDev"); + + RD_LOG(INFO, "Raft channel: Commit rreq=[{}]", rreq->to_compact_string()); + m_listener->on_commit(rreq->lsn, 
rreq->header, rreq->key, rreq->local_blkid, rreq); + + if (!rreq->is_proposer) { + rreq->header = sisl::blob{}; + rreq->key = sisl::blob{}; + rreq->pkts = sisl::io_blob_list_t{}; + if (rreq->rpc_data) { + rreq->rpc_data->send_response(); + rreq->rpc_data = nullptr; + } + } +} + +void RaftReplDev::cp_flush(CP*) { + auto lsn = m_commit_upto_lsn.load(); + if (lsn == m_last_flushed_commit_lsn) { + // Not dirtied since last flush ignore + return; + } + + m_rd_sb->commit_lsn = lsn; + m_rd_sb->checkpoint_lsn = lsn; + m_rd_sb.write(); + m_last_flushed_commit_lsn = lsn; +} + +void RaftReplDev::cp_cleanup(CP*) {} +} // namespace homestore \ No newline at end of file diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h new file mode 100644 index 000000000..42a7384f8 --- /dev/null +++ b/src/lib/replication/repl_dev/raft_repl_dev.h @@ -0,0 +1,118 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include "replication/repl_dev/common.h" +#include "replication/repl_dev/raft_state_machine.h" +#include "replication/log_store/repl_log_store.h" + +namespace homestore { + +#pragma pack(1) +struct raft_repl_dev_superblk : public repl_dev_superblk { + static constexpr uint32_t RAFT_REPL_DEV_SB_VERSION = 1; + + uint32_t raft_sb_version{RAFT_REPL_DEV_SB_VERSION}; + logstore_id_t free_blks_journal_id; // Logstore id for storing free blkid records + uint8_t is_timeline_consistent; // Flag to indicate whether the recovery of followers need to be timeline consistent + uint64_t last_applied_dsn; // Last applied data sequence number + + uint32_t get_raft_sb_version() const { return raft_sb_version; } +}; +#pragma pack() + +using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; + +class RaftReplService; +class CP; +class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr { +private: + shared< RaftStateMachine > m_state_machine; + RaftReplService& m_repl_svc; + folly::ConcurrentHashMap< 
repl_key, repl_req_ptr_t, repl_key::Hasher > m_repl_key_req_map; + nuraft_mesg::Manager& m_msg_mgr; + group_id_t m_group_id; // Replication Group id + std::string m_rdev_name; // Short name for the group for easy debugging + replica_id_t m_my_repl_id; // This replica's uuid + int32_t m_raft_server_id; // Server ID used by raft (unique within raft group) + shared< ReplLogStore > m_data_journal; + shared< HomeLogStore > m_free_blks_journal; + + std::mutex m_config_mtx; + superblk< raft_repl_dev_superblk > m_rd_sb; // Superblk where we store the state machine etc + json_superblk m_raft_config_sb; // Raft Context and Config data information stored + mutable folly::SharedMutexWritePriority m_sb_lock; // Lock to protect staged sb and persisting sb + raft_repl_dev_superblk m_sb_in_mem; // Cached version which is used to read and for staging + + std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was lastly written, to track flushes + repl_lsn_t m_last_flushed_commit_lsn{0}; // LSN upto which it was flushed to persistent store + iomgr::timer_handle_t m_sb_flush_timer_hdl; + + std::atomic< uint64_t > m_next_dsn{0}; // Data Sequence Number that will keep incrementing for each data entry + + static std::atomic< uint64_t > s_next_group_ordinal; + +public: + friend class RaftStateMachine; + + RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >& rd_sb, bool load_existing); + virtual ~RaftReplDev() = default; + + void destroy(); + + //////////////// All ReplDev overrides/implementation /////////////////////// + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, + repl_req_ptr_t ctx) override; + folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch = false) override; + void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + bool is_leader() const override; + group_id_t group_id() const override { return m_group_id; } + 
std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } + std::string rdev_name() const { return m_rdev_name; } + std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } + uint32_t get_blk_size() const override; + repl_lsn_t get_last_commit_lsn() const { return m_commit_upto_lsn.load(); } + + //////////////// Accessor/shortcut methods /////////////////////// + nuraft_mesg::repl_service_ctx* group_msg_service(); + nuraft::raft_server* raft_server(); + + //////////////// Methods needed for other Raft classes to access ///////////////// + void use_config(json_superblk raft_config_sb); + void report_committed(repl_req_ptr_t rreq); + repl_req_ptr_t follower_create_req(repl_key const& rkey, sisl::blob const& user_header, sisl::blob const& user_key, + uint32_t data_size); + AsyncNotify notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); + void cp_flush(CP* cp); + void cp_cleanup(CP* cp); + +protected: + //////////////// All nuraft::state_mgr overrides /////////////////////// + nuraft::ptr< nuraft::cluster_config > load_config() override; + void save_config(const nuraft::cluster_config& config) override; + void save_state(const nuraft::srv_state& state) override; + nuraft::ptr< nuraft::srv_state > read_state() override; + nuraft::ptr< nuraft::log_store > load_log_store() override; + int32_t server_id() override; + void system_exit(const int exit_code) override { LOGINFO("System exiting with code [{}]", exit_code); } + + //////////////// All nuraft_mesg::mesg_state_mgr overrides /////////////////////// + uint32_t get_logstore_id() const override; + std::shared_ptr< nuraft::state_machine > get_state_machine() override; + void permanent_destroy() override; + void leave() override; + +private: + shared< nuraft::log_store > data_journal() { return m_data_journal; } + void push_data_to_all_followers(repl_req_ptr_t rreq); + void on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data); +}; + +} // 
namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp new file mode 100644 index 000000000..e25f965f3 --- /dev/null +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -0,0 +1,184 @@ +#include +#include +#include +#include + +#include "repl_dev/raft_state_machine.h" +#include "repl_dev/raft_repl_dev.h" + +SISL_LOGGING_DECL(replication) + +namespace homestore { + +RaftStateMachine::RaftStateMachine(RaftReplDev& rd) : m_rd{rd} { + m_success_ptr = nuraft::buffer::alloc(sizeof(int)); + m_success_ptr->put(0); +} + +raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_params const& params) { + // Leader precommit is processed in next callback, because this callback doesn't provide a way to stick a context + // which could contain the req structure in it. + if (!m_rd.is_leader()) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + + repl_req_ptr_t rreq = lsn_to_req(lsn); + if (rreq == nullptr) { return m_success_ptr; } // No req linked to this lsn, same handling as commit_ext + RD_LOG(INFO, "Raft channel: Precommit rreq=[{}]", rreq->to_compact_string()); + m_rd.m_listener->on_pre_commit(rreq->lsn, rreq->header, rreq->key, rreq); + } + return m_success_ptr; +} + +void RaftStateMachine::after_precommit_in_leader(nuraft::raft_server::req_ext_cb_params const& params) { + repl_req_ptr_t rreq = repl_req_ptr_t(r_cast< repl_req_ctx* >(params.context)); + link_lsn_to_req(rreq, int64_cast(params.log_idx)); + + RD_LOG(INFO, "Raft Channel: Proposed rreq=[{}]", rreq->to_compact_string()); + m_rd.m_listener->on_pre_commit(rreq->lsn, rreq->header, rreq->key, rreq); +} + +raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + raft_buf_ptr_t data = params.data; + + repl_req_ptr_t rreq = lsn_to_req(lsn); + if (rreq == nullptr) { return m_success_ptr; } + + RD_LOG(INFO, "Raft channel: Received Commit message rreq=[{}]", 
rreq->to_compact_string()); + if (m_rd.is_leader()) { + // This is the time to ensure flushing of journal happens in leader + if (m_rd.m_data_journal->last_durable_index() < uint64_cast(lsn)) { m_rd.m_data_journal->flush(); } + rreq->state.fetch_or(uint32_cast(repl_req_state_t::LOG_FLUSHED)); + } + if (rreq->state.load() & uint32_cast(repl_req_state_t::DATA_WRITTEN)) { + m_lsn_req_map.erase(rreq->lsn); + m_rd.report_committed(rreq); + } + return m_success_ptr; +} + +uint64_t RaftStateMachine::last_commit_index() { return uint64_cast(m_rd.get_last_commit_lsn()); } + +void RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { + uint32_t val_size = rreq->value.size ? rreq->local_blkid.serialized_size() : 0; + uint32_t entry_size = sizeof(repl_journal_entry) + rreq->header.size() + rreq->key.size() + val_size; + rreq->alloc_journal_entry(entry_size, true /* raft_buf */); + rreq->journal_entry->code = (rreq->value.size) ? journal_type_t::HS_LARGE_DATA : journal_type_t::HS_HEADER_ONLY; + rreq->journal_entry->server_id = m_rd.server_id(); + rreq->journal_entry->dsn = rreq->dsn(); + rreq->journal_entry->user_header_size = rreq->header.size(); + rreq->journal_entry->key_size = rreq->key.size(); + rreq->journal_entry->value_size = val_size; + + rreq->is_proposer = true; + uint8_t* raw_ptr = uintptr_cast(rreq->journal_entry) + sizeof(repl_journal_entry); + if (rreq->header.size()) { + std::memcpy(raw_ptr, rreq->header.cbytes(), rreq->header.size()); + raw_ptr += rreq->header.size(); + } + + if (rreq->key.size()) { + std::memcpy(raw_ptr, rreq->key.cbytes(), rreq->key.size()); + raw_ptr += rreq->key.size(); + } + + if (rreq->value.size) { + auto const b = rreq->local_blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); + } + + auto* vec = sisl::VectorPool< raft_buf_ptr_t >::alloc(); + vec->push_back(rreq->raft_journal_buf()); + + nuraft::raft_server::req_ext_params param; + param.after_precommit_ = 
bind_this(RaftStateMachine::after_precommit_in_leader, 1); + param.expected_term_ = 0; + param.context_ = voidptr_cast(rreq.get()); + + RD_LOG(TRACE, "Raft Channel: journal_entry=[{}] ", rreq->journal_entry->to_string()); + + m_rd.raft_server()->append_entries_ext(*vec, param); + sisl::VectorPool< raft_buf_ptr_t >::free(vec); +} + +repl_req_ptr_t RaftStateMachine::transform_journal_entry(nuraft::ptr< nuraft::log_entry >& lentry) { + // Leader has nothing to transform or process + if (m_rd.is_leader()) { return nullptr; } + + // We don't want to transform anything that is not an app log + if (lentry->get_val_type() != nuraft::log_val_type::app_log) { return nullptr; } + + repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry->get_buf().data_begin()); + RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, + "Mismatched version of journal entry received from RAFT peer"); + + RD_LOG(TRACE, "Received Raft log_entry=[term={}], journal_entry=[{}] ", lentry->get_term(), jentry->to_string()); + + // For inline data we don't need to transform anything + if (jentry->code != journal_type_t::HS_LARGE_DATA) { return nullptr; } + + sisl::blob const header = sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; + sisl::blob const key = sisl::blob{header.cbytes() + header.size(), jentry->key_size}; + DEBUG_ASSERT_GT(jentry->value_size, 0, "Entry marked as large data, but value size is notified as 0"); + + // From the repl_key, get the repl_req. In cases where log stream got here first, this method will create a new + // repl_req and return that back. 
Fill up all of the required journal entry inside the repl_req + auto rreq = m_rd.follower_create_req( + repl_key{.server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn}, header, key, + jentry->value_size); // NOTE(review): value_size looks like the serialized blkid size, not the data size - confirm + rreq->journal_buf = lentry->serialize(); + + MultiBlkId entry_blkid; + entry_blkid.deserialize(sisl::blob{key.cbytes() + key.size(), jentry->value_size}, true /* copy */); + rreq->remote_blkid = RemoteBlkId{jentry->server_id, entry_blkid}; + + auto const local_size = rreq->local_blkid.serialized_size(); + auto const remote_size = entry_blkid.serialized_size(); + uint8_t* blkid_location; + if (local_size > remote_size) { + // We need to copy the entire log_entry to accommodate local blkid + auto new_buf = nuraft::buffer::expand(*rreq->raft_journal_buf(), + rreq->raft_journal_buf()->size() + local_size - remote_size); + blkid_location = uintptr_cast(new_buf->data_begin()) + rreq->raft_journal_buf()->size() - jentry->value_size; + rreq->journal_buf = std::move(new_buf); + } else { + // Can do in-place replace of remote blkid with local blkid. 
+ blkid_location = uintptr_cast(rreq->raft_journal_buf()->data_begin()) + rreq->raft_journal_buf()->size() - + jentry->value_size; + } + std::memcpy(blkid_location, rreq->local_blkid.serialize().cbytes(), local_size); + rreq->journal_entry = r_cast< repl_journal_entry* >(rreq->raft_journal_buf()->data_begin()); + + return rreq; +} + +void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) { + rreq->lsn = lsn; + rreq->state.fetch_or(uint32_cast(repl_req_state_t::LOG_RECEIVED)); + [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq)); + RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list", lsn); +} + +repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t lsn) { + // Pull the req from the lsn + auto const it = m_lsn_req_map.find(lsn); + // RD_DBG_ASSERT(it != m_lsn_req_map.cend(), "lsn req map missing lsn={}", lsn); + if (it == m_lsn_req_map.cend()) { return nullptr; } + + repl_req_ptr_t rreq = it->second; + RD_DBG_ASSERT_EQ(lsn, rreq->lsn, "lsn req map mismatch"); + return rreq; +} + +nuraft_mesg::repl_service_ctx* RaftStateMachine::group_msg_service() { return m_rd.group_msg_service(); } + +void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) { + RD_LOG(DEBUG, "create_snapshot {}/{}", s.get_last_log_idx(), s.get_last_log_term()); + auto null_except = std::shared_ptr< std::exception >(); + auto ret_val{false}; + if (when_done) when_done(ret_val, null_except); +} + +std::string RaftStateMachine::rdev_name() const { return m_rd.rdev_name(); } +} // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h new file mode 100644 index 000000000..c341ebd3b --- /dev/null +++ b/src/lib/replication/repl_dev/raft_state_machine.h @@ -0,0 +1,123 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "replication/repl_dev/common.h" + +#if defined 
__clang__ or defined __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif +#include +#if defined __clang__ or defined __GNUC__ +#pragma GCC diagnostic pop +#endif +#undef auto_lock + +namespace homestore { +class ReplicaSetImpl; +class StateMachineStore; + +#define RD_LOG(level, msg, ...) \ + LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + msg, ##__VA_ARGS__); + +#define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ + { \ + assert_type##_ASSERT_CMP( \ + val1, cmp, val2, \ + [&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + sisl::logging::default_cmp_assert_formatter(buf, msgcb, std::forward< decltype(args) >(args)...); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("rd", rdev_name())); \ + return true; \ + }, \ + ##__VA_ARGS__); \ + } +#define RD_ASSERT(assert_type, cond, ...) \ + { \ + assert_type##_ASSERT_FMT(cond, \ + ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + ##__VA_ARGS__); \ + } + +#define RD_DBG_ASSERT(cond, ...) 
RD_ASSERT(DEBUG, cond, ##__VA_ARGS__) +#define RD_DBG_ASSERT_EQ(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, ==, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_NE(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, !=, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_LT(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, <, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_LE(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, <=, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, >, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_GE(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, >=, val2, ##__VA_ARGS__) + +#define RD_REL_ASSERT(cond, ...) RD_ASSERT(RELEASE, cond, ##__VA_ARGS__) +#define RD_REL_ASSERT_EQ(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, ==, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_NE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, !=, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_LT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, <, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_LE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, <=, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_GE(val1, val2, ...) 
RD_ASSERT_CMP(RELEASE, val1, >=, val2, ##__VA_ARGS__)
+
+using AsyncNotify = folly::SemiFuture< folly::Unit >;
+using AsyncNotifier = folly::Promise< folly::Unit >;
+
+class RaftReplDev;
+class RaftStateMachine : public nuraft::state_machine { // NuRaft state machine implementation tied to one RaftReplDev
+private:
+    folly::ConcurrentHashMap< int64_t, repl_req_ptr_t > m_lsn_req_map; // keyed by lsn; presumably filled by link_lsn_to_req() and read by lsn_to_req() - confirm in the .cpp
+    RaftReplDev& m_rd; // presumably bound to the constructor argument - confirm in the .cpp
+    nuraft::ptr< nuraft::buffer > m_success_ptr; // Preallocate the success return to raft
+    // iomgr::timer_handle_t m_wait_blkid_write_timer_hdl{iomgr::null_timer_handle};
+    bool m_resync_mode{false};
+
+public:
+    RaftStateMachine(RaftReplDev& rd);
+    ~RaftStateMachine() override = default;
+    RaftStateMachine(RaftStateMachine const&) = delete;            // copy construction is disabled
+    RaftStateMachine& operator=(RaftStateMachine const&) = delete; // copy assignment is disabled
+
+    /// NuRaft overrides
+    uint64_t last_commit_index() override;
+    raft_buf_ptr_t pre_commit_ext(const nuraft::state_machine::ext_op_params& params) override;
+    raft_buf_ptr_t commit_ext(const nuraft::state_machine::ext_op_params& params) override;
+    void rollback(uint64_t lsn, nuraft::buffer&) override { LOGCRITICAL("Unimplemented rollback on: [{}]", lsn); } // rollback is not implemented: it only logs the lsn
+
+    bool apply_snapshot(nuraft::snapshot&) override { return false; } // the snapshot argument is ignored; always reports false
+    void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override;
+    nuraft::ptr< nuraft::snapshot > last_snapshot() override { return nullptr; } // always reports that no snapshot exists
+
+    ////////// APIs outside of nuraft::state_machine requirements ////////////////////
+    void propose_to_raft(repl_req_ptr_t rreq);
+    repl_req_ptr_t transform_journal_entry(nuraft::ptr< nuraft::log_entry >& lentry);
+    void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn);
+    repl_req_ptr_t lsn_to_req(int64_t lsn);
+    nuraft_mesg::repl_service_ctx* group_msg_service();
+
+    std::string rdev_name() const; // referenced by the RD_LOG / RD_ASSERT macros defined earlier in this header
+
+private:
+    void after_precommit_in_leader(const nuraft::raft_server::req_ext_cb_params& params);
+};
+
+} // namespace homestore
diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp
b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 5cf2bd8eb..5d7d7d338 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -1,9 +1,10 @@ +#include +#include "replication/repl_dev/solo_repl_dev.h" +#include "replication/repl_dev/common.h" #include #include #include #include "common/homestore_assert.hpp" -#include "replication/repl_dev/solo_repl_dev.h" -#include "replication/repl_dev/common.h" namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk > const& rd_sb, bool load_existing) : @@ -35,7 +36,8 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& if (rreq->value.size) { // Step 1: Alloc Blkid auto status = data_service().alloc_blks(uint32_cast(rreq->value.size), - m_listener->get_blk_alloc_hints(rreq->header, rreq), rreq->local_blkid); + m_listener->get_blk_alloc_hints(rreq->header, rreq->value.size), + rreq->local_blkid); HS_REL_ASSERT_EQ(status, BlkAllocStatus::SUCCESS); // Write the data diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 51204a07a..684cf8be3 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -14,6 +14,7 @@ *********************************************************************************/ #pragma once +#include #include #include diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 0340a018d..7505703a2 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -17,26 +17,28 @@ #include #include "common/homestore_assert.hpp" #include "replication/service/generic_repl_svc.h" +#include "replication/service/raft_repl_service.h" #include "replication/repl_dev/solo_repl_dev.h" namespace homestore { ReplicationService& repl_service() { return hs()->repl_service(); } -std::unique_ptr< 
GenericReplService > GenericReplService::create(cshared< ReplApplication >& repl_app) { +std::shared_ptr< GenericReplService > GenericReplService::create(cshared< ReplApplication >& repl_app) { auto impl_type = repl_app->get_impl_type(); if (impl_type == repl_impl_type::solo) { - return std::make_unique< SoloReplService >(repl_app); - //} else if (impl_type == repl_impl_type::server_side) { - // return std::make_unique< RaftReplService >(repl_app); + return std::make_shared< SoloReplService >(repl_app); + } else if (impl_type == repl_impl_type::server_side) { + return std::make_shared< RaftReplService >(repl_app); } else { return nullptr; } } -GenericReplService::GenericReplService(cshared< ReplApplication >& repl_app) : m_repl_app{repl_app} { +GenericReplService::GenericReplService(cshared< ReplApplication >& repl_app) : + m_repl_app{repl_app}, m_my_uuid{repl_app->get_my_repl_id()} { meta_service().register_handler( get_meta_blk_name(), - [this](meta_blk* mblk, sisl::byte_view buf, size_t) { rd_super_blk_found(std::move(buf), voidptr_cast(mblk)); }, + [this](meta_blk* mblk, sisl::byte_view buf, size_t) { load_repl_dev(std::move(buf), voidptr_cast(mblk)); }, nullptr); } @@ -45,53 +47,7 @@ void GenericReplService::stop() { m_rd_map.clear(); } -AsyncReplResult< shared< ReplDev > > GenericReplService::create_repl_dev(uuid_t group_id, - std::set< uuid_t, std::less<> >&& members) { - // Ensure idempotency of the repl_dev creation - auto it = m_rd_map.end(); - bool happened = false; - { - std::unique_lock lg(m_rd_map_mtx); - std::tie(it, happened) = m_rd_map.emplace(std::make_pair(group_id, nullptr)); - - if (!happened) { - if (it == m_rd_map.end()) { - // We should never reach here, as we have failed to emplace in map, but couldn't find entry - DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); - return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); - } else if (it->second == nullptr) { - // There is a duplicate create_repl_dev 
request while one is being done. - return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_IS_JOINING); - } else { - return make_async_success(it->second); - } - } - } - - // Create whatever underlying implementation of repl_dev needs to be for fresh creation of repl_dev - auto const result = create_replica_set(group_id, std::move(members)).get(); - if (!bool(result)) { return make_async_error< shared< ReplDev > >(result.error()); } - - // Now we need to create local repl dev instance which is actually creates entry, state machine etc - // according the the underlying implementation - superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; - rd_sb.create(rd_super_blk_size()); - rd_sb->group_id = group_id; - - auto repl_dev = create_local_repl_dev_instance(rd_sb, false /* load_existing */); - auto listener = m_repl_app->create_repl_dev_listener(group_id); - listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(listener)); - rd_sb.write(); - - { - std::unique_lock lg(m_rd_map_mtx); - it->second = repl_dev; - } - return make_async_success(repl_dev); -} - -ReplResult< shared< ReplDev > > GenericReplService::get_repl_dev(uuid_t group_id) const { +ReplResult< shared< ReplDev > > GenericReplService::get_repl_dev(group_id_t group_id) const { std::shared_lock lg(m_rd_map_mtx); if (auto it = m_rd_map.find(group_id); it != m_rd_map.end()) { return it->second; } return folly::makeUnexpected(ReplServiceError::SERVER_NOT_FOUND); @@ -104,22 +60,10 @@ void GenericReplService::iterate_repl_devs(std::function< void(cshared< ReplDev } } -void GenericReplService::rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie) { - superblk< repl_dev_superblk > rd_sb; - rd_sb.load(buf, meta_cookie); - HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); - HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk"); - - auto 
repl_dev = create_local_repl_dev_instance(rd_sb, true /* load_existing */); - auto listener = m_repl_app->create_repl_dev_listener(repl_dev->group_id()); - listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(listener)); - - join_replica_set(rd_sb->group_id, repl_dev); - { - std::unique_lock lg(m_rd_map_mtx); - m_rd_map.emplace(std::make_pair(rd_sb->group_id, repl_dev)); - } +void GenericReplService::add_repl_dev(group_id_t group_id, shared< ReplDev > rdev) { + std::unique_lock lg(m_rd_map_mtx); + [[maybe_unused]] auto [it, happened] = m_rd_map.emplace(std::pair{group_id, rdev}); + HS_DBG_ASSERT(happened, "Unable to put the repl_dev in rd map for group_id={}, duplicated add?", group_id); } hs_stats GenericReplService::get_cap_stats() const { @@ -137,36 +81,68 @@ void SoloReplService::start() { hs()->cp_mgr().register_consumer(cp_consumer_t::REPLICATION_SVC, std::make_unique< SoloReplServiceCPHandler >()); } -AsyncReplResult<> SoloReplService::replace_member(uuid_t group_id, uuid_t member_out, uuid_t member_in) const { - return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); -} +AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) { + superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; + rd_sb.create(); + rd_sb->group_id = group_id; + auto rdev = std::make_shared< SoloReplDev >(rd_sb, false /* load_existing */); -AsyncReplResult<> SoloReplService::create_replica_set(uuid_t group_id, std::set< uuid_t, std::less<> >&& members) { - return make_async_success<>(); -} + auto listener = m_repl_app->create_repl_dev_listener(group_id); + listener->set_repl_dev(rdev.get()); + rdev->attach_listener(std::move(listener)); + rd_sb.write(); + + { + std::unique_lock lg(m_rd_map_mtx); + auto [it, happened] = m_rd_map.emplace(group_id, rdev); + if (!happened) { + // We should never reach here, as we have failed to emplace in map, but couldn't find entry + 
DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map"); + return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS); + } + } -AsyncReplResult<> SoloReplService::join_replica_set(uuid_t group_id, cshared< ReplDev >& repl_dev) { - return make_async_success<>(); + return make_async_success< shared< ReplDev > >(rdev); } -shared< ReplDev > SoloReplService::create_local_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, - bool load_existing) { - return std::make_shared< SoloReplDev >(rd_sb, load_existing); +void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) { + superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; + rd_sb.load(buf, meta_cookie); + HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); + HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk"); + group_id_t group_id = rd_sb->group_id; + auto rdev = std::make_shared< SoloReplDev >(rd_sb, true /* load_existing */); + + auto listener = m_repl_app->create_repl_dev_listener(group_id); + listener->set_repl_dev(rdev.get()); + rdev->attach_listener(std::move(listener)); + + { + std::unique_lock lg(m_rd_map_mtx); + auto [it, happened] = m_rd_map.emplace(group_id, rdev); + HS_DBG_ASSERT(happened, "Unable to put the repl_dev in rd map for group_id={}", group_id); + } } -uint32_t SoloReplService::rd_super_blk_size() const { return sizeof(repl_dev_superblk); } +AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out, + replica_id_t member_in) const { + return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED); +} std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { 
std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); }); + repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { + if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); } + }); return folly::makeFuture< bool >(true); } void SoloReplServiceCPHandler::cp_cleanup(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); }); + repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) { + if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); } + }); } int SoloReplServiceCPHandler::cp_progress_percent() { return 100; } diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h index 95bc52bb4..64b8ea47a 100644 --- a/src/lib/replication/service/generic_repl_svc.h +++ b/src/lib/replication/service/generic_repl_svc.h @@ -31,36 +31,34 @@ namespace homestore { +static std::string const PUSH_DATA{"push_data"}; +static std::string const FETCH_DATA{"fetch_data"}; + struct repl_dev_superblk; class GenericReplService : public ReplicationService { protected: shared< ReplApplication > m_repl_app; std::shared_mutex m_rd_map_mtx; - std::map< uuid_t, shared< ReplDev > > m_rd_map; + std::map< group_id_t, shared< ReplDev > > m_rd_map; + replica_id_t m_my_uuid; public: - static std::unique_ptr< GenericReplService > create(cshared< ReplApplication >& repl_app); + static std::shared_ptr< GenericReplService > create(cshared< ReplApplication >& repl_app); GenericReplService(cshared< ReplApplication >& repl_app); virtual void start() = 0; virtual void stop(); meta_sub_type get_meta_blk_name() const override { return "repl_dev"; } - AsyncReplResult< shared< ReplDev > > create_repl_dev(uuid_t group_id, - std::set< uuid_t, std::less<> >&& members) override; - ReplResult< shared< ReplDev > > get_repl_dev(uuid_t group_id) const override; + ReplResult< shared< 
ReplDev > > get_repl_dev(group_id_t group_id) const override; void iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) override; + hs_stats get_cap_stats() const override; + replica_id_t get_my_repl_uuid() const { return m_my_uuid; } protected: - virtual AsyncReplResult<> create_replica_set(uuid_t group_id, std::set< uuid_t, std::less<> >&& members) = 0; - virtual AsyncReplResult<> join_replica_set(uuid_t group_id, cshared< ReplDev >& repl_dev) = 0; - virtual shared< ReplDev > create_local_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, - bool load_existing) = 0; - virtual uint32_t rd_super_blk_size() const = 0; - -private: - void rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie); + virtual void add_repl_dev(group_id_t group_id, shared< ReplDev > rdev); + virtual void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) = 0; }; class SoloReplService : public GenericReplService { @@ -68,14 +66,11 @@ class SoloReplService : public GenericReplService { SoloReplService(cshared< ReplApplication >& repl_app); void start() override; - AsyncReplResult<> replace_member(uuid_t group_id, uuid_t member_out, uuid_t member_in) const override; - -private: - AsyncReplResult<> create_replica_set(uuid_t group_id, std::set< uuid_t, std::less<> >&& members) override; - AsyncReplResult<> join_replica_set(uuid_t group_id, cshared< ReplDev >& repl_dev) override; - shared< ReplDev > create_local_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, - bool load_existing) override; - uint32_t rd_super_blk_size() const override; + AsyncReplResult< shared< ReplDev > > create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) override; + void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; + AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, + replica_id_t member_in) const override; }; class SoloReplServiceCPHandler : public CPCallbacks { diff --git 
a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp
new file mode 100644
index 000000000..7377b30ba
--- /dev/null
+++ b/src/lib/replication/service/raft_repl_service.cpp
@@ -0,0 +1,259 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *    https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+#include
+#include
+#include
+
+#include
+#include "common/homestore_config.hpp"
+#include "common/homestore_assert.hpp"
+#include "replication/service/raft_repl_service.h"
+#include "replication/repl_dev/raft_repl_dev.h"
+
+namespace homestore {
+/// Translates a NuRaft command result code into the matching ReplServiceError. Codes without an explicit
+/// mapping are reported as ReplServiceError::FAILED.
+ReplServiceError RaftReplService::to_repl_error(nuraft::cmd_result_code code) {
+    ReplServiceError ret;
+    if (code == nuraft::cmd_result_code::OK) {
+        ret = ReplServiceError::OK;
+    } else if (code == nuraft::cmd_result_code::CANCELLED) {
+        ret = ReplServiceError::CANCELLED;
+    } else if (code == nuraft::cmd_result_code::TIMEOUT) {
+        ret = ReplServiceError::TIMEOUT;
+    } else if (code == nuraft::cmd_result_code::NOT_LEADER) {
+        ret = ReplServiceError::NOT_LEADER;
+    } else if (code == nuraft::cmd_result_code::BAD_REQUEST) {
+        ret = ReplServiceError::BAD_REQUEST;
+    } else if (code == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) {
+        ret = ReplServiceError::SERVER_ALREADY_EXISTS;
+    } else if (code == nuraft::cmd_result_code::CONFIG_CHANGING) {
+        ret = ReplServiceError::CONFIG_CHANGING;
+    } else if (code == nuraft::cmd_result_code::SERVER_IS_JOINING) {
+        ret = ReplServiceError::SERVER_IS_JOINING;
+    } else if (code == nuraft::cmd_result_code::SERVER_NOT_FOUND) {
+        ret = ReplServiceError::SERVER_NOT_FOUND;
+    } else if (code == nuraft::cmd_result_code::CANNOT_REMOVE_LEADER) {
+        ret = ReplServiceError::CANNOT_REMOVE_LEADER;
+    } else if (code == nuraft::cmd_result_code::SERVER_IS_LEAVING) {
+        ret = ReplServiceError::SERVER_IS_LEAVING;
+    } else if (code == nuraft::cmd_result_code::TERM_MISMATCH) {
+        ret = ReplServiceError::TERM_MISMATCH;
+    } else if (code == nuraft::cmd_result_code::RESULT_NOT_EXIST_YET) {
+        ret = ReplServiceError::RESULT_NOT_EXIST_YET;
+    } else {
+        ret = ReplServiceError::FAILED;
+    }
+    return ret;
+}
+
+/// Registers a meta-service handler for the "<meta blk name>_raft_config" sub type which forwards every block
+/// found to raft_group_config_found(). The repl_dev sub type is passed in the optional sub type list of the call.
+RaftReplService::RaftReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {
+    meta_service().register_handler(
+        get_meta_blk_name() + "_raft_config",
+        [this](meta_blk* mblk, sisl::byte_view buf, size_t) {
+            raft_group_config_found(std::move(buf), voidptr_cast(mblk));
+        },
+        nullptr, false, std::optional< meta_subtype_vec_t >({get_meta_blk_name()}));
+}
+
+/// Brings up the nuraft_mesg messaging manager with this replica's uuid and port, registers the
+/// "homestore_replication" group type with raft parameters read from the dynamic config, and registers the CP handler.
+void RaftReplService::start() {
+    /*auto params = nuraft_mesg::Manager::Params{
+        .server_uuid_ = m_my_uuid,
+        .mesg_port_ = m_repl_app->lookup_peer(m_my_uuid).second,
+        .default_group_type_ = "homestore_replication",
+        .ssl_key_ = ioenvironment.get_ssl_key(),
+        .ssl_cert_ = ioenvironment.get_ssl_cert(),
+        .token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()),
+        .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client())};*/
+    auto params = nuraft_mesg::Manager::Params();
+    params.server_uuid_ = m_my_uuid;
+    params.mesg_port_ = m_repl_app->lookup_peer(m_my_uuid).second;
+    params.default_group_type_ = "homestore_replication";
+    m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */);
+
+    LOGINFOMOD(replication, "Starting RaftReplService with server_uuid={} port={}",
+               boost::uuids::to_string(params.server_uuid_), params.mesg_port_);
+
+    auto r_params = nuraft::raft_params()
+                        .with_election_timeout_lower(HS_DYNAMIC_CONFIG(consensus.elect_to_low_ms))
+                        .with_election_timeout_upper(HS_DYNAMIC_CONFIG(consensus.elect_to_high_ms))
+                        .with_rpc_failure_backoff(HS_DYNAMIC_CONFIG(consensus.rpc_backoff_ms))
+                        .with_hb_interval(HS_DYNAMIC_CONFIG(consensus.heartbeat_period_ms))
+                        .with_max_append_size(HS_DYNAMIC_CONFIG(consensus.max_append_batch_size))
+                        .with_log_sync_batch_size(HS_DYNAMIC_CONFIG(consensus.log_sync_batch_size))
+                        .with_log_sync_stopping_gap(HS_DYNAMIC_CONFIG(consensus.min_log_gap_to_join))
+                        .with_stale_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_hi_threshold))
+                        .with_fresh_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_lo_threshold))
+                        .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance))
+                        .with_reserved_log_items(0) // In reality ReplLogStore retains much more than this
+                        .with_auto_forwarding(false);
+    r_params.return_method_ = nuraft::raft_params::async_handler;
+    m_msg_mgr->register_mgr_type(params.default_group_type_, r_params);
+
+    hs()->cp_mgr().register_consumer(cp_consumer_t::REPLICATION_SVC, std::make_unique< RaftReplServiceCPHandler >());
+}
+
+/// Meta-service callback for a raft config block: loads the JSON superblk, reads its "group_id" and hands the
+/// config to the RaftReplDev already registered for that group (release-asserts if the group is not found).
+void RaftReplService::raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie) {
+    json_superblk group_config;
+    auto& js = group_config.load(buf, meta_cookie);
+    std::string gid_str = js["group_id"];
+    RELEASE_ASSERT(!gid_str.empty(), "Invalid raft_group config found");
+
+    boost::uuids::string_generator gen;
+    uuid_t uuid = gen(gid_str);
+
+    auto v = get_repl_dev(uuid);
+    RELEASE_ASSERT(bool(v), "Not able to find the group_id corresponding, has repl_dev superblk not loaded yet?");
+
+    (std::dynamic_pointer_cast< RaftReplDev >(*v))->use_config(std::move(group_config));
+}
+
+/// Returns "<first>:<second>" built from the pair that ReplApplication::lookup_peer() reports for the peer.
+std::string RaftReplService::lookup_peer(nuraft_mesg::peer_id_t const& peer) {
+    auto const p = m_repl_app->lookup_peer(peer);
+    return p.first + ":" + std::to_string(p.second);
+}
+
+/// Returns the state manager (the RaftReplDev) for the group. If no repl_dev is registered for the group yet, a new
+/// superblk and RaftReplDev are created, a listener is attached, the dev is added to the map and the superblk written.
+shared< nuraft_mesg::mesg_state_mgr > RaftReplService::create_state_mgr(int32_t srv_id,
+                                                                        nuraft_mesg::group_id_t const& group_id) {
+    auto result = get_repl_dev(group_id);
+    if (result) { return std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(result.value()); }
+
+    // Create a new raft superblk
+    superblk< raft_repl_dev_superblk > rd_sb{get_meta_blk_name()};
+    rd_sb.create();
+    rd_sb->group_id = group_id;
+    rd_sb->is_timeline_consistent = m_repl_app->need_timeline_consistency();
+
+    // Create a new instance of Raft ReplDev (which is the state manager this method is looking for)
+    auto rdev = std::make_shared< RaftReplDev >(*this, rd_sb, false /* load_existing */);
+    rdev->use_config(json_superblk{get_meta_blk_name() + "_raft_config"});
+
+    // Attach the listener to the raft
+    auto listener = m_repl_app->create_repl_dev_listener(group_id);
+    listener->set_repl_dev(rdev.get());
+    rdev->attach_listener(std::move(listener));
+
+    // Add the repl dev to the map
+    add_repl_dev(group_id, rdev);
+
+    // Now we can persist the superblk
+    rd_sb.write();
+    return std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(rdev);
+}
+
+/// Creates the raft group (when members is non-empty), adds every member other than this replica, retrying once a
+/// second while the config is changing, and finally returns the repl_dev registered for the group.
+AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t group_id,
+                                                                      std::set< replica_id_t > const& members) {
+    // TODO: All operations are made sync here for convenience to caller. However, we should attempt to make this async
+    // and do deferValue to a separate dedicated hs thread for these kind of operations and wakeup the caller. It
+    // probably needs iomanager executor for deferValue.
+    if (members.size() > 0) {
+        // Create a new RAFT group and add all members. create_group() will call the create_state_mgr which will create
+        // the repl_dev instance and add it to the map.
+        if (auto const status = m_msg_mgr->create_group(group_id, "homestore_replication").get(); !status) {
+            return make_async_error< shared< ReplDev > >(to_repl_error(status.error()));
+        }
+
+        auto my_id = m_repl_app->get_my_repl_id();
+        for (auto& member : members) {
+            if (member == my_id) { continue; } // Skip myself
+            do {
+                auto const result = m_msg_mgr->add_member(group_id, member).get();
+                if (result) {
+                    LOGINFO("Groupid={}, new member={} added", boost::uuids::to_string(group_id),
+                            boost::uuids::to_string(member));
+                    break;
+                } else if (result.error() != nuraft::CONFIG_CHANGING) {
+                    return make_async_error< shared< ReplDev > >(to_repl_error(result.error()));
+                } else {
+                    LOGWARN("Config is changing for group_id={} while adding member={}, retry operation in a second",
+                            boost::uuids::to_string(group_id), boost::uuids::to_string(member));
+                    std::this_thread::sleep_for(std::chrono::seconds(1));
+                }
+            } while (true);
+        }
+    }
+
+    auto result = get_repl_dev(group_id);
+    return result ? make_async_success< shared< ReplDev > >(result.value())
+                  : make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_NOT_FOUND);
+}
+
+/// Meta-service callback for a repl_dev superblk: validates it, builds the RaftReplDev, joins the raft group and
+/// registers the dev. A second superblk for an already loaded group trips a debug assert and is otherwise ignored.
+void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) {
+    // Load the superblk
+    superblk< raft_repl_dev_superblk > rd_sb{get_meta_blk_name()};
+    rd_sb.load(buf, meta_cookie);
+    HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch");
+    HS_DBG_ASSERT_EQ(rd_sb->get_raft_sb_version(), raft_repl_dev_superblk::RAFT_REPL_DEV_SB_VERSION,
+                     "Invalid version of raft rdev metablk");
+    group_id_t group_id = rd_sb->group_id;
+
+    // Validate if the repl_dev for this group is already loaded.
+    auto rdev_result = get_repl_dev(group_id);
+    if (rdev_result) {
+        HS_DBG_ASSERT(false, "Group ID={} already loaded and added to repl_dev list, duplicate load?",
+                      boost::uuids::to_string(group_id).c_str());
+        return;
+    }
+
+    // Create an instance of ReplDev from loaded superblk
+    auto rdev = std::make_shared< RaftReplDev >(*this, rd_sb, true /* load_existing */);
+
+    // Try to join the RAFT group
+    auto raft_result = m_msg_mgr->join_group(group_id, "homestore_replication",
+                                             std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(rdev));
+    if (!raft_result) {
+        HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", boost::uuids::to_string(group_id).c_str(),
+                      raft_result.error());
+    }
+
+    // Add the RaftReplDev to the list of repl_devs
+    add_repl_dev(group_id, rdev);
+}
+
+/// Member replacement is not supported yet; always completes with ReplServiceError::NOT_IMPLEMENTED.
+AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out,
+                                                  replica_id_t member_in) const {
+    return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED);
+}
+
+///////////////////// RaftReplService CP Callbacks /////////////////////////////
+std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; }
+
+folly::Future< bool > RaftReplServiceCPHandler::cp_flush(CP* cp) {
+    repl_service().iterate_repl_devs(
+        [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp); });
+    return folly::makeFuture< bool >(true);
+}
+
+void RaftReplServiceCPHandler::cp_cleanup(CP* cp) {
+    repl_service().iterate_repl_devs(
+        [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_cleanup(cp); });
+}
+
+int RaftReplServiceCPHandler::cp_progress_percent() { return 100; }
+} // namespace homestore
\ No newline at end of file
diff --git a/src/lib/replication/service/raft_repl_service.h b/src/lib/replication/service/raft_repl_service.h
new file mode 100644
index 000000000..fa12cd07e
--- /dev/null
+++
b/src/lib/replication/service/raft_repl_service.h
@@ -0,0 +1,77 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *    https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+#pragma once
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include "replication/service/generic_repl_svc.h"
+
+namespace homestore {
+
+struct repl_dev_superblk;
+class RaftReplService : public GenericReplService, // replication service implemented on top of nuraft_mesg
+                        public nuraft_mesg::MessagingApplication,
+                        public std::enable_shared_from_this< RaftReplService > {
+private:
+    shared< nuraft_mesg::Manager > m_msg_mgr; // msg_manager() dereferences this without a null check
+    json_superblk m_config_sb; // NOTE(review): no use is visible in raft_repl_service.cpp - confirm it is still needed
+
+public:
+    RaftReplService(cshared< ReplApplication >& repl_app);
+
+    static ReplServiceError to_repl_error(nuraft::cmd_result_code code); // nuraft result code -> ReplServiceError
+
+    ///////////////////// Overrides of nuraft_mesg::MessagingApplication ////////////////////
+    std::string lookup_peer(nuraft_mesg::peer_id_t const&) override;
+    std::shared_ptr< nuraft_mesg::mesg_state_mgr > create_state_mgr(int32_t srv_id,
+                                                                    nuraft_mesg::group_id_t const& group_id) override;
+    nuraft_mesg::Manager& msg_manager() { return *m_msg_mgr; } // plain accessor (not marked override)
+
+protected:
+    ///////////////////// Overrides of GenericReplService ////////////////////
+    void start() override;
+    AsyncReplResult< shared< ReplDev > > create_repl_dev(group_id_t group_id,
+                                                         std::set< replica_id_t > const& members) override;
+    void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override;
+    AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out,
+                                     replica_id_t member_in) const override;
+
+private:
+    void raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); // handler for raft config meta blocks
+};
+
+class RaftReplServiceCPHandler : public CPCallbacks { // checkpoint (CP) callbacks for the raft replication service
+public:
+    RaftReplServiceCPHandler() = default;
+    virtual ~RaftReplServiceCPHandler() = default;
+
+public:
+    std::unique_ptr< CPContext > on_switchover_cp(CP* cur_cp, CP* new_cp) override;
+    folly::Future< bool > cp_flush(CP* cp) override;
+    void cp_cleanup(CP* cp) override;
+    int cp_progress_percent() override;
+};
+
+} // namespace homestore
diff --git a/src/lib/replication/service/repl_service_impl.cpp b/src/lib/replication/service/repl_service_impl.cpp
deleted file mode 100644
index f93c25e8a..000000000
--- a/src/lib/replication/service/repl_service_impl.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*********************************************************************************
- * Modifications Copyright 2017-2019 eBay Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *    https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software distributed
- * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- * CONDITIONS OF ANY KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations under the License.
- * - *********************************************************************************/ -#include -#include -#include "common/homestore_assert.hpp" -#include "replication/service/repl_service_impl.h" -#include "replication/repl_dev/solo_repl_dev.h" -#include "homestore/blkdata_service.hpp" -#include "homestore/homestore.hpp" - -namespace homestore { -ReplicationService& repl_service() { return hs()->repl_service(); } - -ReplicationServiceImpl::ReplicationServiceImpl(repl_impl_type impl_type) : m_repl_type{impl_type} { - meta_service().register_handler( - "repl_dev", - [this](meta_blk* mblk, sisl::byte_view buf, size_t) { rd_super_blk_found(std::move(buf), voidptr_cast(mblk)); }, - nullptr); -} - -void ReplicationServiceImpl::start() { - // Register to CP to flush the super blk and truncate the logstore - hs()->cp_mgr().register_consumer(cp_consumer_t::REPLICATION_SVC, std::make_unique< ReplServiceCPHandler >()); - - { - std::shared_lock lg{m_rd_map_mtx}; - for (auto const& [gid, info] : m_pending_open) { - // info.dev_promise.setValue(folly::makeUnexpected(ReplServiceError::SERVER_NOT_FOUND)); - } - } - m_rd_map_loaded = true; -} - -void ReplicationServiceImpl::stop() { - std::unique_lock lg{m_rd_map_mtx}; - m_rd_map.clear(); -} - -hs_stats ReplicationServiceImpl::get_cap_stats() const { - hs_stats stats; - - stats.total_capacity = data_service().get_total_capacity(); - stats.used_capacity = data_service().get_used_capacity(); - return stats; -} - -AsyncReplResult< shared< ReplDev > > -ReplicationServiceImpl::create_repl_dev(uuid_t group_id, std::set< std::string, std::less<> >&& members, - std::unique_ptr< ReplDevListener > listener) { - superblk< repl_dev_superblk > rd_sb{"repl_dev"}; - rd_sb.create(sizeof(repl_dev_superblk)); - rd_sb->gid = group_id; - - shared< ReplDev > repl_dev = create_repl_dev_instance(rd_sb, false /* load_existing */); - listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(listener)); - rd_sb.write(); - return 
make_async_success(std::move(repl_dev)); -} - -AsyncReplResult< shared< ReplDev > > -ReplicationServiceImpl::open_repl_dev(uuid_t group_id, std::unique_ptr< ReplDevListener > listener) { - if (m_rd_map_loaded) { - // We have already loaded all repl_dev and open_repl_dev is called after that, we don't support dynamically - // opening the repl_dev. Return an error - LOGERROR("Opening group_id={} after services are started, which is not supported", - boost::uuids::to_string(group_id)); - return make_async_error< shared< ReplDev > >(ReplServiceError::BAD_REQUEST); - } - - std::unique_lock lg(m_rd_map_mtx); - auto it = m_rd_map.find(group_id); - if (it != m_rd_map.end()) { - // We already loaded the ReplDev, just call the group_id and attach the listener - auto repl_dev = it->second; - listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(listener)); - return make_async_success< shared< ReplDev > >(std::move(repl_dev)); - } else { - auto [pending_it, inserted] = - m_pending_open.insert_or_assign(group_id, listener_info{.listener = std::move(listener)}); - DEBUG_ASSERT(inserted, "Duplicate open_replica_dev called for group_id = {}", - boost::uuids::to_string(group_id)); - return pending_it->second.dev_promise.getFuture(); - } -} - -ReplResult< shared< ReplDev > > ReplicationServiceImpl::get_repl_dev(uuid_t group_id) const { - std::shared_lock lg(m_rd_map_mtx); - if (auto it = m_rd_map.find(group_id); it != m_rd_map.end()) { return it->second; } - return folly::makeUnexpected(ReplServiceError::SERVER_NOT_FOUND); -} - -void ReplicationServiceImpl::iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) { - std::shared_lock lg(m_rd_map_mtx); - for (const auto& [uuid, rd] : m_rd_map) { - cb(rd); - } -} - -folly::Future< ReplServiceError > ReplicationServiceImpl::replace_member(uuid_t group_id, std::string const& member_out, - std::string const& member_in) const { - return folly::makeFuture< ReplServiceError 
>(ReplServiceError::NOT_IMPLEMENTED); -} - -shared< ReplDev > ReplicationServiceImpl::create_repl_dev_instance(superblk< repl_dev_superblk > const& rd_sb, - bool load_existing) { - auto it = m_rd_map.end(); - bool happened = false; - - { - std::unique_lock lg(m_rd_map_mtx); - std::tie(it, happened) = m_rd_map.emplace(std::make_pair(rd_sb->gid, nullptr)); - } - DEBUG_ASSERT(m_rd_map.end() != it, "Could not insert into map!"); - if (!happened) { return it->second; } - - shared< ReplDev > repl_dev; - if (m_repl_type == repl_impl_type::solo) { - repl_dev = std::make_shared< SoloReplDev >(rd_sb, load_existing); - } else { - HS_REL_ASSERT(false, "Repl impl type = {} is not supported yet", enum_name(m_repl_type)); - } - it->second = repl_dev; - - return repl_dev; -} - -void ReplicationServiceImpl::rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie) { - superblk< repl_dev_superblk > rd_sb; - rd_sb.load(buf, meta_cookie); - HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); - HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk"); - - shared< ReplDev > repl_dev = create_repl_dev_instance(rd_sb, true /* load_existing */); - { - std::unique_lock lg(m_rd_map_mtx); - auto it = m_pending_open.find(rd_sb->gid); - if (it != m_pending_open.end()) { - auto& li_info = it->second; - // Someone waiting for this repl dev to open, call them to attach the listener and provide the value - li_info.listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(li_info.listener)); - li_info.dev_promise.setValue(repl_dev); - m_pending_open.erase(it); - } - } -} - -///////////////////// CP Callbacks for Repl Service ////////////// -ReplServiceCPHandler::ReplServiceCPHandler() {} - -std::unique_ptr< CPContext > ReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } - -folly::Future< bool > 
ReplServiceCPHandler::cp_flush(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); }); - return folly::makeFuture< bool >(true); -} - -void ReplServiceCPHandler::cp_cleanup(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); }); -} - -int ReplServiceCPHandler::cp_progress_percent() { return 100; } - -} // namespace homestore diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index f40a66311..e71d3e510 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -16,7 +16,7 @@ if (${build_nonio_tests}) add_executable(test_blkalloc) target_sources(test_blkalloc PRIVATE test_blkalloc.cpp $) target_link_libraries(test_blkalloc homestore ${COMMON_TEST_DEPS} ) - add_test(NAME BlkAlloc COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_blkalloc) + add_test(NAME BlkAlloc COMMAND ${CMAKE_BINARY_DIR}/bin/test_blkalloc) add_executable(test_blk_cache_queue) target_sources(test_blk_cache_queue PRIVATE test_blk_cache_queue.cpp ../lib/blkalloc/blk_cache_queue.cpp) @@ -26,7 +26,7 @@ if (${build_nonio_tests}) set(TEST_JOURNAL_VDEV_SOURCES test_journal_vdev.cpp) add_executable(test_journal_vdev ${TEST_JOURNAL_VDEV_SOURCES}) target_link_libraries(test_journal_vdev homestore ${COMMON_TEST_DEPS} GTest::gmock) - add_test(NAME JournalVDev COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_journal_vdev) + add_test(NAME JournalVDev COMMAND ${CMAKE_BINARY_DIR}/bin/test_journal_vdev) set(TEST_BTREENODE_SOURCE_FILES test_btree_node.cpp) add_executable(test_btree_node ${TEST_BTREENODE_SOURCE_FILES}) @@ -95,20 +95,32 @@ if (${io_tests}) target_sources(test_solo_repl_dev PRIVATE test_solo_repl_dev.cpp) target_link_libraries(test_solo_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_home_raft_logstore) + 
target_sources(test_home_raft_logstore PRIVATE test_home_raft_logstore.cpp) + target_link_libraries(test_home_raft_logstore homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_raft_repl_dev) + target_sources(test_raft_repl_dev PRIVATE test_raft_repl_dev.cpp) + target_link_libraries(test_raft_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) - add_test(NAME LogStore-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_log_store) - add_test(NAME MetaBlkMgr-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr) - add_test(NAME DataService-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_data_service) - add_test(NAME SoloReplDev-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev) + add_test(NAME LogStore-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_log_store) + add_test(NAME MetaBlkMgr-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr) + add_test(NAME DataService-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_data_service) + add_test(NAME SoloReplDev-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev) + add_test(NAME HomeRaftLogStore-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_home_raft_logstore) + add_test(NAME RaftReplDev-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_raft_repl_dev) endif() can_build_spdk_io_tests(spdk_tests) if(${spdk_tests}) - add_test(NAME LogStore-Spdk COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_log_store -- --spdk "true") - add_test(NAME MetaBlkMgr-Spdk COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr -- --spdk "true") - add_test(NAME DataSerice-Spdk COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_data_service -- --spdk "true") - add_test(NAME SoloReplDev-Spdk COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev -- --spdk "true") + 
add_test(NAME LogStore-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_log_store -- --spdk "true") + add_test(NAME MetaBlkMgr-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr -- --spdk "true") + add_test(NAME DataService-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_data_service -- --spdk "true") + add_test(NAME SoloReplDev-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev -- --spdk "true") + add_test(NAME HomeRaftLogStore-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_home_raft_logstore -- --spdk "true") + add_test(NAME RaftReplDev-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_raft_repl_dev -- --spdk "true") if(${epoll_tests}) SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk) SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk) diff --git a/src/tests/log_dev_benchmark.cpp b/src/tests/log_dev_benchmark.cpp index a16b616e7..107f5c08f 100644 --- a/src/tests/log_dev_benchmark.cpp +++ b/src/tests/log_dev_benchmark.cpp @@ -28,7 +28,7 @@ #include "logstore/log_dev.hpp" -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) RCU_REGISTER_INIT static constexpr size_t ITERATIONS{100000}; diff --git a/src/tests/log_store_benchmark.cpp b/src/tests/log_store_benchmark.cpp index b8d43e6ec..885a1474f 100644 --- a/src/tests/log_store_benchmark.cpp +++ b/src/tests/log_store_benchmark.cpp @@ -35,7 +35,7 @@ #include "test_common/homestore_test_common.hpp" using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) std::vector< std::string > test_common::HSTestHelper::s_dev_names; SISL_OPTIONS_ENABLE(logging, log_store_benchmark, iomgr, test_common_setup) diff --git a/src/tests/test_append_blkalloc.cpp b/src/tests/test_append_blkalloc.cpp index c1073691f..22ff9975c 100644 --- a/src/tests/test_append_blkalloc.cpp +++ b/src/tests/test_append_blkalloc.cpp @@ -44,7 +44,7 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// RCU_REGISTER_INIT 
-SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_append_blkalloc, iomgr, test_common_setup) SISL_LOGGING_DECL(test_append_blkalloc) @@ -285,9 +285,7 @@ TEST_F(AppendBlkAllocatorTest, TestWriteThenRecovey) { SISL_OPTION_GROUP(test_append_blkalloc, (run_time, "", "run_time", "running time in seconds", - ::cxxopts::value< uint64_t >()->default_value("30"), "number"), - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"), - "number")); + ::cxxopts::value< uint64_t >()->default_value("30"), "number")); int main(int argc, char* argv[]) { int parsed_argc{argc}; diff --git a/src/tests/test_blk_cache_queue.cpp b/src/tests/test_blk_cache_queue.cpp index 840c921af..ab2bf5818 100644 --- a/src/tests/test_blk_cache_queue.cpp +++ b/src/tests/test_blk_cache_queue.cpp @@ -28,7 +28,7 @@ #include "blkalloc/varsize_blk_allocator.h" #include "blkalloc/blk_cache_queue.h" -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) using namespace homestore; diff --git a/src/tests/test_blk_read_tracker.cpp b/src/tests/test_blk_read_tracker.cpp index 4c656ac0b..7d9eef662 100644 --- a/src/tests/test_blk_read_tracker.cpp +++ b/src/tests/test_blk_read_tracker.cpp @@ -25,7 +25,7 @@ using namespace homestore; -SISL_LOGGING_INIT(test_blk_read_tracker, iomgr, flip, io_wd) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker) VENUM(op_type_t, uint8_t, insert = 0, remove = 1, wait_on = 2, max_op = 3); diff --git a/src/tests/test_blkalloc.cpp b/src/tests/test_blkalloc.cpp index 0d6204022..6f0040556 100644 --- a/src/tests/test_blkalloc.cpp +++ b/src/tests/test_blkalloc.cpp @@ -41,7 +41,7 @@ #include "blkalloc/fixed_blk_allocator.h" #include "blkalloc/varsize_blk_allocator.h" -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) using namespace homestore; diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp index 
c312f94ce..e1d5a9358 100644 --- a/src/tests/test_blkid.cpp +++ b/src/tests/test_blkid.cpp @@ -7,7 +7,7 @@ #include -SISL_LOGGING_INIT(test_blkid, iomgr, flip, io_wd) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_blkid) SISL_OPTION_GROUP(test_blkid, diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 23109cd58..22069595c 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -31,7 +31,7 @@ static constexpr uint32_t g_max_keys{6000}; static std::uniform_int_distribution< uint32_t > g_randkey_generator{0, g_max_keys - 1}; using namespace homestore; -SISL_LOGGING_INIT(btree, iomgr, flip, io_wd) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) struct FixedLenNodeTest { using NodeType = SimpleNode< TestFixedKey, TestFixedValue >; diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index b1208feb5..c55b10ebd 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -49,6 +49,8 @@ SISL_OPTION_GROUP(test_common_setup, ::cxxopts::value< std::vector< std::string > >(), "path [...]"), (http_port, "", "http_port", "http port (0 for no http, -1 for random, rest specific value)", ::cxxopts::value< int >()->default_value("-1"), "number"), + (num_io, "", "num_io", "number of IO operations", + ::cxxopts::value< uint64_t >()->default_value("300"), "number"), (spdk, "", "spdk", "spdk", ::cxxopts::value< bool >()->default_value("false"), "true or false")); SETTINGS_INIT(iomgrcfg::IomgrSettings, iomgr_config); @@ -74,6 +76,71 @@ inline static uint32_t generate_random_http_port() { return http_port; } +struct Runner { + uint64_t total_tasks_{0}; + uint32_t qdepth_{8}; + std::atomic< uint64_t > issued_tasks_{0}; + std::atomic< uint64_t > completed_tasks_{0}; + std::function< void(void) > task_; + folly::Promise< folly::Unit > comp_promise_; + + Runner(uint64_t num_tasks, uint32_t qd = 8) : 
total_tasks_{num_tasks}, qdepth_{qd} { + if (total_tasks_ < (uint64_t)qdepth_) { total_tasks_ = qdepth_; } + } + Runner() : Runner{SISL_OPTIONS["num_io"].as< uint64_t >()} {} + Runner(const Runner&) = delete; + Runner& operator=(const Runner&) = delete; + + void set_num_tasks(uint64_t num_tasks) { total_tasks_ = std::max((uint64_t)qdepth_, num_tasks); } + void set_task(std::function< void(void) > f) { + issued_tasks_.store(0); + completed_tasks_.store(0); + comp_promise_ = folly::Promise< folly::Unit >{}; + task_ = std::move(f); + } + + folly::Future< folly::Unit > execute() { + for (uint32_t i{0}; i < qdepth_; ++i) { + run_task(); + } + return comp_promise_.getFuture(); + } + + void next_task() { + auto ctasks = completed_tasks_.fetch_add(1); + if ((issued_tasks_.load() < total_tasks_)) { + run_task(); + } else if ((ctasks + 1) == total_tasks_) { + comp_promise_.setValue(); + } + } + + void run_task() { + ++issued_tasks_; + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, task_); + } +}; + +struct Waiter { + std::atomic< uint64_t > expected_comp{0}; + std::atomic< uint64_t > actual_comp{0}; + folly::Promise< folly::Unit > comp_promise; + + Waiter(uint64_t num_op) : expected_comp{num_op} {} + Waiter() : Waiter{SISL_OPTIONS["num_io"].as< uint64_t >()} {} + Waiter(const Waiter&) = delete; + Waiter& operator=(const Waiter&) = delete; + + folly::Future< folly::Unit > start(std::function< void(void) > f) { + f(); + return comp_promise.getFuture(); + } + + void one_complete() { + if ((actual_comp.fetch_add(1) + 1) >= expected_comp.load()) { comp_promise.setValue(); } + } +}; + class HSTestHelper { private: static void remove_files(const std::vector< std::string >& file_paths) { @@ -152,7 +219,8 @@ class HSTestHelper { } LOGINFO("Starting iomgr with {} threads, spdk: {}", num_threads, is_spdk); - ioenvironment.with_iomgr(iomgr::iomgr_params{.num_threads = num_threads, .is_spdk = is_spdk, .num_fibers = num_fibers}); + ioenvironment.with_iomgr( + 
iomgr::iomgr_params{.num_threads = num_threads, .is_spdk = is_spdk, .num_fibers = num_fibers}); auto const http_port = SISL_OPTIONS["http_port"].as< int >(); if (http_port != 0) { @@ -224,8 +292,9 @@ class HSTestHelper { } } - static sisl::sg_list create_sgs(uint64_t io_size, uint32_t blk_size, uint32_t max_size_per_iov, + static sisl::sg_list create_sgs(uint64_t io_size, uint32_t max_size_per_iov, std::optional< uint64_t > fill_data_pattern = std::nullopt) { + auto blk_size = SISL_OPTIONS["block_size"].as< uint32_t >(); HS_REL_ASSERT_EQ(io_size % blk_size, 0, "io_size should be a multiple of blk_size"); HS_REL_ASSERT_EQ(max_size_per_iov % blk_size, 0, "max_size_per_iov should be a multiple of blk_size"); diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp new file mode 100644 index 000000000..e2e18b2b2 --- /dev/null +++ b/src/tests/test_common/hs_repl_test_common.hpp @@ -0,0 +1,252 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +/* + * Homestore Replication testing binaries shared common definitions, apis and data structures + */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "test_common/homestore_test_common.hpp" + +SISL_OPTION_GROUP(test_repl_common_setup, + (replicas, "", "replicas", "Total number of replicas", + ::cxxopts::value< uint32_t >()->default_value("3"), "number"), + (base_port, "", "base_port", "Port number of first replica", + ::cxxopts::value< uint16_t >()->default_value("4000"), "number"), + (replica_num, "", "replica_num", + "Internal replica num (used to lauch multi process) - don't override", + ::cxxopts::value< uint16_t >()->default_value("0"), "number")); + +std::vector< std::string > test_common::HSTestHelper::s_dev_names; + +using namespace homestore; +namespace bip = boost::interprocess; + +namespace test_common { + +VENUM(ipc_packet_op_t, uint32_t, WAKE_UP = 0, CLEAN_EXIT = 1, UNCLEAN_EXIT = 2, PEER_GOING_DOWN = 3); +ENUM(repl_test_phase_t, uint32_t, REGISTER, MEMBER_START, TEST_RUN, VALIDATE, CLEANUP); + +class HSReplTestHelper { +protected: + struct IPCData { + bip::interprocess_mutex mtx_; + bip::interprocess_condition cv_; + + repl_test_phase_t phase_{repl_test_phase_t::REGISTER}; + uint32_t registered_count_{0}; + uint32_t test_start_count_{0}; + uint32_t verify_start_count_{0}; + uint32_t cleanup_start_count_{0}; + uint64_t test_dataset_size_{0}; + + void sync_for_member_start() { sync_for(registered_count_, repl_test_phase_t::MEMBER_START); } + void sync_for_test_start() { sync_for(test_start_count_, repl_test_phase_t::TEST_RUN); } + void sync_for_verify_start() { sync_for(verify_start_count_, repl_test_phase_t::VALIDATE); } + void sync_for_cleanup_start() { sync_for(cleanup_start_count_, repl_test_phase_t::CLEANUP); } + + private: + void 
sync_for(uint32_t& count, repl_test_phase_t new_phase) { + std::unique_lock< bip::interprocess_mutex > lg(mtx_); + ++count; + if (count == SISL_OPTIONS["replicas"].as< uint32_t >()) { + phase_ = new_phase; + cv_.notify_all(); + } + cv_.wait(lg, [this, new_phase]() { return (phase_ == new_phase); }); + } + }; + +public: + class TestReplApplication : public ReplApplication { + private: + HSReplTestHelper& helper_; + + public: + TestReplApplication(HSReplTestHelper& h) : helper_{h} {} + virtual ~TestReplApplication() = default; + + homestore::repl_impl_type get_impl_type() const override { return homestore::repl_impl_type::server_side; } + bool need_timeline_consistency() const { return false; } + + std::shared_ptr< homestore::ReplDevListener > + create_repl_dev_listener(homestore::group_id_t group_id) override { + return helper_.get_listener(group_id); + } + + std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override { + uint16_t port; + if (auto it = helper_.members_.find(replica_id); it != helper_.members_.end()) { + port = SISL_OPTIONS["base_port"].as< uint16_t >() + it->second; + } else { + RELEASE_ASSERT(false, "Gotten lookup_peer call for a non member"); + } + + return std::make_pair(std::string("127.0.0.1"), port); + } + + homestore::replica_id_t get_my_repl_id() const override { return helper_.my_replica_id_; } + }; + +public: + friend class TestReplApplication; + + HSReplTestHelper(std::string const& name, char** argv) : name_{name}, argv_{argv} {} + + void setup() { + replica_num_ = SISL_OPTIONS["replica_num"].as< uint16_t >(); + sisl::logging::SetLogger(name_ + std::string("_replica_") + std::to_string(replica_num_)); + auto const num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >(); + + boost::uuids::string_generator gen; + for (uint32_t i{0}; i < num_replicas; ++i) { + auto replica_id = gen(fmt::format("{:04}", i) + std::string("0123456789abcdef0123456789ab")); + up_members_.insert(i); + if (i == replica_num_) 
{ my_replica_id_ = replica_id; } + members_.insert(std::pair(replica_id, i)); + } + + if (replica_num_ == 0) { + // Erase previous shmem and create a new shmem with IPCData structure + bip::shared_memory_object::remove("raft_repl_test_shmem"); + shm_ = std::make_unique< bip::shared_memory_object >(bip::create_only, "raft_repl_test_shmem", + bip::read_write); + shm_->truncate(sizeof(IPCData)); + region_ = std::make_unique< bip::mapped_region >(*shm_, bip::read_write); + ipc_data_ = new (region_->get_address()) IPCData; + + for (uint32_t i{1}; i < num_replicas; ++i) { + LOGINFO("Spawning Homestore replica={} instance", i); + boost::process::child c(argv_[0], "--replica_num", std::to_string(i), proc_grp_); + c.detach(); + } + } else { + shm_ = + std::make_unique< bip::shared_memory_object >(bip::open_only, "raft_repl_test_shmem", bip::read_write); + region_ = std::make_unique< bip::mapped_region >(*shm_, bip::read_write); + ipc_data_ = static_cast< IPCData* >(region_->get_address()); + } + + int tmp_argc = 1; + folly_ = std::make_unique< folly::Init >(&tmp_argc, &argv_, true); + + LOGINFO("Starting Homestore replica={}", replica_num_); + test_common::HSTestHelper::start_homestore( + name_ + std::to_string(replica_num_), + {{HS_SERVICE::META, {.size_pct = 5.0}}, + {HS_SERVICE::REPLICATION, {.size_pct = 60.0, .repl_app = std::make_unique< TestReplApplication >(*this)}}, + {HS_SERVICE::LOG_REPLICATED, {.size_pct = 20.0}}, + {HS_SERVICE::LOG_LOCAL, {.size_pct = 2.0}}}); + } + + void teardown() { + LOGINFO("Stopping Homestore replica={}", replica_num_); + sisl::GrpcAsyncClientWorker::shutdown_all(); + test_common::HSTestHelper::shutdown_homestore(); + } + + void reset_setup() { + teardown(); + setup(); + } + + uint16_t replica_num() const { return replica_num_; } + + Runner& runner() { return io_runner_; } + + void register_listener(std::shared_ptr< ReplDevListener > listener) { + if (replica_num_ != 0) { pending_listeners_.emplace_back(std::move(listener)); } + + 
ipc_data_->sync_for_member_start(); + + if (replica_num_ == 0) { + std::set< homestore::replica_id_t > members; + std::transform(members_.begin(), members_.end(), std::inserter(members, members.end()), + [](auto const& p) { return p.first; }); + group_id_t repl_group_id = hs_utils::gen_random_uuid(); + { + std::unique_lock lg(groups_mtx_); + repl_groups_.insert({repl_group_id, std::move(listener)}); + } + + auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); + ASSERT_EQ(v.hasValue(), true) + << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + } + } + + std::shared_ptr< ReplDevListener > get_listener(homestore::group_id_t group_id) { + std::unique_lock lg(groups_mtx_); + + auto it = repl_groups_.find(group_id); + if ((it != repl_groups_.end()) && (it->second != nullptr)) { return it->second; } + + RELEASE_ASSERT(!pending_listeners_.empty(), + "Looking for listener for group_id, but register_listener was not called"); + + auto listener = std::move(pending_listeners_[0]); + repl_groups_.insert(std::pair(group_id, listener)); + pending_listeners_.erase(pending_listeners_.begin()); + return listener; + } + + void sync_for_test_start() { ipc_data_->sync_for_test_start(); } + void sync_for_verify_start() { ipc_data_->sync_for_verify_start(); } + void sync_for_cleanup_start() { ipc_data_->sync_for_cleanup_start(); } + void sync_dataset_size(uint64_t dataset_size) { ipc_data_->test_dataset_size_ = dataset_size; } + uint64_t dataset_size() const { return ipc_data_->test_dataset_size_; } + +private: + uint16_t replica_num_; + std::string name_; + char** argv_; + + boost::process::group proc_grp_; + std::unique_ptr< bip::shared_memory_object > shm_; + std::unique_ptr< bip::mapped_region > region_; + std::unique_ptr< folly::Init > folly_; + + std::mutex groups_mtx_; + std::condition_variable group_created_cv_; + std::map< homestore::group_id_t, std::shared_ptr< homestore::ReplDevListener > > repl_groups_; + 
std::vector< std::shared_ptr< homestore::ReplDevListener > > pending_listeners_; // pending to join raft group + std::map< homestore::replica_id_t, uint32_t > members_; + std::set< uint32_t > up_members_; + homestore::replica_id_t my_replica_id_; + + std::mutex wakeup_mtx_; + uint32_t wokenup_replicas_{0}; + std::condition_variable wakeup_cv_; + + IPCData* ipc_data_; + + Runner io_runner_; +}; +} // namespace test_common \ No newline at end of file diff --git a/src/tests/test_cp_mgr.cpp b/src/tests/test_cp_mgr.cpp index ad0e8a60d..cf605c151 100644 --- a/src/tests/test_cp_mgr.cpp +++ b/src/tests/test_cp_mgr.cpp @@ -27,7 +27,7 @@ using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_cp_mgr, iomgr, test_common_setup) SISL_LOGGING_DECL(test_cp_mgr) diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index 39e8f2112..db44d0867 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -49,7 +49,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_data_service, iomgr, test_common_setup) SISL_LOGGING_DECL(test_data_service) @@ -816,13 +816,13 @@ TEST_F(BlkDataServiceTest, TestRandMixIOLoad) { // Stream related test -SISL_OPTION_GROUP( - test_data_service, - (run_time, "", "run_time", "running time in seconds", ::cxxopts::value< uint64_t >()->default_value("30"), - "number"), - (min_io_size, "", "min_io_size", "mim io size", ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (max_io_size, "", "max_io_size", "max io size", ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"), "number")); +SISL_OPTION_GROUP(test_data_service, + (run_time, "", "run_time", "running time in seconds", + ::cxxopts::value< uint64_t 
>()->default_value("30"), "number"), + (min_io_size, "", "min_io_size", "mim io size", ::cxxopts::value< uint32_t >()->default_value("4096"), + "number"), + (max_io_size, "", "max_io_size", "max io size", ::cxxopts::value< uint32_t >()->default_value("4096"), + "number")); int main(int argc, char* argv[]) { int parsed_argc{argc}; diff --git a/src/tests/test_device_manager.cpp b/src/tests/test_device_manager.cpp index 13bef9d6b..b9801423e 100644 --- a/src/tests/test_device_manager.cpp +++ b/src/tests/test_device_manager.cpp @@ -35,7 +35,7 @@ #include "device/virtual_dev.hpp" using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_device_manager, iomgr) SISL_OPTION_GROUP(test_device_manager, diff --git a/src/tests/test_home_raft_logstore.cpp b/src/tests/test_home_raft_logstore.cpp new file mode 100644 index 000000000..fdb6759d3 --- /dev/null +++ b/src/tests/test_home_raft_logstore.cpp @@ -0,0 +1,275 @@ +#include +#include +#include +#include +#include +#include + +#include "test_common/homestore_test_common.hpp" +#include "replication/log_store/home_raft_log_store.h" + +using namespace homestore; + +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) + +static constexpr uint32_t g_max_logsize{512}; +static std::random_device g_rd{}; +static std::default_random_engine g_re{g_rd()}; +static std::uniform_int_distribution< uint32_t > g_randlogsize_generator{2, g_max_logsize}; +std::vector< std::string > test_common::HSTestHelper::s_dev_names; + +static constexpr std::array< const char, 62 > alphanum{ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', + 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', + 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'}; + +static std::string gen_random_string(size_t len, uint64_t preamble = std::numeric_limits< 
uint32_t >::max()) { + std::string str; + if (preamble != std::numeric_limits< uint64_t >::max()) { + std::stringstream ss; + ss << std::setw(8) << std::setfill('0') << std::hex << preamble; + str += ss.str(); + } + + std::uniform_int_distribution< size_t > rand_char{0, alphanum.size() - 1}; + for (size_t i{0}; i < len; ++i) { + str += alphanum[rand_char(g_re)]; + } + str += '\0'; + return str; +} + +struct pack_result_t { + raft_buf_ptr_t actual_data; + std::vector< std::string > exp_data; +}; + +class RaftLogStoreClient { +public: + friend class TestRaftLogStore; + + void append_read_test(uint32_t num_entries) { + ASSERT_EQ(m_rls->next_slot(), m_next_lsn); + ASSERT_EQ(m_rls->start_index(), m_start_lsn); + + auto max_lsn_this_iter = uint64_cast(m_next_lsn) + num_entries; + for (uint64_t lsn = m_next_lsn; lsn <= max_lsn_this_iter; ++lsn) { + auto le = make_log(m_cur_term, lsn); + int64_t const store_sn = m_rls->append(le); + + ASSERT_EQ(lsn, store_sn); + ASSERT_EQ(m_rls->next_slot(), lsn + 1); + validate_log(m_rls->last_entry(), lsn); + + ++m_next_lsn; + } + m_rls->flush(); + ASSERT_EQ(m_rls->start_index(), m_start_lsn) << "Start Index not expected to be updated after insertion"; + } + + void rollback_test() { + m_next_lsn = (m_next_lsn - m_start_lsn) / 2; // Rollback half of the current logs + ++m_cur_term; + auto le = make_log(m_cur_term, m_next_lsn); + m_rls->write_at(m_next_lsn, le); // Rollback and write with next term + m_shadow_log.erase(m_shadow_log.begin() + m_next_lsn, m_shadow_log.end()); + ++m_next_lsn; + + ASSERT_EQ(m_rls->next_slot(), m_next_lsn) << "Post rollback, next slot doesn't have expected value"; + validate_log(m_rls->last_entry(), m_next_lsn - 1); + validate_all_logs(); + } + + void compact_test(uint32_t num_records) { + uint64_t compact_upto = m_start_lsn + num_records - 1; + + // Reflect expected behavior from logstore, if we are compacting beyond next insertion index, then we should + // reset the next insertion slot and we expect 
logstores to create holes and fill it with dummy. + if (compact_upto >= uint64_cast(m_next_lsn)) { m_next_lsn = compact_upto + 1; } + + m_start_lsn = compact_upto + 1; + m_rls->compact(compact_upto); + ASSERT_EQ(m_rls->start_index(), m_start_lsn) << "Post compaction, start_index is invalid"; + validate_all_logs(); + } + + void pack_test(uint64_t from, int32_t cnt, pack_result_t& out_pack) { + out_pack.actual_data = m_rls->pack(from, cnt); + ASSERT_NE(out_pack.actual_data.get(), nullptr); + out_pack.exp_data.assign(m_shadow_log.begin() + from - 1, m_shadow_log.begin() + from + cnt - 1); + } + + pack_result_t pack_test() { + pack_result_t p; + pack_test(m_start_lsn, m_next_lsn - m_start_lsn, p); + return p; + } + + void unpack_test(const pack_result_t& p) { + m_rls->apply_pack(m_next_lsn, *p.actual_data); + m_shadow_log.insert(std::end(m_shadow_log), p.exp_data.begin(), p.exp_data.end()); + m_next_lsn += p.exp_data.size(); + validate_all_logs(); + } + + size_t total_records() const { return m_shadow_log.size() - m_start_lsn + 1; } + + void validate_all_logs() { + // Do Basic read validation + ASSERT_EQ(m_rls->next_slot(), m_next_lsn); + ASSERT_EQ(m_rls->start_index(), m_start_lsn); + + if (m_next_lsn > m_start_lsn) { validate_log(m_rls->last_entry(), m_next_lsn - 1); } + + // Do individual get validation + for (uint64_t lsn = m_start_lsn; lsn < uint64_cast(m_next_lsn); ++lsn) { + validate_log(m_rls->entry_at(lsn), lsn); + } + + // Do bulk get validation as well. 
+ auto lsn = m_start_lsn; + auto const entries = m_rls->log_entries(m_start_lsn, m_next_lsn); + ASSERT_EQ(entries->size(), uint64_cast(m_next_lsn - m_start_lsn)); + for (const auto& le : *entries) { + validate_log(le, lsn++); + } + } + +private: + nuraft::ptr< nuraft::log_entry > make_log(uint64_t term, uint64_t lsn) { + auto val = gen_random_string(g_randlogsize_generator(g_re), term); + raft_buf_ptr_t buf = nuraft::buffer::alloc(val.size() + 1); + buf->put(val); + m_shadow_log[lsn - 1] = std::move(val); + return nuraft::cs_new< nuraft::log_entry >(term, buf); + } + + void validate_log(const nuraft::ptr< nuraft::log_entry >& le, int64_t lsn) { + uint64_t expected_term; + std::stringstream ss; + ss << std::hex << m_shadow_log[lsn - 1].substr(0, 8); + ss >> expected_term; + ASSERT_EQ(le->get_term(), expected_term) << "Term mismatch at lsn=" << lsn; + + nuraft::buffer& buf = le->get_buf(); + buf.pos(0); + auto bytes = buf.get_raw(buf.size()); + + ASSERT_EQ(buf.size() - 1, m_shadow_log[lsn - 1].size()) << "Size from log and shadow mismatch for lsn=" << lsn; + ASSERT_EQ(std::string(r_cast< const char* >(bytes), buf.size() - 1), m_shadow_log[lsn - 1]) + << "Log entry mismatch for lsn=" << lsn; + buf.pos(0); + } + +private: + homestore::logstore_id_t m_store_id{UINT32_MAX}; + std::unique_ptr< HomeRaftLogStore > m_rls; + sisl::sparse_vector< std::string > m_shadow_log; + uint64_t m_cur_term{1}; + int64_t m_next_lsn{1}; + int64_t m_start_lsn{1}; +}; + +class TestRaftLogStore : public ::testing::Test { +public: + void SetUp() { + test_common::HSTestHelper::start_homestore("test_home_raft_log_store", + {{HS_SERVICE::META, {.size_pct = 5.0}}, + {HS_SERVICE::LOG_REPLICATED, {.size_pct = 70.0}}, + {HS_SERVICE::LOG_LOCAL, {.size_pct = 2.0}}}); + m_leader_store.m_rls = std::make_unique< HomeRaftLogStore >(); + m_leader_store.m_store_id = m_leader_store.m_rls->logstore_id(); + + m_follower_store.m_rls = std::make_unique< HomeRaftLogStore >(); + m_follower_store.m_store_id = 
m_follower_store.m_rls->logstore_id(); + } + + void restart() { + m_leader_store.m_rls.reset(); + m_follower_store.m_rls.reset(); + + test_common::HSTestHelper::start_homestore( + "test_home_raft_log_store", + {{HS_SERVICE::META, {}}, {HS_SERVICE::LOG_REPLICATED, {}}, {HS_SERVICE::LOG_LOCAL, {}}}, + [this]() { + m_leader_store.m_rls = std::make_unique< HomeRaftLogStore >(m_leader_store.m_store_id); + m_follower_store.m_rls = std::make_unique< HomeRaftLogStore >(m_follower_store.m_store_id); + }, + true /* restart */); + } + + virtual void TearDown() override { + m_leader_store.m_rls.reset(); + m_follower_store.m_rls.reset(); + test_common::HSTestHelper::shutdown_homestore(); + } + +protected: + RaftLogStoreClient m_leader_store; + RaftLogStoreClient m_follower_store; +}; + +TEST_F(TestRaftLogStore, lifecycle_test) { + auto nrecords = SISL_OPTIONS["num_records"].as< uint32_t >(); + + LOGINFO("Step 1: Append and test {} records", nrecords); + this->m_leader_store.append_read_test(nrecords); // assuming nrecords = 1000, total_records = 1000 + + LOGINFO("Step 2: Rollback half of the records"); + this->m_leader_store.rollback_test(); // total_records = 500 + + LOGINFO("Step 3: Post rollback add {} records", nrecords); + this->m_leader_store.append_read_test(nrecords); // total_records = 1500 + + auto shrink_records = (this->m_leader_store.total_records() * 10) / 100; + LOGINFO("Step 4: Compact first 10% records = {}", shrink_records); + this->m_leader_store.compact_test(shrink_records); // total_records = 1350 + + LOGINFO("Step 5: Post compaction add {} records", nrecords); + this->m_leader_store.append_read_test(nrecords); // total_records = 2350 + + shrink_records = this->m_leader_store.total_records() + (this->m_leader_store.total_records() * 10) / 100; + LOGINFO("Step 6: Compaction 10% records={} beyond max appended entries test", shrink_records); + this->m_leader_store.compact_test(shrink_records); // total_records = 0 + + LOGINFO("Step 7: Post compaction add {} 
records", nrecords); + this->m_leader_store.append_read_test(nrecords); // total_records = 1000 + + LOGINFO("Step 8: Pack all records"); + auto pack_data = this->m_leader_store.pack_test(); // total_records = 1000 + + LOGINFO("Step 9: Unpack all records on an empty logstore"); + this->m_follower_store.unpack_test(pack_data); // total_records in follower = 1000 + + LOGINFO("Step 10: Append more {} records to follower logstore", nrecords); + this->m_follower_store.append_read_test(nrecords); // total_records in follower = 2000 + + LOGINFO("Step 11: Unpack same leader records again after append inserted records"); + this->m_follower_store.unpack_test(pack_data); // total_records in follower = 3000 + + LOGINFO("Step 12: Restart homestore and validate recovery"); + this->restart(); + this->m_leader_store.validate_all_logs(); + this->m_follower_store.validate_all_logs(); + + LOGINFO("Step 13: Post recovery do append test"); + this->m_leader_store.append_read_test(nrecords); // total_records in leader = 2000 + this->m_follower_store.append_read_test(nrecords); // total_records in follower = 4000 +} + +SISL_OPTIONS_ENABLE(logging, test_home_raft_log_store, iomgr, test_common_setup) +SISL_OPTION_GROUP(test_home_raft_log_store, + (num_records, "", "num_records", "number of record to test", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (iterations, "", "iterations", "Iterations", ::cxxopts::value< uint32_t >()->default_value("1"), + "the number of iterations to run each test")); + +int main(int argc, char* argv[]) { + int parsed_argc = argc; + ::testing::InitGoogleTest(&parsed_argc, argv); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_home_raft_log_store, iomgr, test_common_setup); + sisl::logging::SetLogger("test_home_raft_log_store"); + spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); + + return RUN_ALL_TESTS(); +} diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index e110a931d..6f8cb7b44 100644 --- 
a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -37,7 +37,7 @@ using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_index_btree, iomgr, test_common_setup) SISL_LOGGING_DECL(test_index_btree) diff --git a/src/tests/test_journal_vdev.cpp b/src/tests/test_journal_vdev.cpp index 39abf9b43..e146144c1 100644 --- a/src/tests/test_journal_vdev.cpp +++ b/src/tests/test_journal_vdev.cpp @@ -40,7 +40,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_vdev, iomgr, test_common_setup) std::vector< std::string > test_common::HSTestHelper::s_dev_names; @@ -342,8 +342,6 @@ SISL_OPTION_GROUP(test_vdev, ::cxxopts::value< uint32_t >()->default_value("8192"), "number"), (run_time, "", "run_time", "running time in seconds", ::cxxopts::value< uint64_t >()->default_value("30"), "number"), - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("3000"), - "number"), (per_read, "", "per_read", "read percentage of io that are reads", ::cxxopts::value< uint32_t >()->default_value("20"), "number"), (per_write, "", "per_write", "write percentage of io that are writes", diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index b41a5f9a5..9738d8efc 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -31,7 +31,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) std::vector< logdev_key > s_logdev_keys; static uint64_t first_offset{~static_cast< uint64_t >(0)}; diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index 656092105..dfa916c19 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -54,7 +54,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) 
+SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) std::vector< std::string > test_common::HSTestHelper::s_dev_names; struct test_log_data { diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 94e78d53c..0c5fa1c5e 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -31,7 +31,7 @@ #include "btree_helpers/btree_test_helper.hpp" using namespace homestore; -SISL_LOGGING_INIT(btree, iomgr, io_wd, flip) +SISL_LOGGING_DEF(btree, iomgr, io_wd, flip) SISL_OPTIONS_ENABLE(logging, test_mem_btree) SISL_OPTION_GROUP( diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index f0875562e..d4fd1b993 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -45,7 +45,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) std::vector< std::string > test_common::HSTestHelper::s_dev_names; SISL_OPTIONS_ENABLE(logging, test_meta_blk_mgr, iomgr, test_common_setup) @@ -885,7 +885,6 @@ SISL_OPTION_GROUP( "number"), (max_write_size, "", "max_write_size", "maximum write size", ::cxxopts::value< uint32_t >()->default_value("524288"), "number"), - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"), "number"), (overflow, "", "overflow", "always do overflow", ::cxxopts::value< uint32_t >()->default_value("0"), "number"), (per_update, "", "per_update", "update percentage", ::cxxopts::value< uint32_t >()->default_value("20"), "number"), (per_write, "", "per_write", "write percentage", ::cxxopts::value< uint32_t >()->default_value("60"), "number"), diff --git a/src/tests/test_pdev.cpp b/src/tests/test_pdev.cpp index d5670abaf..fb5629041 100644 --- a/src/tests/test_pdev.cpp +++ b/src/tests/test_pdev.cpp @@ -34,7 +34,7 @@ #include "device/physical_dev.hpp" using namespace homestore; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, 
test_pdev, iomgr) SISL_OPTION_GROUP(test_pdev, diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp new file mode 100644 index 000000000..36542f6f4 --- /dev/null +++ b/src/tests/test_raft_repl_dev.cpp @@ -0,0 +1,265 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "common/homestore_config.hpp" +#include "common/homestore_assert.hpp" +#include "common/homestore_utils.hpp" +#include "test_common/hs_repl_test_common.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" + +using namespace homestore; + +SISL_LOGGING_DECL(test_raft_repl_dev) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) + +SISL_OPTION_GROUP(test_raft_repl_dev, + (block_size, "", "block_size", "block size to io", + ::cxxopts::value< uint32_t >()->default_value("4096"), "number")); +SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, test_common_setup, test_repl_common_setup) + +static std::unique_ptr< test_common::HSReplTestHelper > g_helper; + +class TestReplicatedDB : public homestore::ReplDevListener { +public: + struct Key { + 
uint64_t id_; + bool operator<(Key const& other) const { return id_ < other.id_; } + }; + + struct Value { + int64_t lsn_; + uint64_t data_size_; + uint64_t data_pattern_; + MultiBlkId blkid_; + }; + + struct test_req : public repl_req_ctx { + struct journal_header { + uint64_t data_size; + uint64_t data_pattern; + }; + + journal_header jheader; + uint64_t key_id; + sisl::sg_list write_sgs; + sisl::sg_list read_sgs; + + sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } + sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } + + test_req() { + write_sgs.size = 0; + read_sgs.size = 0; + key_id = (uint64_t)rand() << 32 | rand(); + } + + ~test_req() { + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + + for (auto const& iov : read_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + }; + + TestReplicatedDB() = default; + virtual ~TestReplicatedDB() = default; + + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("[Replica={}] Received commit on lsn={}", g_helper->replica_num(), lsn); + ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; + Value v{ + .lsn_ = lsn, .data_size_ = jheader->data_size, .data_pattern_ = jheader->data_pattern, .blkid_ = blkids}; + + { + std::unique_lock lk(db_mtx_); + inmem_db_.insert_or_assign(k, v); + } + + if (ctx->is_proposer) { g_helper->runner().next_task(); } + } + + bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("[Replica={}] Received pre-commit on lsn={}", g_helper->replica_num(), lsn); + return true; + } + + void on_rollback(int64_t lsn, const 
sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); + } + + blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + return blk_alloc_hints{}; + } + + void on_replica_stop() override {} + + void db_write(uint64_t data_size, uint32_t max_size_per_iov) { + auto req = intrusive< test_req >(new test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | rand(); + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); + } + + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + } + + void validate_db_data() { + g_helper->runner().set_num_tasks(inmem_db_.size()); + + LOGINFO("[{}]: Total {} keys committed, validating them", boost::uuids::to_string(repl_dev()->group_id()), + inmem_db_.size()); + auto it = inmem_db_.begin(); + g_helper->runner().set_task([this, &it]() { + Key k; + Value v; + { + std::unique_lock lk(db_mtx_); + std::tie(k, v) = *it; + ++it; + } + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); + + repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { + RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, ec.message()); + for (auto const& iov : read_sgs.iovs) { + test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, + v.data_pattern_); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + g_helper->runner().next_task(); + }); + }); + g_helper->runner().execute().get(); + } + + uint64_t db_size() const { + std::shared_lock lk(db_mtx_); + return 
inmem_db_.size(); + } + +private: + std::map< Key, Value > inmem_db_; + std::shared_mutex db_mtx_; +}; + +class RaftReplDevTest : public testing::Test { +public: + void SetUp() override { + // By default it will create one db + auto db = std::make_shared< TestReplicatedDB >(); + g_helper->register_listener(db); + dbs_.emplace_back(std::move(db)); + } + + void generate_writes(uint64_t data_size, uint32_t max_size_per_iov) { + pick_one_db().db_write(data_size, max_size_per_iov); + } + + void wait_for_all_writes(uint64_t exp_writes) { + while (true) { + uint64_t total_writes{0}; + for (auto const& db : dbs_) { + total_writes += db->db_size(); + } + + if (total_writes >= exp_writes) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + void validate_all_data() { + for (auto const& db : dbs_) { + db->validate_db_data(); + } + } + + TestReplicatedDB& pick_one_db() { return *dbs_[0]; } + +private: + std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; +}; + +TEST_F(RaftReplDevTest, All_Append) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + if (g_helper->replica_num() == 0) { + g_helper->sync_dataset_size(SISL_OPTIONS["num_io"].as< uint64_t >()); + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + LOGINFO("Run on worker threads to schedule append on repldev for {} Bytes.", block_size); + g_helper->runner().set_task([this, block_size]() { this->generate_writes(block_size, block_size); }); + g_helper->runner().execute().get(); + } + + this->wait_for_all_writes(g_helper->dataset_size()); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading them"); + this->validate_all_data(); + + g_helper->sync_for_cleanup_start(); +} + +int main(int argc, char* argv[]) { + int parsed_argc{argc}; + char** orig_argv = argv; + + ::testing::InitGoogleTest(&parsed_argc, argv); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, 
test_raft_repl_dev, iomgr, test_common_setup, test_repl_common_setup); + + FLAGS_folly_global_cpu_executor_threads = 4; + g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev", orig_argv); + g_helper->setup(); + + (g_helper->replica_num() == 0) ? ::testing::GTEST_FLAG(filter) = "*Primary_*:*All_*" + : ::testing::GTEST_FLAG(filter) = "*Secondary_*::*All_*"; + + auto ret = RUN_ALL_TESTS(); + g_helper->teardown(); + return ret; +} \ No newline at end of file diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 86b1dca7d..1c47801e9 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -47,7 +47,7 @@ using namespace homestore; using namespace test_common; -SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_solo_repl_dev, iomgr, test_common_setup) SISL_LOGGING_DECL(test_solo_repl_dev) @@ -60,63 +60,6 @@ static constexpr uint64_t Ki{1024}; static constexpr uint64_t Mi{Ki * Ki}; static constexpr uint64_t Gi{Ki * Mi}; -struct Runner { - uint64_t total_tasks{0}; - uint32_t qdepth{8}; - std::atomic< uint64_t > issued_tasks{0}; - std::atomic< uint64_t > pending_tasks{0}; - std::function< void(void) > task; - folly::Promise< folly::Unit > comp_promise; - - Runner(uint64_t num_tasks, uint32_t qd = 8) : total_tasks{num_tasks}, qdepth{qd} { - if (total_tasks < (uint64_t)qdepth) { total_tasks = qdepth; } - } - - Runner() : Runner{SISL_OPTIONS["num_io"].as< uint64_t >()} {} - - void set_task(std::function< void(void) > f) { task = std::move(f); } - - folly::Future< folly::Unit > execute() { - for (uint32_t i{0}; i < qdepth; ++i) { - run_task(); - } - return comp_promise.getFuture(); - } - - void next_task() { - auto ptasks = pending_tasks.fetch_sub(1) - 1; - if ((issued_tasks.load() < total_tasks)) { - run_task(); - } else if (ptasks == 0) { - comp_promise.setValue(); - } - } - - void run_task() { - ++issued_tasks; - 
++pending_tasks; - iomanager.run_on_forget(iomgr::reactor_regex::random_worker, task); - } -}; - -struct Waiter { - std::atomic< uint64_t > expected_comp{0}; - std::atomic< uint64_t > actual_comp{0}; - folly::Promise< folly::Unit > comp_promise; - - Waiter(uint64_t num_op) : expected_comp{num_op} {} - Waiter() : Waiter{SISL_OPTIONS["num_io"].as< uint64_t >()} {} - - folly::Future< folly::Unit > start(std::function< void(void) > f) { - f(); - return comp_promise.getFuture(); - } - - void one_complete() { - if ((actual_comp.fetch_add(1) + 1) >= expected_comp.load()) { comp_promise.setValue(); } - } -}; - struct test_repl_req : public repl_req_ctx { sisl::byte_array header; sisl::byte_array key; @@ -174,7 +117,7 @@ class SoloReplDevTest : public testing::Test { void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override {} - blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, cintrusive< repl_req_ctx >& ctx) override { + blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { return blk_alloc_hints{}; } @@ -191,16 +134,16 @@ class SoloReplDevTest : public testing::Test { repl_impl_type get_impl_type() const override { return repl_impl_type::solo; } bool need_timeline_consistency() const { return true; } - std::unique_ptr< ReplDevListener > create_repl_dev_listener(uuid_t) override { - return std::make_unique< Listener >(m_test); + shared< ReplDevListener > create_repl_dev_listener(uuid_t) override { + return std::make_shared< Listener >(m_test); } - std::string lookup_peer(uuid_t uuid) const override { return std::string(""); } - uint16_t lookup_port() const override { return 0; } + std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); } + replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); } }; protected: - Runner m_io_runner; - Waiter m_task_waiter; + test_common::Runner 
m_io_runner; + test_common::Waiter m_task_waiter; shared< ReplDev > m_repl_dev1; shared< ReplDev > m_repl_dev2; uuid_t m_uuid1; @@ -256,7 +199,7 @@ class SoloReplDevTest : public testing::Test { } if (data_size != 0) { - req->write_sgs = HSTestHelper::create_sgs(data_size, g_block_size, max_size_per_iov, hdr->data_pattern); + req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern); } auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; @@ -274,7 +217,7 @@ class SoloReplDevTest : public testing::Test { uint32_t size = blkids.blk_count() * g_block_size; if (size) { - auto read_sgs = HSTestHelper::create_sgs(size, g_block_size, size); + auto read_sgs = HSTestHelper::create_sgs(size, size); LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); rdev.async_read(blkids, read_sgs, size) @@ -298,7 +241,7 @@ class SoloReplDevTest : public testing::Test { void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) { // If we did send some data to the repl_dev, validate it by doing async_read if (req->write_sgs.size != 0) { - req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, g_block_size, req->write_sgs.size); + req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size); auto const cap = hs()->repl_service().get_cap_stats(); LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); @@ -353,8 +296,6 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) { } SISL_OPTION_GROUP(test_solo_repl_dev, - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"), - "number"), (block_size, "", "block_size", "block size to io", ::cxxopts::value< uint32_t >()->default_value("4096"), "number")); From d7fbe53b264cd12154a0e217cee94ed655393269 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Thu, 28 Dec 2023 19:31:40 -0700 Subject: [PATCH 6/9] bump verison since we have an important 
new feature now, we need to update homestore verion --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 20dc6a996..18387456c 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.10.1" + version = "5.0.1" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" topics = ("ebay", "nublox") From 8e654294576d6f764e5f024d7be7e13f213b6960 Mon Sep 17 00:00:00 2001 From: Brian Szmyd Date: Fri, 29 Dec 2023 17:55:10 +0000 Subject: [PATCH 7/9] Add Nuraft cache. --- .github/workflows/build_commit.yml | 14 ++++++++++- .github/workflows/build_dependencies.yml | 30 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_commit.yml b/.github/workflows/build_commit.yml index 761d6687b..36ebd4575 100644 --- a/.github/workflows/build_commit.yml +++ b/.github/workflows/build_commit.yml @@ -31,6 +31,18 @@ jobs: tooling: None if: ${{ github.event_name != 'pull_request' }} + NuraftMesgDeps: + needs: SislDeps + uses: eBay/nuraft_mesg/.github/workflows/build_dependencies.yml@main + with: + branch: main + platform: ${{ inputs.platform }} + build-type: ${{ inputs.build-type }} + malloc-impl: ${{ inputs.malloc-impl }} + prerelease: ${{ inputs.prerelease }} + tooling: None + if: ${{ github.event_name != 'pull_request' }} + IOMgrDeps: needs: SislDeps uses: eBay/iomanager/.github/workflows/build_dependencies.yml@master @@ -44,7 +56,7 @@ jobs: if: ${{ github.event_name != 'pull_request' }} HomestoreDeps: - needs: IOMgrDeps + needs: [IOMgrDeps, NuraftMesgDeps] uses: ./.github/workflows/build_dependencies.yml with: branch: ${{ github.ref }} diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 4ffac8419..ad8c87aeb 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -119,6 
+119,14 @@ jobs: ref: master if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Retrieve Dependencies + uses: actions/checkout@v3 + with: + repository: eBay/nuraft_mesg + path: import/nuraft_mesg + ref: main + if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Load IOMgr Cache uses: eBay/sisl/.github/actions/load_conan@stable/v8.x with: @@ -128,6 +136,15 @@ jobs: fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Load NuraftMesg Cache + uses: eBay/sisl/.github/actions/load_conan@stable/v8.x + with: + testing: 'False' + path: import/nuraft_mesg + key_prefix: NuraftMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + fail_on_cache_miss: true + if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Setup Conan uses: eBay/sisl/.github/actions/setup_conan@stable/v8.x with: @@ -139,6 +156,7 @@ jobs: sudo apt-get install -y python3-pyelftools libaio-dev python -m pip install pyelftools conan export import/iomgr oss/master + conan export import/nuraft_mesg oss/main cached_pkgs=$(ls -1d ~/.conan/data/*/*/*/*/package | sed 's,.*data/,,' | cut -d'/' -f1,2 | paste -sd',' - -) echo "::info:: Pre-cached: ${cached_pkgs}" if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} @@ -150,6 +168,7 @@ jobs: -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ -o sisl:testing=False \ -o iomgr:testing=off \ + -o nuraft_mesg:testing=False \ -o testing=off \ -s build_type=${{ inputs.build-type }} \ --build missing \ @@ -178,6 +197,15 @@ jobs: fail_on_cache_miss: true if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Reload NuraftMesg Cache + uses: eBay/sisl/.github/actions/load_conan@stable/v8.x + with: + testing: 'False' 
+ path: import/nuraft_mesg + key_prefix: NuraftMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + fail_on_cache_miss: true + if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Create and Test Package run: | sanitize=$([[ "${{ inputs.tooling }}" == "Sanitize" ]] && echo "True" || echo "False") @@ -186,6 +214,7 @@ jobs: -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ -o sisl:testing=False \ -o iomgr:testing=off \ + -o nuraft_mesg:testing=False \ -o homestore:sanitize=${sanitize} \ -s build_type=${{ inputs.build-type }} \ --build missing \ @@ -199,6 +228,7 @@ jobs: -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ -o sisl:testing=False \ -o iomgr:testing=off \ + -o nuraft_mesg:testing=False \ -o coverage=True \ -s build_type=${{ inputs.build-type }} \ --build missing \ From 43d5e1fa352a25b1e5bd66cd64234df0aa35c2dd Mon Sep 17 00:00:00 2001 From: Brian Szmyd Date: Fri, 29 Dec 2023 18:00:44 +0000 Subject: [PATCH 8/9] Fix cache prefix --- .github/workflows/build_dependencies.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index ad8c87aeb..be97712f1 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -141,7 +141,7 @@ jobs: with: testing: 'False' path: import/nuraft_mesg - key_prefix: NuraftMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + key_prefix: NuMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} @@ -202,7 +202,7 @@ jobs: with: testing: 'False' path: import/nuraft_mesg - key_prefix: NuraftMesgDeps-${{ inputs.platform }}-${{ inputs.build-type 
}}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + key_prefix: NuMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} fail_on_cache_miss: true if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} From 65ca351b3c46b42ccb0f0d399205e0e60957ee06 Mon Sep 17 00:00:00 2001 From: Brian Szmyd Date: Wed, 3 Jan 2024 18:10:08 +0000 Subject: [PATCH 9/9] Fix release linking. --- src/CMakeLists.txt | 7 +++++-- src/include/homestore/homestore_decl.hpp | 6 +++--- src/lib/logging.cpp | 4 ++++ src/lib/replication/service/generic_repl_svc.cpp | 3 ++- src/tests/CMakeLists.txt | 1 + src/tests/index_btree_benchmark.cpp | 2 +- src/tests/log_dev_benchmark.cpp | 2 +- src/tests/log_store_benchmark.cpp | 2 +- src/tests/test_append_blkalloc.cpp | 2 +- src/tests/test_blk_cache_queue.cpp | 2 +- src/tests/test_blk_read_tracker.cpp | 2 +- src/tests/test_blkalloc.cpp | 2 +- src/tests/test_blkid.cpp | 4 ++-- src/tests/test_btree_node.cpp | 3 ++- src/tests/test_cp_mgr.cpp | 2 +- src/tests/test_data_service.cpp | 2 +- src/tests/test_device_manager.cpp | 2 +- src/tests/test_home_raft_logstore.cpp | 2 +- src/tests/test_index_btree.cpp | 2 +- src/tests/test_journal_vdev.cpp | 2 +- src/tests/test_log_dev.cpp | 2 +- src/tests/test_log_store.cpp | 2 +- src/tests/test_mem_btree.cpp | 1 + src/tests/test_meta_blk_mgr.cpp | 2 +- src/tests/test_pdev.cpp | 2 +- src/tests/test_raft_repl_dev.cpp | 6 +++--- src/tests/test_solo_repl_dev.cpp | 2 +- 27 files changed, 41 insertions(+), 30 deletions(-) create mode 100644 src/lib/logging.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6e1b0766a..651c79796 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,9 +13,11 @@ find_package(NuraftMesg QUIET REQUIRED) list(APPEND COMMON_DEPS iomgr::iomgr farmhash::farmhash - sisl::sisl - nuraft::nuraft + -Wl,--whole-archive NuraftMesg::proto + -Wl,--no-whole-archive + 
nuraft::nuraft + sisl::sisl ) if (${isa-l_FOUND}) list(APPEND COMMON_DEPS isa-l::isa-l) @@ -57,6 +59,7 @@ set(HOMESTORE_OBJECTS $ lib/homestore.cpp lib/crc.cpp + lib/logging.cpp #$ #$ ) diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 3db07a924..1935ccd35 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -196,8 +196,8 @@ struct cap_attrs { }; #endif +} // namespace homestore + ////////////// Misc /////////////////// #define HOMESTORE_LOG_MODS \ - btree, device, blkalloc, cp, metablk, wbcache, logstore, transient, replication, nuraft - -} // namespace homestore + btree, device, blkalloc, cp, metablk, wbcache, logstore, transient, replication diff --git a/src/lib/logging.cpp b/src/lib/logging.cpp new file mode 100644 index 000000000..2ba71ef04 --- /dev/null +++ b/src/lib/logging.cpp @@ -0,0 +1,4 @@ +#include +#include + +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp index 91f81c333..d169d4ce2 100644 --- a/src/lib/replication/service/generic_repl_svc.cpp +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -119,7 +119,8 @@ void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cooki { std::unique_lock lg(m_rd_map_mtx); - auto [it, happened] = m_rd_map.emplace(group_id, rdev); + auto [_, happened] = m_rd_map.emplace(group_id, rdev); + (void) happened; HS_DBG_ASSERT(happened, "Unable to put the repl_dev in rd map for group_id={}", group_id); } } diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 511118934..708aa161b 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -37,6 +37,7 @@ if (${build_nonio_tests}) add_executable(test_mem_btree ${TEST_MEMBTREE_SOURCE_FILES}) target_link_libraries(test_mem_btree ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME MemBtree COMMAND test_mem_btree) + 
set_tests_properties(MemBtree PROPERTIES TIMEOUT 180) add_executable(test_blk_read_tracker) target_sources(test_blk_read_tracker PRIVATE test_blk_read_tracker.cpp ../lib/blkdata_svc/blk_read_tracker.cpp ../lib/blkalloc/blk.cpp) diff --git a/src/tests/index_btree_benchmark.cpp b/src/tests/index_btree_benchmark.cpp index b1b100334..d36bea643 100644 --- a/src/tests/index_btree_benchmark.cpp +++ b/src/tests/index_btree_benchmark.cpp @@ -42,7 +42,7 @@ void* globle_helper{nullptr}; #define GET_BENCHMARK_HELPER(BTREE_TYPE) static_cast< IndexBtreeBenchmark< BTREE_TYPE >* >(globle_helper) -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) std::vector< std::string > test_common::HSTestHelper::s_dev_names; SISL_OPTIONS_ENABLE(logging, index_btree_benchmark, iomgr, test_common_setup) diff --git a/src/tests/log_dev_benchmark.cpp b/src/tests/log_dev_benchmark.cpp index 107f5c08f..a16b616e7 100644 --- a/src/tests/log_dev_benchmark.cpp +++ b/src/tests/log_dev_benchmark.cpp @@ -28,7 +28,7 @@ #include "logstore/log_dev.hpp" -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) RCU_REGISTER_INIT static constexpr size_t ITERATIONS{100000}; diff --git a/src/tests/log_store_benchmark.cpp b/src/tests/log_store_benchmark.cpp index 885a1474f..b8d43e6ec 100644 --- a/src/tests/log_store_benchmark.cpp +++ b/src/tests/log_store_benchmark.cpp @@ -35,7 +35,7 @@ #include "test_common/homestore_test_common.hpp" using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) std::vector< std::string > test_common::HSTestHelper::s_dev_names; SISL_OPTIONS_ENABLE(logging, log_store_benchmark, iomgr, test_common_setup) diff --git a/src/tests/test_append_blkalloc.cpp b/src/tests/test_append_blkalloc.cpp index 22ff9975c..8deca826e 100644 --- a/src/tests/test_append_blkalloc.cpp +++ b/src/tests/test_append_blkalloc.cpp @@ -44,7 +44,7 @@ 
//////////////////////////////////////////////////////////////////////////////////////////////////// RCU_REGISTER_INIT -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_append_blkalloc, iomgr, test_common_setup) SISL_LOGGING_DECL(test_append_blkalloc) diff --git a/src/tests/test_blk_cache_queue.cpp b/src/tests/test_blk_cache_queue.cpp index ab2bf5818..840c921af 100644 --- a/src/tests/test_blk_cache_queue.cpp +++ b/src/tests/test_blk_cache_queue.cpp @@ -28,7 +28,7 @@ #include "blkalloc/varsize_blk_allocator.h" #include "blkalloc/blk_cache_queue.h" -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) using namespace homestore; diff --git a/src/tests/test_blk_read_tracker.cpp b/src/tests/test_blk_read_tracker.cpp index 7d9eef662..dec5d1e4f 100644 --- a/src/tests/test_blk_read_tracker.cpp +++ b/src/tests/test_blk_read_tracker.cpp @@ -25,7 +25,7 @@ using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker) VENUM(op_type_t, uint8_t, insert = 0, remove = 1, wait_on = 2, max_op = 3); diff --git a/src/tests/test_blkalloc.cpp b/src/tests/test_blkalloc.cpp index 6f0040556..0d6204022 100644 --- a/src/tests/test_blkalloc.cpp +++ b/src/tests/test_blkalloc.cpp @@ -41,7 +41,7 @@ #include "blkalloc/fixed_blk_allocator.h" #include "blkalloc/varsize_blk_allocator.h" -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) using namespace homestore; diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp index e1d5a9358..0123232be 100644 --- a/src/tests/test_blkid.cpp +++ b/src/tests/test_blkid.cpp @@ -7,7 +7,7 @@ #include -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_blkid) SISL_OPTION_GROUP(test_blkid, @@ -175,4 +175,4 @@ int main(int argc, char* argv[]) { spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); 
return RUN_ALL_TESTS(); -} \ No newline at end of file +} diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 0af03fa95..1ff602f19 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -28,7 +28,8 @@ static constexpr uint32_t g_max_keys{6000}; static std::uniform_int_distribution< uint32_t > g_randkey_generator{0, g_max_keys - 1}; using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(btree) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) struct FixedLenNodeTest { using NodeType = SimpleNode< TestFixedKey, TestFixedValue >; diff --git a/src/tests/test_cp_mgr.cpp b/src/tests/test_cp_mgr.cpp index bd5c8811e..33a0c77d4 100644 --- a/src/tests/test_cp_mgr.cpp +++ b/src/tests/test_cp_mgr.cpp @@ -27,7 +27,7 @@ using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_cp_mgr, iomgr, test_common_setup) SISL_LOGGING_DECL(test_cp_mgr) diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index db44d0867..5af59445f 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -49,7 +49,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_data_service, iomgr, test_common_setup) SISL_LOGGING_DECL(test_data_service) diff --git a/src/tests/test_device_manager.cpp b/src/tests/test_device_manager.cpp index b9801423e..13bef9d6b 100644 --- a/src/tests/test_device_manager.cpp +++ b/src/tests/test_device_manager.cpp @@ -35,7 +35,7 @@ #include "device/virtual_dev.hpp" using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_device_manager, iomgr) SISL_OPTION_GROUP(test_device_manager, diff --git a/src/tests/test_home_raft_logstore.cpp b/src/tests/test_home_raft_logstore.cpp index fdb6759d3..d9c9df4c5 
100644 --- a/src/tests/test_home_raft_logstore.cpp +++ b/src/tests/test_home_raft_logstore.cpp @@ -10,7 +10,7 @@ using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) static constexpr uint32_t g_max_logsize{512}; static std::random_device g_rd{}; diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 3d4193424..1e833b059 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -27,7 +27,7 @@ using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_index_btree, iomgr, test_common_setup) SISL_LOGGING_DECL(test_index_btree) diff --git a/src/tests/test_journal_vdev.cpp b/src/tests/test_journal_vdev.cpp index e146144c1..693eb0925 100644 --- a/src/tests/test_journal_vdev.cpp +++ b/src/tests/test_journal_vdev.cpp @@ -40,7 +40,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_vdev, iomgr, test_common_setup) std::vector< std::string > test_common::HSTestHelper::s_dev_names; diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index 9738d8efc..b41a5f9a5 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -31,7 +31,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) std::vector< logdev_key > s_logdev_keys; static uint64_t first_offset{~static_cast< uint64_t >(0)}; diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index dfa916c19..656092105 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -54,7 +54,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) std::vector< std::string > test_common::HSTestHelper::s_dev_names; struct test_log_data { diff 
--git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 75aaece01..f6df10d0e 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -31,6 +31,7 @@ using namespace homestore; SISL_LOGGING_DEF(btree) +SISL_LOGGING_INIT(btree) SISL_OPTIONS_ENABLE(logging, test_mem_btree) SISL_OPTION_GROUP( diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index 1f7c0262d..559fd3eef 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -45,7 +45,7 @@ using namespace homestore; RCU_REGISTER_INIT -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) std::vector< std::string > test_common::HSTestHelper::s_dev_names; SISL_OPTIONS_ENABLE(logging, test_meta_blk_mgr, iomgr, test_common_setup) diff --git a/src/tests/test_pdev.cpp b/src/tests/test_pdev.cpp index fb5629041..d5670abaf 100644 --- a/src/tests/test_pdev.cpp +++ b/src/tests/test_pdev.cpp @@ -34,7 +34,7 @@ #include "device/physical_dev.hpp" using namespace homestore; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_pdev, iomgr) SISL_OPTION_GROUP(test_pdev, diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp index 36542f6f4..b42f04d94 100644 --- a/src/tests/test_raft_repl_dev.cpp +++ b/src/tests/test_raft_repl_dev.cpp @@ -40,8 +40,8 @@ using namespace homestore; -SISL_LOGGING_DECL(test_raft_repl_dev) -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_DEF(test_raft_repl_dev) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTION_GROUP(test_raft_repl_dev, (block_size, "", "block_size", "block size to io", @@ -262,4 +262,4 @@ int main(int argc, char* argv[]) { auto ret = RUN_ALL_TESTS(); g_helper->teardown(); return ret; -} \ No newline at end of file +} diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index 1c47801e9..492a8006a 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ 
b/src/tests/test_solo_repl_dev.cpp @@ -47,7 +47,7 @@ using namespace homestore; using namespace test_common; -SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_solo_repl_dev, iomgr, test_common_setup) SISL_LOGGING_DECL(test_solo_repl_dev)