diff --git a/.github/workflows/build_commit.yml b/.github/workflows/build_commit.yml index 6d5870aa3..36ebd4575 100644 --- a/.github/workflows/build_commit.yml +++ b/.github/workflows/build_commit.yml @@ -20,10 +20,22 @@ on: type: string jobs: - Sisl10Deps: - uses: eBay/sisl/.github/workflows/build_dependencies.yml@stable/v10.x + SislDeps: + uses: eBay/sisl/.github/workflows/build_dependencies.yml@master with: - branch: stable/v10.x + branch: master + platform: ${{ inputs.platform }} + build-type: ${{ inputs.build-type }} + malloc-impl: ${{ inputs.malloc-impl }} + prerelease: ${{ inputs.prerelease }} + tooling: None + if: ${{ github.event_name != 'pull_request' }} + + NuraftMesgDeps: + needs: SislDeps + uses: eBay/nuraft_mesg/.github/workflows/build_dependencies.yml@main + with: + branch: main platform: ${{ inputs.platform }} build-type: ${{ inputs.build-type }} malloc-impl: ${{ inputs.malloc-impl }} @@ -32,7 +44,7 @@ jobs: if: ${{ github.event_name != 'pull_request' }} IOMgrDeps: - needs: Sisl10Deps + needs: SislDeps uses: eBay/iomanager/.github/workflows/build_dependencies.yml@master with: branch: master @@ -44,7 +56,7 @@ jobs: if: ${{ github.event_name != 'pull_request' }} HomestoreDeps: - needs: IOMgrDeps + needs: [IOMgrDeps, NuraftMesgDeps] uses: ./.github/workflows/build_dependencies.yml with: branch: ${{ github.ref }} diff --git a/.github/workflows/build_dependencies.yml b/.github/workflows/build_dependencies.yml index 5d74d5250..be97712f1 100644 --- a/.github/workflows/build_dependencies.yml +++ b/.github/workflows/build_dependencies.yml @@ -108,7 +108,7 @@ jobs: uses: eBay/sisl/.github/actions/load_conan@stable/v8.x with: load_any: 'True' - key_prefix: Sisl10Deps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + key_prefix: SislDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} - name: Retrieve Dependencies @@ -119,6 +119,14 @@ jobs: ref: master if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Retrieve Dependencies + uses: actions/checkout@v3 + with: + repository: eBay/nuraft_mesg + path: import/nuraft_mesg + ref: main + if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Load IOMgr Cache uses: eBay/sisl/.github/actions/load_conan@stable/v8.x with: @@ -128,6 +136,15 @@ jobs: fail_on_cache_miss: true if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Load NuraftMesg Cache + uses: eBay/sisl/.github/actions/load_conan@stable/v8.x + with: + testing: 'False' + path: import/nuraft_mesg + key_prefix: NuMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + fail_on_cache_miss: true + if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Setup Conan uses: eBay/sisl/.github/actions/setup_conan@stable/v8.x with: @@ -139,6 +156,7 @@ jobs: sudo apt-get install -y python3-pyelftools libaio-dev python -m pip install pyelftools conan export import/iomgr oss/master + conan export import/nuraft_mesg oss/main cached_pkgs=$(ls -1d ~/.conan/data/*/*/*/*/package | sed 's,.*data/,,' | cut -d'/' -f1,2 | paste -sd',' - -) echo "::info:: Pre-cached: ${cached_pkgs}" if: ${{ inputs.testing == 'True' || steps.restore-cache.outputs.cache-hit != 'true' }} @@ -150,6 +168,7 @@ jobs: -o 
sisl:malloc_impl=${{ inputs.malloc-impl }} \ -o sisl:testing=False \ -o iomgr:testing=off \ + -o nuraft_mesg:testing=False \ -o testing=off \ -s build_type=${{ inputs.build-type }} \ --build missing \ @@ -166,7 +185,7 @@ jobs: uses: eBay/sisl/.github/actions/load_conan@stable/v8.x with: load_any: 'True' - key_prefix: Sisl10Deps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + key_prefix: SislDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} - name: Reload IOMgr Cache @@ -178,6 +197,15 @@ jobs: fail_on_cache_miss: true if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Reload NuraftMesg Cache + uses: eBay/sisl/.github/actions/load_conan@stable/v8.x + with: + testing: 'False' + path: import/nuraft_mesg + key_prefix: NuMesgDeps-${{ inputs.platform }}-${{ inputs.build-type }}-${{ inputs.malloc-impl }}-${{ inputs.prerelease }} + fail_on_cache_miss: true + if: ${{ inputs.testing == 'True' && github.event_name != 'pull_request' && steps.restore-cache.outputs.cache-hit != 'true' }} + - name: Create and Test Package run: | sanitize=$([[ "${{ inputs.tooling }}" == "Sanitize" ]] && echo "True" || echo "False") @@ -186,6 +214,7 @@ jobs: -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ -o sisl:testing=False \ -o iomgr:testing=off \ + -o nuraft_mesg:testing=False \ -o homestore:sanitize=${sanitize} \ -s build_type=${{ inputs.build-type }} \ --build missing \ @@ -199,6 +228,7 @@ jobs: -o sisl:malloc_impl=${{ inputs.malloc-impl }} \ -o sisl:testing=False \ -o iomgr:testing=off \ + -o nuraft_mesg:testing=False \ -o coverage=True \ -s build_type=${{ inputs.build-type }} \ --build missing \ diff --git a/.gitignore b/.gitignore index 97b58ba32..816ffced6 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,4 @@ cmake-*/** # Visual Studio CMakeSettings.json .vs/** +.cache diff --git a/conanfile.py b/conanfile.py index 84d48f86e..2f2eb1d15 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,8 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.9.4" - + version = "5.0.2" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" topics = ("ebay", "nublox") @@ -55,11 +54,13 @@ def build_requirements(self): self.build_requires("gtest/1.14.0") def requirements(self): - self.requires("iomgr/[~=10, include_prerelease=True]@oss/master") - self.requires("sisl/[>=10.3]") + self.requires("iomgr/[~=11, include_prerelease=True]@oss/master") + self.requires("sisl/[~=11, include_prerelease=True]@oss/master") + self.requires("nuraft_mesg/[~=2, include_prerelease=True]@oss/main") self.requires("farmhash/cci.20190513@") - self.requires("isa-l/2.30.0") + if self.settings.arch in ['x86', 'x86_64']: + self.requires("isa-l/2.30.0") self.requires("spdk/21.07.y") def build(self): diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 30329c54e..651c79796 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -3,25 +3,34 @@ cmake_minimum_required(VERSION 3.13) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) find_package(Threads) -find_library(LIB_AIO aio REQUIRED) -find_package(isa-l REQUIRED) -find_package(iomgr REQUIRED) -find_package(farmhash REQUIRED) -find_package(GTest REQUIRED) - -set (COMMON_DEPS - iomgr::iomgr - farmhash::farmhash - isa-l::isa-l - sisl::sisl - ) - 
-set(COMMON_TEST_DEPS - ${COMMON_DEPS} - ${spdk_LIBRARY_LIST} - ${dpdk_LIBRARY_LIST} - GTest::gmock - ) +find_library(LIB_AIO aio QUIET REQUIRED) +find_package(isa-l QUIET) +find_package(iomgr QUIET REQUIRED) +find_package(farmhash QUIET REQUIRED) +find_package(GTest QUIET REQUIRED) +find_package(NuraftMesg QUIET REQUIRED) + +list(APPEND COMMON_DEPS + iomgr::iomgr + farmhash::farmhash + -Wl,--whole-archive + NuraftMesg::proto + -Wl,--no-whole-archive + nuraft::nuraft + sisl::sisl +) +if (${isa-l_FOUND}) + list(APPEND COMMON_DEPS isa-l::isa-l) +else () + add_flags("-DNO_ISAL") +endif() + +list(APPEND COMMON_TEST_DEPS + ${COMMON_DEPS} + ${spdk_LIBRARY_LIST} + ${dpdk_LIBRARY_LIST} + GTest::gmock +) include_directories (BEFORE lib/) include_directories (BEFORE include/) @@ -49,6 +58,8 @@ set(HOMESTORE_OBJECTS $ $ lib/homestore.cpp + lib/crc.cpp + lib/logging.cpp #$ #$ ) diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index fdcaee7d7..eea67e6d0 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -133,6 +133,7 @@ struct MultiBlkId : public BlkId { BlkId to_single_blkid() const; static uint32_t expected_serialized_size(uint16_t num_pieces); + static uint32_t max_serialized_size(); static int compare(MultiBlkId const& one, MultiBlkId const& two); struct iterator { diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index da0d5d403..1c5692642 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -99,7 +99,7 @@ class BlkDataService { * @return A Future that will resolve to an error code indicating the result of the write operation. */ folly::Future< std::error_code > async_write(const char* buf, uint32_t size, MultiBlkId const& bid, - bool part_of_batch); + bool part_of_batch = false); /** * @brief : asynchronous write with input block ids; * @@ -171,6 +171,13 @@ class BlkDataService { */ uint32_t get_blk_size() const { return m_blk_size; } + /** + * @brief : get the blk size of this data service; + * + * @return : blk size + */ + uint32_t get_align_size() const; + /** * @brief : get the read block tracker handle; * diff --git a/src/include/homestore/btree/btree_kv.hpp b/src/include/homestore/btree/btree_kv.hpp index 18dd832a8..c995a7cc9 100644 --- a/src/include/homestore/btree/btree_kv.hpp +++ b/src/include/homestore/btree/btree_kv.hpp @@ -256,8 +256,8 @@ class BtreeLinkInfo : public BtreeValue { sisl::blob serialize() const override { sisl::blob b; - b.size = sizeof(bnode_link_info); - b.bytes = uintptr_cast(const_cast< bnode_link_info* >(&info)); + b.set_size(sizeof(bnode_link_info)); + b.set_bytes(r_cast< const uint8_t* >(&info)); return b; } uint32_t serialized_size() const override { return sizeof(bnode_link_info); } @@ -265,8 +265,8 @@ class BtreeLinkInfo : public BtreeValue { std::string to_string() const override { return fmt::format("{}.{}", info.m_bnodeid, info.m_link_version); } void deserialize(const sisl::blob& b, bool copy) override { - DEBUG_ASSERT_EQ(b.size, sizeof(bnode_link_info), "BtreeLinkInfo deserialize received invalid blob"); - auto other = r_cast< bnode_link_info* >(b.bytes); + DEBUG_ASSERT_EQ(b.size(), sizeof(bnode_link_info), "BtreeLinkInfo deserialize received invalid blob"); + auto other = r_cast< bnode_link_info const* >(b.cbytes()); set_bnode_id(other->m_bnodeid); set_link_version(other->m_link_version); } diff --git a/src/include/homestore/btree/detail/btree_node.hpp b/src/include/homestore/btree/detail/btree_node.hpp 
index 8f713e534..f4e8aa8eb 100644 --- a/src/include/homestore/btree/detail/btree_node.hpp +++ b/src/include/homestore/btree/detail/btree_node.hpp @@ -24,8 +24,7 @@ #include #include "btree_internal.hpp" #include -// #include -#include +#include namespace homestore { ENUM(locktype_t, uint8_t, NONE, READ, WRITE) @@ -273,9 +272,9 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > { virtual BtreeLinkInfo get_edge_value() const { return BtreeLinkInfo{edge_id(), edge_link_version()}; } virtual void set_edge_value(const BtreeValue& v) { - const auto b = v.serialize(); - auto l = r_cast< BtreeLinkInfo::bnode_link_info* >(b.bytes); - DEBUG_ASSERT_EQ(b.size, sizeof(BtreeLinkInfo::bnode_link_info)); + auto const b = v.serialize(); + auto const l = r_cast< BtreeLinkInfo::bnode_link_info const* >(b.cbytes()); + DEBUG_ASSERT_EQ(b.size(), sizeof(BtreeLinkInfo::bnode_link_info)); set_edge_info(*l); } diff --git a/src/include/homestore/btree/detail/prefix_node.hpp b/src/include/homestore/btree/detail/prefix_node.hpp index a72b13adb..c0ee8844e 100644 --- a/src/include/homestore/btree/detail/prefix_node.hpp +++ b/src/include/homestore/btree/detail/prefix_node.hpp @@ -79,21 +79,20 @@ class FixedPrefixNode : public VariantNode< K, V > { sisl::blob const kblob = s_cast< K const& >(key).serialize_prefix(); sisl::blob const vblob = s_cast< V const& >(val).serialize_prefix(); - DEBUG_ASSERT_EQ(kblob.size, key_size(), "Prefix key size mismatch with serialized prefix size"); - DEBUG_ASSERT_EQ(vblob.size, value_size(), "Prefix value size mismatch with serialized prefix size"); + DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Prefix key size mismatch with serialized prefix size"); + DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Prefix value size mismatch with serialized prefix size"); uint8_t* cur_ptr = uintptr_cast(this) + sizeof(prefix_entry); - std::memcpy(cur_ptr, kblob.bytes, kblob.size); - cur_ptr += kblob.size; - std::memcpy(cur_ptr, vblob.bytes, vblob.size); + std::memcpy(cur_ptr, kblob.cbytes(), kblob.size()); + cur_ptr += kblob.size(); + std::memcpy(cur_ptr, vblob.cbytes(), vblob.size()); } } sisl::blob key_buf() const { - return sisl::blob{const_cast< uint8_t* >(r_cast< uint8_t const* >(this) + sizeof(prefix_entry)), - key_size()}; + return sisl::blob{r_cast< uint8_t const* >(this) + sizeof(prefix_entry), key_size()}; } - sisl::blob val_buf() const { return sisl::blob{key_buf().bytes + key_buf().size, value_size()}; } + sisl::blob val_buf() const { return sisl::blob{key_buf().cbytes() + key_buf().size(), value_size()}; } }; struct suffix_entry { @@ -131,19 +130,19 @@ class FixedPrefixNode : public VariantNode< K, V > { kblob = key.serialize(); vblob = val.serialize(); } - DEBUG_ASSERT_EQ(kblob.size, key_size(), "Suffix key size mismatch with serialized suffix size"); - DEBUG_ASSERT_EQ(vblob.size, value_size(), "Suffix value size mismatch with serialized suffix size"); + DEBUG_ASSERT_EQ(kblob.size(), key_size(), "Suffix key size mismatch with serialized suffix size"); + DEBUG_ASSERT_EQ(vblob.size(), value_size(), "Suffix value size mismatch with serialized suffix size"); - std::memcpy(cur_ptr, kblob.bytes, kblob.size); - cur_ptr += kblob.size; - std::memcpy(cur_ptr, vblob.bytes, vblob.size); + std::memcpy(cur_ptr, kblob.cbytes(), kblob.size()); + cur_ptr += kblob.size(); + std::memcpy(cur_ptr, vblob.cbytes(), vblob.size()); } sisl::blob key_buf() const { return sisl::blob{const_cast< uint8_t* >(r_cast< uint8_t const* >(this) + sizeof(suffix_entry)), key_size()}; } - sisl::blob val_buf() const { return 
sisl::blob{key_buf().bytes + key_buf().size, value_size()}; } + sisl::blob val_buf() const { return sisl::blob{key_buf().bytes() + key_buf().size(), value_size()}; } }; #pragma pack() @@ -778,7 +777,7 @@ class FixedPrefixNode : public VariantNode< K, V > { K prevKey; while (i < this->total_entries()) { K key = BtreeNode::get_nth_key< K >(i, false); - uint64_t kp = *(uint64_t*)key.serialize().bytes; + uint64_t kp = *(uint64_t*)key.serialize().bytes(); if (i > 0 && prevKey.compare(key) > 0) { DEBUG_ASSERT(false, "Found non sorted entry: {} -> {}", kp, to_string()); } diff --git a/src/include/homestore/btree/detail/simple_node.hpp b/src/include/homestore/btree/detail/simple_node.hpp index b9f29d7b9..fd01a5560 100644 --- a/src/include/homestore/btree/detail/simple_node.hpp +++ b/src/include/homestore/btree/detail/simple_node.hpp @@ -183,9 +183,7 @@ class SimpleNode : public VariantNode< K, V > { void get_nth_key_internal(uint32_t ind, BtreeKey& out_key, bool copy) const override { DEBUG_ASSERT_LT(ind, this->total_entries(), "node={}", to_string()); - sisl::blob b; - b.bytes = (uint8_t*)(this->node_data_area_const() + (get_nth_obj_size(ind) * ind)); - b.size = get_nth_key_size(ind); + sisl::blob b{this->node_data_area_const() + (get_nth_obj_size(ind) * ind), get_nth_key_size(ind)}; out_key.deserialize(b, copy); } @@ -322,11 +320,11 @@ class SimpleNode : public VariantNode< K, V > { set_nth_value(ind, v); } else { uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind); - sisl::blob key_blob = k.serialize(); - memcpy((void*)entry, key_blob.bytes, key_blob.size); + sisl::blob const key_blob = k.serialize(); + memcpy((void*)entry, key_blob.cbytes(), key_blob.size()); - sisl::blob val_blob = v.serialize(); - memcpy((void*)(entry + key_blob.size), val_blob.bytes, val_blob.size); + sisl::blob const val_blob = v.serialize(); + memcpy((void*)(entry + key_blob.size()), val_blob.cbytes(), val_blob.size()); } } @@ -343,20 +341,20 @@ class SimpleNode : public VariantNode< K, V > { void set_nth_key(uint32_t ind, BtreeKey* key) { uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind); - sisl::blob b = key->serialize(); - memcpy(entry, b.bytes, b.size); + sisl::blob const b = key->serialize(); + memcpy(entry, b.cbytes(), b.size()); } void set_nth_value(uint32_t ind, const BtreeValue& v) { sisl::blob b = v.serialize(); if (ind >= this->total_entries()) { RELEASE_ASSERT_EQ(this->is_leaf(), false, "setting value outside bounds on leaf node"); - DEBUG_ASSERT_EQ(b.size, sizeof(BtreeLinkInfo::bnode_link_info), + DEBUG_ASSERT_EQ(b.size(), sizeof(BtreeLinkInfo::bnode_link_info), "Invalid value size being set for non-leaf node"); - this->set_edge_info(*r_cast< BtreeLinkInfo::bnode_link_info* >(b.bytes)); + this->set_edge_info(*r_cast< BtreeLinkInfo::bnode_link_info const* >(b.cbytes())); } else { uint8_t* entry = this->node_data_area() + (get_nth_obj_size(ind) * ind) + get_nth_key_size(ind); - std::memcpy(entry, b.bytes, b.size); + std::memcpy(entry, b.cbytes(), b.size()); } } }; diff --git a/src/include/homestore/btree/detail/varlen_node.hpp b/src/include/homestore/btree/detail/varlen_node.hpp index 289dd8b7d..b448e8a58 100644 --- a/src/include/homestore/btree/detail/varlen_node.hpp +++ b/src/include/homestore/btree/detail/varlen_node.hpp @@ -93,7 +93,7 @@ class VariableNode : public VariantNode< K, V > { K prevKey; while (i < this->total_entries()) { K key = BtreeNode::get_nth_key< K >(i, false); - uint64_t kp = *(uint64_t*)key.serialize().bytes; + uint64_t kp = 
*(uint64_t*)key.serialize().bytes(); if (i > 0 && prevKey.compare(key) > 0) { DEBUG_ASSERT(false, "Found non sorted entry: {} -> {}", kp, to_string()); } @@ -140,16 +140,16 @@ class VariableNode : public VariantNode< K, V > { sisl::blob kblob = key.serialize(); sisl::blob vblob = val.serialize(); - DEBUG_ASSERT_EQ(kblob.size, key.serialized_size(), + DEBUG_ASSERT_EQ(kblob.size(), key.serialized_size(), "Key Serialized size returned different after serialization"); - DEBUG_ASSERT_EQ(vblob.size, val.serialized_size(), + DEBUG_ASSERT_EQ(vblob.size(), val.serialized_size(), "Value Serialized size returned different after serialization"); // we can avoid memcpy if addresses of val_ptr and vblob.bytes is same. In place update - if (key_ptr != kblob.bytes) { std::memcpy(key_ptr, kblob.bytes, kblob.size); } - if (val_ptr != vblob.bytes) { std::memcpy(val_ptr, vblob.bytes, vblob.size); } - set_nth_key_len(get_nth_record_mutable(ind), kblob.size); - set_nth_value_len(get_nth_record_mutable(ind), vblob.size); + if (key_ptr != kblob.cbytes()) { std::memcpy(key_ptr, kblob.cbytes(), kblob.size()); } + if (val_ptr != vblob.cbytes()) { std::memcpy(val_ptr, vblob.cbytes(), vblob.size()); } + set_nth_key_len(get_nth_record_mutable(ind), kblob.size()); + set_nth_value_len(get_nth_record_mutable(ind), vblob.size()); get_var_node_header()->m_available_space += cur_obj_size - new_obj_size; this->inc_gen(); } else { @@ -228,13 +228,8 @@ class VariableNode : public VariantNode< K, V > { bool full_move{false}; while (ind >= end_ind) { // Get the ith key and value blob and then remove the entry from here and insert to the other node - sisl::blob kb; - kb.bytes = (uint8_t*)get_nth_obj(ind); - kb.size = get_nth_key_size(ind); - - sisl::blob vb; - vb.bytes = kb.bytes + kb.size; - vb.size = get_nth_value_size(ind); + sisl::blob const kb{get_nth_obj(ind), get_nth_key_size(ind)}; + sisl::blob const vb{kb.cbytes() + kb.size(), get_nth_value_size(ind)}; auto sz = other.insert(0, kb, vb); if (!sz) { break; } @@ -269,15 +264,10 @@ class VariableNode : public VariantNode< K, V > { uint32_t ind = this->total_entries() - 1; while (ind > 0) { - sisl::blob kb; - kb.bytes = (uint8_t*)get_nth_obj(ind); - kb.size = get_nth_key_size(ind); - - sisl::blob vb; - vb.bytes = kb.bytes + kb.size; - vb.size = get_nth_value_size(ind); + sisl::blob const kb{get_nth_obj(ind), get_nth_key_size(ind)}; + sisl::blob const vb{kb.cbytes() + kb.size(), get_nth_value_size(ind)}; - if ((kb.size + vb.size + this->get_record_size()) > size_to_move) { + if ((kb.size() + vb.size() + this->get_record_size()) > size_to_move) { // We reached threshold of how much we could move break; } @@ -326,11 +316,11 @@ class VariableNode : public VariantNode< K, V > { auto idx = start_idx; uint32_t n = 0; while (idx < other.total_entries()) { - sisl::blob kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)}; - sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_size(idx)}; + sisl::blob const kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)}; + sisl::blob const vb{kb.cbytes() + kb.size(), other.get_nth_value_size(idx)}; // We reached threshold of how much we could move - if ((kb.size + vb.size + other.get_record_size()) > copy_size) { break; } + if ((kb.size() + vb.size() + other.get_record_size()) > copy_size) { break; } auto sz = insert(this->total_entries(), kb, vb); if (sz == 0) { break; } @@ -356,8 +346,8 @@ class VariableNode : public VariantNode< K, V > { auto idx = start_idx; uint32_t n = 0; while (n < nentries) { - sisl::blob 
kb{(uint8_t*)other.get_nth_obj(idx), other.get_nth_key_size(idx)}; - sisl::blob vb{kb.bytes + kb.size, other.get_nth_value_size(idx)}; + sisl::blob const kb{other.get_nth_obj(idx), other.get_nth_key_size(idx)}; + sisl::blob const vb{kb.cbytes() + kb.size(), other.get_nth_value_size(idx)}; auto sz = insert(this->total_entries(), kb, vb); if (sz == 0) { break; } @@ -465,8 +455,8 @@ class VariableNode : public VariantNode< K, V > { void set_nth_key(uint32_t ind, const BtreeKey& key) { const auto kb = key.serialize(); assert(ind < this->total_entries()); - assert(kb.size == get_nth_key_size(ind)); - memcpy(uintptr_cast(get_nth_obj(ind)), kb.bytes, kb.size); + assert(kb.size() == get_nth_key_size(ind)); + memcpy(uintptr_cast(get_nth_obj(ind)), kb.cbytes(), kb.size()); } bool has_room_for_put(btree_put_type put_type, uint32_t key_size, uint32_t value_size) const override { @@ -589,7 +579,7 @@ class VariableNode : public VariantNode< K, V > { assert(ind <= this->total_entries()); LOGTRACEMOD(btree, "{}:{}:{}:{}", ind, get_var_node_header()->tail_offset(), get_arena_free_space(), get_var_node_header()->available_space()); - uint16_t obj_size = key_blob.size + val_blob.size; + uint16_t obj_size = key_blob.size() + val_blob.size(); uint16_t to_insert_size = obj_size + this->get_record_size(); if (to_insert_size > get_var_node_header()->available_space()) { RELEASE_ASSERT(false, "insert failed insert size {} available size {}", to_insert_size, @@ -615,15 +605,15 @@ class VariableNode : public VariantNode< K, V > { get_var_node_header()->m_available_space -= (obj_size + this->get_record_size()); // Create a new record - set_nth_key_len(rec_ptr, key_blob.size); - set_nth_value_len(rec_ptr, val_blob.size); + set_nth_key_len(rec_ptr, key_blob.size()); + set_nth_value_len(rec_ptr, val_blob.size()); set_record_data_offset(rec_ptr, get_var_node_header()->m_tail_arena_offset); // Copy the contents of key and value in the offset uint8_t* raw_data_ptr = offset_to_ptr_mutable(get_var_node_header()->m_tail_arena_offset); - memcpy(raw_data_ptr, key_blob.bytes, key_blob.size); - raw_data_ptr += key_blob.size; - memcpy(raw_data_ptr, val_blob.bytes, val_blob.size); + memcpy(raw_data_ptr, key_blob.cbytes(), key_blob.size()); + raw_data_ptr += key_blob.size(); + memcpy(raw_data_ptr, val_blob.cbytes(), val_blob.size()); // Increment the entries and generation number this->inc_entries(); diff --git a/src/include/homestore/checkpoint/cp.hpp b/src/include/homestore/checkpoint/cp.hpp index 843db24d9..427ecf4f4 100644 --- a/src/include/homestore/checkpoint/cp.hpp +++ b/src/include/homestore/checkpoint/cp.hpp @@ -21,6 +21,7 @@ #include #include +#include #include /* diff --git a/src/include/homestore/crc.h b/src/include/homestore/crc.h new file mode 100644 index 000000000..9c21f3d6c --- /dev/null +++ b/src/include/homestore/crc.h @@ -0,0 +1,16 @@ +#pragma once + +// Only x86 and x86_64 supported by Intel Storage Acceleration library +#ifndef NO_ISAL +#include + +#else + +extern "C" { +// crc16_t10dif reference function, slow crc16 from the definition. +uint16_t crc16_t10dif(uint16_t seed, const unsigned char* buf, uint64_t len); + +// crc32_ieee reference function, slow crc32 from the definition. 
+uint32_t crc32_ieee(uint32_t seed, const unsigned char* buf, uint64_t len); +} +#endif diff --git a/src/include/homestore/homestore.hpp b/src/include/homestore/homestore.hpp index 9aae7661a..263986639 100644 --- a/src/include/homestore/homestore.hpp +++ b/src/include/homestore/homestore.hpp @@ -51,6 +51,8 @@ class HomeStore; class CPManager; class VirtualDev; class ChunkSelector; +class ReplDevListener; +class ReplApplication; using HomeStoreSafePtr = std::shared_ptr< HomeStore >; @@ -96,12 +98,6 @@ struct HS_SERVICE { } }; -VENUM(repl_impl_type, uint8_t, - server_side, // Completely homestore controlled replication - client_assisted, // Client assisting in replication - solo // For single node - no replication -); - /* * IO errors handling by homestore. * Write error :- Reason :- Disk error, space full,btree node read fail @@ -118,7 +114,7 @@ class HomeStore { std::unique_ptr< MetaBlkService > m_meta_service; std::unique_ptr< LogStoreService > m_log_service; std::unique_ptr< IndexService > m_index_service; - std::unique_ptr< ReplicationService > m_repl_service; + std::shared_ptr< ReplicationService > m_repl_service; std::unique_ptr< DeviceManager > m_dev_mgr; shared< sisl::logging::logger_t > m_periodic_logger; @@ -149,7 +145,7 @@ class HomeStore { HomeStore& with_data_service(cshared< ChunkSelector >& custom_chunk_selector = nullptr); HomeStore& with_log_service(); HomeStore& with_index_service(std::unique_ptr< IndexServiceCallbacks > cbs); - HomeStore& with_repl_data_service(repl_impl_type repl_type, + HomeStore& with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector = nullptr); bool start(const hs_input_params& input, hs_before_services_starting_cb_t svcs_starting_cb = nullptr); diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 99c6f234e..1935ccd35 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -196,9 +196,8 @@ struct cap_attrs { }; #endif +} // namespace homestore + ////////////// Misc /////////////////// #define HOMESTORE_LOG_MODS \ - btree_structures, btree_nodes, btree_generics, btree, cache, device, blkalloc, vol_io_wd, volume, flip, cp, \ - metablk, indx_mgr, wbcache, logstore, replay, transient, IOMGR_LOG_MODS - -} // namespace homestore + btree, device, blkalloc, cp, metablk, wbcache, logstore, transient, replication diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h index c496c37e2..9f9fee69f 100644 --- a/src/include/homestore/replication/repl_decls.h +++ b/src/include/homestore/replication/repl_decls.h @@ -18,8 +18,8 @@ using blkid_list_t = folly::small_vector< BlkId, 4 >; // Fully qualified domain pba, unique pba id across replica set struct RemoteBlkId { RemoteBlkId() = default; - RemoteBlkId(uint32_t s, const BlkId& b) : server_id{s}, blkid{b} {} - uint32_t server_id{0}; + RemoteBlkId(int32_t s, const MultiBlkId& b) : server_id{s}, blkid{b} {} + int32_t server_id{0}; MultiBlkId blkid; bool operator==(RemoteBlkId const& o) const { return (server_id == o.server_id) && (blkid == o.blkid); } @@ -27,9 +27,8 @@ struct RemoteBlkId { using remote_blkid_list_t = folly::small_vector< RemoteBlkId, 4 >; -// data service api names -static std::string const SEND_DATA{"send_data"}; -static std::string const FETCH_DATA{"fetch_data"}; +using replica_id_t = uuid_t; +using group_id_t = uuid_t; } // namespace homestore diff --git 
a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h index bfdae7fc4..27727c943 100644 --- a/src/include/homestore/replication/repl_dev.h +++ b/src/include/homestore/replication/repl_dev.h @@ -1,13 +1,52 @@ #pragma once +#include + #include #include +#include +#include #include - +#include #include +namespace nuraft { +template < typename T > +using ptr = std::shared_ptr< T >; + +class buffer; +} // namespace nuraft + namespace homestore { class ReplDev; +struct repl_req_ctx; +using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; +using repl_req_ptr_t = boost::intrusive_ptr< repl_req_ctx >; + +VENUM(repl_req_state_t, uint32_t, + INIT = 0, // Initial state + BLK_ALLOCATED = 1 << 0, // Local block is allocated + DATA_RECEIVED = 1 << 1, // Data has been received and being written to the storage + DATA_WRITTEN = 1 << 2, // Data has been written to the storage + LOG_RECEIVED = 1 << 3, // Log is received and waiting for data + LOG_FLUSHED = 1 << 4 // Log has been flushed +) + +struct repl_key { + int32_t server_id{0}; // Server Id which this req is originated from + uint64_t term; // RAFT term number + uint64_t dsn{0}; // Data sequence number to tie the data with the raft journal entry + + struct Hasher { + size_t operator()(repl_key const& rk) const { + return std::hash< int32_t >()(rk.server_id) ^ std::hash< uint64_t >()(rk.term) ^ + std::hash< uint64_t >()(rk.dsn); + } + }; + + bool operator==(repl_key const& other) const = default; + std::string to_string() const { return fmt::format("server={}, term={}, dsn={}", server_id, term, dsn); } +}; struct repl_journal_entry; struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost::thread_safe_counter > { @@ -18,17 +57,41 @@ struct repl_req_ctx : public boost::intrusive_ref_counter< repl_req_ctx, boost:: int64_t get_lsn() const { return lsn; } MultiBlkId const& get_local_blkid() const { return local_blkid; } -private: - sisl::blob header; // User header - sisl::blob key; // Key to replicate - sisl::sg_list value; // Raw value - applicable only to leader req - MultiBlkId local_blkid; // List of corresponding local blkids for the value - RemoteBlkId remote_blkid; // List of remote blkid for the value - std::unique_ptr< uint8_t[] > journal_buf; // Buf for the journal entry - repl_journal_entry* journal_entry{nullptr}; // pointer to the journal entry - int64_t lsn{0}; // Lsn for this replication req - - void alloc_journal_entry(uint32_t size); + uint64_t dsn() const { return rkey.dsn; } + uint64_t term() const { return rkey.term; } + void alloc_journal_entry(uint32_t size, bool is_raft_buf); + raft_buf_ptr_t& raft_journal_buf(); + uint8_t* raw_journal_buf(); + + std::string to_string() const; + std::string to_compact_string() const; + +public: + repl_key rkey; // Unique key for the request + sisl::blob header; // User header + sisl::blob key; // User supplied key for this req + int64_t lsn{0}; // Lsn for this replication req + bool is_proposer{false}; // Is the repl_req proposed by this node + + //////////////// Value related section ///////////////// + sisl::sg_list value; // Raw value - applicable only to leader req + MultiBlkId local_blkid; // Local BlkId for the value + RemoteBlkId remote_blkid; // Corresponding remote blkid for the value + + //////////////// Journal/Buf related section ///////////////// + std::variant< std::unique_ptr< uint8_t[] >, raft_buf_ptr_t > journal_buf; // Buf for the journal entry + repl_journal_entry* journal_entry{nullptr}; // pointer to the journal entry + 
+ //////////////// Replication state related section ///////////////// + std::mutex state_mtx; + std::atomic< uint32_t > state{uint32_cast(repl_req_state_t::INIT)}; // State of the replication request + folly::Promise< folly::Unit > data_written_promise; // Promise to be fulfilled when data is written + + //////////////// Communication packet/builder section ///////////////// + sisl::io_blob_list_t pkts; + flatbuffers::FlatBufferBuilder fb_builder; + sisl::io_blob_safe buf_for_unaligned_data; + intrusive< sisl::GenericRpcData > rpc_data; }; // @@ -38,7 +101,7 @@ class ReplDevListener { public: virtual ~ReplDevListener() = default; - void set_repl_dev(ReplDev* rdev) { m_repl_dev = std::move(rdev); } + void set_repl_dev(ReplDev* rdev) { m_repl_dev = rdev; } virtual ReplDev* repl_dev() { return m_repl_dev; } /// @brief Called when the log entry has been committed in the replica set. @@ -99,9 +162,9 @@ class ReplDevListener { /// write. In cases where caller don't care about the hints can return default blk_alloc_hints. /// /// @param header Header originally passed with repl_dev::async_alloc_write() api on the leader - /// @param Original context passed as part of repl_dev::async_alloc_write + /// @param data_size Size needed to be allocated for /// @return Expected to return blk_alloc_hints for this write - virtual blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, cintrusive< repl_req_ctx >& ctx) = 0; + virtual blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) = 0; /// @brief Called when the replica set is being stopped virtual void on_replica_stop() = 0; @@ -135,7 +198,7 @@ class ReplDev { /// @param ctx - User supplied context which will be passed to listener /// callbacks virtual void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - intrusive< repl_req_ctx > ctx) = 0; + repl_req_ptr_t ctx) = 0; /// @brief Reads the data and returns a future to continue on /// @param bid Block id to read @@ -160,14 +223,23 @@ class ReplDev { /// @brief Gets the group_id this repldev is working for /// @return group_id - virtual uuid_t group_id() const = 0; - - virtual void attach_listener(std::unique_ptr< ReplDevListener > listener) { m_listener = std::move(listener); } + virtual group_id_t group_id() const = 0; + /// @brief Gets the block size with which IO will happen on this device + /// @return Block size virtual uint32_t get_blk_size() const = 0; + virtual void attach_listener(shared< ReplDevListener > listener) { m_listener = std::move(listener); } + protected: - std::unique_ptr< ReplDevListener > m_listener; + shared< ReplDevListener > m_listener; }; } // namespace homestore + +template <> +struct fmt::formatter< homestore::repl_key > : fmt::formatter< std::string > { + auto format(const homestore::repl_key& a, format_context& ctx) const { + return fmt::formatter< std::string >::format(a.to_string(), ctx); + } +}; diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp index 379f31767..72d24d626 100644 --- a/src/include/homestore/replication_service.hpp +++ b/src/include/homestore/replication_service.hpp @@ -5,8 +5,9 @@ #include #include - +#include #include +#include namespace homestore { @@ -36,15 +37,23 @@ struct hs_stats; template < typename V, typename E > using Result = folly::Expected< V, E >; -template < class V, class E > -using AsyncResult = folly::Future< Result< V, E > >; - template < class V > using ReplResult = Result< V, ReplServiceError >; 
-template < class V > +template < class V, class E > +using AsyncResult = folly::SemiFuture< Result< V, E > >; + +template < class V = folly::Unit > using AsyncReplResult = AsyncResult< V, ReplServiceError >; +VENUM(repl_impl_type, uint8_t, + server_side, // Completely homestore controlled replication + client_assisted, // Client assisting in replication + solo // For single node - no replication +); + +class ReplApplication; + class ReplicationService { public: ReplicationService() = default; @@ -55,32 +64,16 @@ class ReplicationService { /// @param members List of members to form this group /// @param listener state machine listener of all the events happening on the repl_dev (commit, precommit etc) /// @return A Future ReplDev on success or Future ReplServiceError upon error - virtual AsyncReplResult< shared< ReplDev > > create_repl_dev(uuid_t group_id, - std::set< std::string, std::less<> >&& members, - std::unique_ptr< ReplDevListener > listener) = 0; - - /// @brief Opens the Repl Device for a given group id. It is expected that the repl dev is already created and used - /// this method for recovering. It is possible that repl_dev is not ready and in that case it will provide Repl - /// Device after it is ready and thus returns a Future. - /// - /// NOTE 1: If callers does an open for a repl device which was not created before, then at the end of - /// initialization an error is returned saying ReplServiceError::SERVER_NOT_FOUND - /// - /// NOTE 2: If the open repl device is called after Replication service is started, then it returns an error - /// ReplServiceError::BAD_REQUEST - /// @param group_id Group id to open the repl device with - /// @param listener state machine listener of all the events happening on the repl_dev (commit, precommit etc) - /// @return A Future ReplDev on successful open of ReplDev or Future ReplServiceError upon error - virtual AsyncReplResult< shared< ReplDev > > open_repl_dev(uuid_t group_id, - std::unique_ptr< ReplDevListener > listener) = 0; + virtual AsyncReplResult< shared< ReplDev > > create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) = 0; - virtual folly::Future< ReplServiceError > replace_member(uuid_t group_id, std::string const& member_out, - std::string const& member_in) const = 0; + virtual AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, + replica_id_t member_in) const = 0; /// @brief Get the repl dev for a given group id if it is already created or opened /// @param group_id Group id interested in /// @return ReplDev is opened or ReplServiceError::SERVER_NOT_FOUND if it doesn't exist - virtual ReplResult< shared< ReplDev > > get_repl_dev(uuid_t group_id) const = 0; + virtual ReplResult< shared< ReplDev > > get_repl_dev(group_id_t group_id) const = 0; /// @brief Iterate over all repl devs and then call the callback provided /// @param cb Callback with repl dev @@ -89,5 +82,29 @@ class ReplicationService { /// @brief get the capacity stats form underlying backend; /// @return the capacity stats; virtual hs_stats get_cap_stats() const = 0; + + virtual meta_sub_type get_meta_blk_name() const = 0; }; + +//////////////// Application which uses Replication needs to be provide the following callbacks //////////////// +class ReplApplication { +public: + // Returns the required implementation type of replication + virtual repl_impl_type get_impl_type() const = 0; + + // Is the replica recovery needs timeline consistency. 
This is used to determine if the replica needs to be + // recovered by key or by block of data. At present only non-timeline consistent replication is supported. + virtual bool need_timeline_consistency() const = 0; + + // Called when the repl dev is found upon restart of the homestore instance. The caller should return an instance of + // Listener corresponding to the ReplDev which will be used to perform the precommit/commit/rollback. + virtual shared< ReplDevListener > create_repl_dev_listener(group_id_t group_id) = 0; + + // Given the uuid of the peer, get their address and port + virtual std::pair< std::string, uint16_t > lookup_peer(replica_id_t uuid) const = 0; + + // Get the current application/server repl uuid + virtual replica_id_t get_my_repl_id() const = 0; +}; + } // namespace homestore diff --git a/src/include/homestore/superblk_handler.hpp b/src/include/homestore/superblk_handler.hpp index 2c74c2059..699edc5b6 100644 --- a/src/include/homestore/superblk_handler.hpp +++ b/src/include/homestore/superblk_handler.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -68,18 +69,18 @@ class superblk { m_meta_mgr_cookie = voidptr_cast(meta_cookie); m_raw_buf = meta_service().is_aligned_buf_needed(buf.size()) ? buf.extract(meta_service().align_size()) : buf.extract(0); - m_sb = r_cast< T* >(m_raw_buf->bytes); + m_sb = r_cast< T* >(m_raw_buf->bytes()); return m_sb; } - T* create(uint32_t size) { + T* create(uint32_t size = sizeof(T)) { if (meta_service().is_aligned_buf_needed(size)) { auto al_sz = meta_service().align_size(); m_raw_buf = sisl::make_byte_array(uint32_cast(sisl::round_up(size, al_sz)), al_sz, sisl::buftag::metablk); } else { m_raw_buf = sisl::make_byte_array(uint32_cast(size), 0, sisl::buftag::metablk); } - m_sb = new (m_raw_buf->bytes) T(); + m_sb = new (m_raw_buf->bytes()) T(); return m_sb; } @@ -92,14 +93,14 @@ class superblk { m_sb = nullptr; } - uint32_t size() const { return m_raw_buf->size; } + uint32_t size() const { return m_raw_buf->size(); } sisl::byte_array raw_buf() { return m_raw_buf; } void write() { if (m_meta_mgr_cookie) { - meta_service().update_sub_sb(m_raw_buf->bytes, m_raw_buf->size, m_meta_mgr_cookie); + meta_service().update_sub_sb(m_raw_buf->cbytes(), m_raw_buf->size(), m_meta_mgr_cookie); } else { - meta_service().add_sub_sb(m_metablk_name, m_raw_buf->bytes, m_raw_buf->size, m_meta_mgr_cookie); + meta_service().add_sub_sb(m_metablk_name, m_raw_buf->cbytes(), m_raw_buf->size(), m_meta_mgr_cookie); } } @@ -116,4 +117,74 @@ class superblk { std::string m_metablk_name; }; +class json_superblk { +private: + void* m_meta_mgr_cookie{nullptr}; + nlohmann::json m_json_sb; + std::string m_metablk_name; + +public: + static uint64_t next_count() { + static std::atomic< uint64_t > s_count{0}; + return ++s_count; + } + + json_superblk(const std::string& meta_name = "") { set_name(meta_name); } + + void set_name(const std::string& meta_name) { + if (meta_name.empty()) { + m_metablk_name = "meta_blk_" + std::to_string(next_count()); + } else { + m_metablk_name = meta_name; + } + } + + nlohmann::json& load(const sisl::byte_view& buf, void* meta_cookie) { + m_meta_mgr_cookie = voidptr_cast(meta_cookie); + std::string_view const b{c_charptr_cast(buf.bytes()), buf.size()}; + + try { + m_json_sb = nlohmann::json::from_msgpack(b); + } catch (nlohmann::json::exception const& e) { + DEBUG_ASSERT(false, "Failed to load superblk for meta_blk={}", m_metablk_name); + return m_json_sb; + } + return m_json_sb; + } + + nlohmann::json& create() { return 
m_json_sb; } + + void destroy() { + if (m_meta_mgr_cookie) { + meta_service().remove_sub_sb(m_meta_mgr_cookie); + m_meta_mgr_cookie = nullptr; + } + m_json_sb = nlohmann::json{}; + } + + uint32_t size() const { return m_json_sb.size(); } + + void write() { + auto do_write = [this](sisl::blob const& b) { + if (m_meta_mgr_cookie) { + meta_service().update_sub_sb(b.cbytes(), b.size(), m_meta_mgr_cookie); + } else { + meta_service().add_sub_sb(m_metablk_name, b.cbytes(), b.size(), m_meta_mgr_cookie); + } + }; + + auto const packed_data = nlohmann::json::to_msgpack(m_json_sb); + auto const size = packed_data.size(); + if (meta_service().is_aligned_buf_needed(size)) { + sisl::io_blob_safe buffer(size, meta_service().align_size()); + std::memcpy(buffer.bytes(), packed_data.data(), size); + do_write(buffer); + } else { + do_write(sisl::blob{r_cast< uint8_t const* >(packed_data.data()), uint32_cast(size)}); + } + } + + nlohmann::json& operator*() { return m_json_sb; } +}; + } // namespace homestore diff --git a/src/lib/blkalloc/bitmap_blk_allocator.cpp b/src/lib/blkalloc/bitmap_blk_allocator.cpp index 9b51718d6..6b6d32ee2 100644 --- a/src/lib/blkalloc/bitmap_blk_allocator.cpp +++ b/src/lib/blkalloc/bitmap_blk_allocator.cpp @@ -60,9 +60,9 @@ void BitmapBlkAllocator::cp_flush(CP*) { if (m_is_disk_bm_dirty.load()) { sisl::byte_array bitmap_buf = acquire_underlying_buffer(); if (m_meta_blk_cookie) { - meta_service().update_sub_sb(bitmap_buf->bytes, bitmap_buf->size, m_meta_blk_cookie); + meta_service().update_sub_sb(bitmap_buf->cbytes(), bitmap_buf->size(), m_meta_blk_cookie); } else { - meta_service().add_sub_sb(get_name(), bitmap_buf->bytes, bitmap_buf->size, m_meta_blk_cookie); + meta_service().add_sub_sb(get_name(), bitmap_buf->cbytes(), bitmap_buf->size(), m_meta_blk_cookie); } m_is_disk_bm_dirty.store(false); // No longer dirty now, needs to be set before releasing the buffer release_underlying_buffer(); diff --git a/src/lib/blkalloc/blk.cpp b/src/lib/blkalloc/blk.cpp index e8143e59c..0bdc32a33 100644 --- a/src/lib/blkalloc/blk.cpp +++ b/src/lib/blkalloc/blk.cpp @@ -32,7 +32,7 @@ uint32_t BlkId::serialized_size() const { return sizeof(BlkId); } uint32_t BlkId::expected_serialized_size() { return sizeof(BlkId); } void BlkId::deserialize(sisl::blob const& b, bool copy) { - serialized* other = r_cast< serialized* >(b.bytes); + serialized* other = r_cast< serialized const* >(b.cbytes()); s = *other; } @@ -100,9 +100,9 @@ uint32_t MultiBlkId::serialized_size() const { } void MultiBlkId::deserialize(sisl::blob const& b, bool copy) { - MultiBlkId* other = r_cast< MultiBlkId* >(b.bytes); + MultiBlkId* other = r_cast< MultiBlkId const* >(b.cbytes()); s = other->s; - if (b.size == sizeof(BlkId)) { + if (b.size() == sizeof(BlkId)) { n_addln_piece = 0; } else { n_addln_piece = other->n_addln_piece; @@ -117,6 +117,8 @@ uint32_t MultiBlkId::expected_serialized_size(uint16_t num_pieces) { return sz; } +uint32_t MultiBlkId::max_serialized_size() { return expected_serialized_size(max_pieces); } + uint16_t MultiBlkId::num_pieces() const { return BlkId::is_valid() ? 
n_addln_piece + 1 : 0; } bool MultiBlkId::has_room() const { return (n_addln_piece < max_addln_pieces); } diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 9b899e97b..822ca6566 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -231,4 +231,6 @@ uint64_t BlkDataService::get_total_capacity() const { return m_vdev->size(); } uint64_t BlkDataService::get_used_capacity() const { return m_vdev->used_size(); } +uint32_t BlkDataService::get_align_size() const { return m_vdev->align_size(); } + } // namespace homestore diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index e299ee8f9..8ea23aba6 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -189,6 +189,36 @@ table MetaBlkStore { sanity_check_interval: uint32 = 10 (hotswap); } +table Consensus { + // Backoff for any rpc failure + rpc_backoff_ms: uint32 = 250; + + // Frequency of Raft heartbeat + heartbeat_period_ms: uint32 = 250; + + // Re-election timeout low and high mark + elect_to_low_ms: uint32 = 900; + elect_to_high_ms: uint32 = 1400; + + // When a new member is being synced, the batch size of number of logs to be shipped + log_sync_batch_size: int32 = 100; + + // Log distance with which snapshot/compact needs to happen. 0 means snapshot is disabled + snapshot_freq_distance: int32 = 0; + + // Max append batch size + max_append_batch_size: int32 = 64; + + // Threshold of log gap from leader to consider a replica as stale + stale_log_gap_hi_threshold: int32 = 200; + + // Threshold of log gap from leader to consider a replica as come out of stale and became fresh + stale_log_gap_lo_threshold: int32 = 30; + + // Minimum log gap a replica has to be from leader before joining the replica set. + min_log_gap_to_join: int32 = 30; +} + table HomeStoreSettings { version: uint32 = 1; generic: Generic; @@ -199,6 +229,7 @@ table HomeStoreSettings { logstore: LogStore; resource_limits: ResourceLimits; metablk: MetaBlkStore; + consensus: Consensus; } root_type HomeStoreSettings; diff --git a/src/lib/crc.cpp b/src/lib/crc.cpp new file mode 100644 index 000000000..9b1b296a1 --- /dev/null +++ b/src/lib/crc.cpp @@ -0,0 +1,43 @@ +// Only x86 and x86_64 supported by Intel Storage Acceleration library +#ifdef NO_ISAL + +#include +#include +extern "C" { +#define MAX_ITER 8 + +// crc16_t10dif reference function, slow crc16 from the definition. +uint16_t crc16_t10dif(uint16_t seed, const unsigned char* buf, uint64_t len) { + size_t rem = seed; + unsigned int i, j; + + uint16_t poly = 0x8bb7; // t10dif standard + + for (i = 0; i < len; i++) { + rem = rem ^ (buf[i] << 8); + for (j = 0; j < MAX_ITER; j++) { + rem = rem << 1; + rem = (rem & 0x10000) ? rem ^ poly : rem; + } + } + return rem; +} + +// crc32_ieee reference function, slow crc32 from the definition. +uint32_t crc32_ieee(uint32_t seed, const unsigned char* buf, uint64_t len) { + uint64_t rem = ~seed; + unsigned int i, j; + + uint32_t poly = 0x04C11DB7; // IEEE standard + + for (i = 0; i < len; i++) { + rem = rem ^ ((uint64_t)buf[i] << 24); + for (j = 0; j < MAX_ITER; j++) { + rem = rem << 1; + rem = (rem & 0x100000000ULL) ? 
rem ^ poly : rem; + } + } + return ~rem; +} +} +#endif diff --git a/src/lib/device/device.h b/src/lib/device/device.h index ad8749576..aec65638e 100644 --- a/src/lib/device/device.h +++ b/src/lib/device/device.h @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -71,7 +71,7 @@ struct vdev_info { void set_pdev_choice(vdev_multi_pdev_opts_t opts) { multi_pdev_choice = enum_value(opts); } void set_user_private(const sisl::blob& data) { - std::memcpy(&user_private, data.bytes, std::min(data.size, uint32_cast(user_private_size))); + std::memcpy(&user_private, data.cbytes(), std::min(data.size(), uint32_cast(user_private_size))); } uint8_t* get_user_private_mutable() { return &(user_private[0]); } const uint8_t* get_user_private() const { return &(user_private[0]); } diff --git a/src/lib/device/device_manager.cpp b/src/lib/device/device_manager.cpp index ecab5562c..0ae9d7d6f 100644 --- a/src/lib/device/device_manager.cpp +++ b/src/lib/device/device_manager.cpp @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 33f243824..4c4b3da2a 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include "device/chunk.h" @@ -117,7 +116,7 @@ PhysicalDev::PhysicalDev(const dev_info& dinfo, int oflags, const pdev_info_head PhysicalDev::~PhysicalDev() { close_device(); } -void PhysicalDev::write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { +void PhysicalDev::write_super_block(uint8_t const* buf, uint32_t sb_size, uint64_t offset) { auto err_c = m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, offset); if (m_super_blk_in_footer) { @@ -223,7 +222,7 @@ void PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); } void PhysicalDev::format_chunks() { m_chunk_info_slots = std::make_unique< sisl::Bitset >(hs_super_blk::chunk_info_bitmap_size(m_dev_info)); auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); } std::vector< shared< Chunk > > PhysicalDev::create_chunks(const std::vector< uint32_t >& chunk_ids, uint32_t vdev_id, @@ -261,7 +260,7 @@ std::vector< shared< Chunk > > PhysicalDev::create_chunks(const std::vector< uin // Finally serialize the entire bitset and persist the chunk info bitmap itself auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); } catch (const std::out_of_range& e) { LOGERROR("Creation of chunks failed because of space, removing {} partially created chunks", ret_chunks.size()); for (auto& chunk : ret_chunks) { @@ -295,7 +294,7 @@ shared< Chunk > PhysicalDev::create_chunk(uint32_t chunk_id, uint32_t vdev_id, u get_stream(chunk).m_chunks_map.insert(std::pair{chunk_id, chunk}); auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); cinfo->~chunk_info(); hs_utils::iobuf_free(buf, 
sisl::buftag::superblk); @@ -330,7 +329,7 @@ void PhysicalDev::load_chunks(std::function< bool(cshared< Chunk >&) >&& chunk_f // Read the chunk info bitmap area from super block and load them into in-memory bitmap of chunk slots auto buf_arr = make_byte_array(hs_super_blk::chunk_info_bitmap_size(m_dev_info), m_pdev_info.dev_attr.align_size, sisl::buftag::superblk); - read_super_block(buf_arr->bytes, buf_arr->size, hs_super_blk::chunk_sb_offset()); + read_super_block(buf_arr->bytes(), buf_arr->size(), hs_super_blk::chunk_sb_offset()); m_chunk_info_slots = std::make_unique< sisl::Bitset >(buf_arr); // Walk through each of the chunk info and create corresponding chunks @@ -390,7 +389,7 @@ void PhysicalDev::do_remove_chunk(cshared< Chunk >& chunk) { // Reset the info slot and write it to super block m_chunk_info_slots->reset_bit(chunk->slot_number()); auto bitmap_mem = m_chunk_info_slots->serialize(m_pdev_info.dev_attr.align_size); - write_super_block(bitmap_mem->bytes, bitmap_mem->size, hs_super_blk::chunk_sb_offset()); + write_super_block(bitmap_mem->cbytes(), bitmap_mem->size(), hs_super_blk::chunk_sb_offset()); get_stream(chunk).m_chunks_map.erase(chunk->chunk_id()); cinfo->~chunk_info(); diff --git a/src/lib/device/physical_dev.hpp b/src/lib/device/physical_dev.hpp index 951e61f34..ade1be9a2 100644 --- a/src/lib/device/physical_dev.hpp +++ b/src/lib/device/physical_dev.hpp @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include #include @@ -96,10 +96,10 @@ struct chunk_info { void set_free() { chunk_allocated = 0x00; } void set_selector_private(const sisl::blob& data) { - std::memcpy(&chunk_selector_private, data.bytes, std::min(data.size, uint32_cast(selector_private_size))); + std::memcpy(&chunk_selector_private, data.cbytes(), std::min(data.size(), uint32_cast(selector_private_size))); } void set_user_private(const sisl::blob& data) { - std::memcpy(&user_private, data.bytes, std::min(data.size, uint32_cast(user_private_size))); + std::memcpy(&user_private, data.cbytes(), std::min(data.size(), uint32_cast(user_private_size))); } void compute_checksum() { @@ -148,7 +148,7 @@ class PhysicalDev { static uint64_t get_dev_size(const std::string& devname); std::error_code read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); - void write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); + void write_super_block(uint8_t const* buf, uint32_t sb_size, uint64_t offset); void close_device(); //////////////////////////// Chunk Creation/Load related methods ///////////////////////////////////////// diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 5c8444783..bd51bf961 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -41,7 +41,7 @@ #include "common/resource_mgr.hpp" #include "meta/meta_sb.hpp" #include "logstore/log_store_family.hpp" -#include "replication/service/repl_service_impl.h" +#include "replication/service/generic_repl_svc.h" /* * IO errors handling by homestore. @@ -52,13 +52,12 @@ * Handling :- logdev doesn't support any read error. It panic for read errors. * If HS see write error/read error during recovery then it panic the system. 
*/ - namespace homestore { HomeStoreSafePtr HomeStore::s_instance{nullptr}; static std::unique_ptr< IndexServiceCallbacks > s_index_cbs; -static repl_impl_type s_repl_impl_type{repl_impl_type::solo}; -shared< ChunkSelector > s_custom_chunk_selector{nullptr}; +static shared< ChunkSelector > s_custom_chunk_selector{nullptr}; +static shared< ReplApplication > s_repl_app{nullptr}; HomeStore* HomeStore::instance() { if (s_instance == nullptr) { s_instance = std::make_shared< HomeStore >(); } @@ -83,11 +82,11 @@ HomeStore& HomeStore::with_log_service() { return *this; } -HomeStore& HomeStore::with_repl_data_service(repl_impl_type repl_type, +HomeStore& HomeStore::with_repl_data_service(cshared< ReplApplication >& repl_app, cshared< ChunkSelector >& custom_chunk_selector) { m_services.svcs |= HS_SERVICE::REPLICATION | HS_SERVICE::LOG_REPLICATED | HS_SERVICE::LOG_LOCAL; m_services.svcs &= ~HS_SERVICE::DATA; // ReplicationDataSvc or DataSvc are mutually exclusive - s_repl_impl_type = repl_type; + s_repl_app = repl_app; s_custom_chunk_selector = std::move(custom_chunk_selector); return *this; } @@ -128,7 +127,7 @@ bool HomeStore::start(const hs_input_params& input, hs_before_services_starting_ if (has_data_service()) { m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); } if (has_index_service()) { m_index_service = std::make_unique< IndexService >(std::move(s_index_cbs)); } if (has_repl_data_service()) { - m_repl_service = std::make_unique< ReplicationServiceImpl >(s_repl_impl_type); + m_repl_service = GenericReplService::create(std::move(s_repl_app)); m_data_service = std::make_unique< BlkDataService >(std::move(s_custom_chunk_selector)); } m_cp_mgr = std::make_unique< CPManager >(); @@ -213,7 +212,7 @@ void HomeStore::do_start() { m_data_service->start(); } else if (has_repl_data_service()) { m_data_service->start(); - s_cast< ReplicationServiceImpl* >(m_repl_service.get())->start(); + s_cast< GenericReplService* >(m_repl_service.get())->start(); } // In case of custom recovery, let consumer starts the recovery and it is consumer module's responsibilities @@ -231,6 +230,11 @@ void HomeStore::shutdown() { LOGINFO("Homestore shutdown is started"); + if (has_repl_data_service()) { + s_cast< GenericReplService* >(m_repl_service.get())->stop(); + m_repl_service.reset(); + } + if (has_index_service()) { m_index_service->stop(); // m_index_service.reset(); @@ -248,10 +252,6 @@ void HomeStore::shutdown() { if (has_data_service()) { m_data_service.reset(); } - if (has_repl_data_service()) { - s_cast< ReplicationServiceImpl* >(m_repl_service.get())->stop(); - m_repl_service.reset(); - } m_dev_mgr->close_devices(); m_dev_mgr.reset(); m_cp_mgr->shutdown(); diff --git a/src/lib/index/index_cp.hpp b/src/lib/index/index_cp.hpp index b0ed2dbe6..440939c80 100644 --- a/src/lib/index/index_cp.hpp +++ b/src/lib/index/index_cp.hpp @@ -37,6 +37,7 @@ struct IndexCPContext : public VDevCPContext { sisl::ConcurrentInsertVector< IndexBufferPtr >::iterator m_dirty_buf_it; public: + IndexCPContext(CP* cp) : VDevCPContext(cp) {} virtual ~IndexCPContext() = default; @@ -152,6 +153,7 @@ struct IndexCPContext : public VDevCPContext { } }; + class IndexWBCache; class IndexCPCallbacks : public CPCallbacks { public: diff --git a/src/lib/logging.cpp b/src/lib/logging.cpp new file mode 100644 index 000000000..2ba71ef04 --- /dev/null +++ b/src/lib/logging.cpp @@ -0,0 +1,4 @@ +#include +#include + +SISL_LOGGING_DEF(HOMESTORE_LOG_MODS) diff --git a/src/lib/logstore/log_dev.cpp 
b/src/lib/logstore/log_dev.cpp index b8ea1e2ef..c4c4ad579 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -223,12 +222,12 @@ void LogDev::assert_next_pages(log_stream_reader& lstream) { int64_t LogDev::append_async(const logstore_id_t store_id, const logstore_seq_num_t seq_num, const sisl::io_blob& data, void* cb_context) { - auto prev_size = m_pending_flush_size.fetch_add(data.size, std::memory_order_relaxed); + auto prev_size = m_pending_flush_size.fetch_add(data.size(), std::memory_order_relaxed); const auto idx = m_log_idx.fetch_add(1, std::memory_order_acq_rel); auto threshold_size = LogDev::flush_data_threshold_size(); m_log_records->create(idx, store_id, seq_num, data, cb_context); - if (prev_size < threshold_size && ((prev_size + data.size) >= threshold_size) && + if (prev_size < threshold_size && ((prev_size + data.size()) >= threshold_size) && !m_is_flushing.load(std::memory_order_relaxed)) { flush_if_needed(); } @@ -236,18 +235,10 @@ int64_t LogDev::append_async(const logstore_id_t store_id, const logstore_seq_nu } log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_record_header) { - static thread_local sisl::aligned_unique_ptr< uint8_t, sisl::buftag::logread > read_buf; + auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); + m_vdev->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); - // First read the offset and read the log_group. Then locate the log_idx within that and get the actual data - // Read about 4K of buffer - if (!read_buf) { - read_buf = sisl::aligned_unique_ptr< uint8_t, sisl::buftag::logread >::make_sized(m_flush_size_multiple, - initial_read_size); - } - auto rbuf = read_buf.get(); - m_vdev->sync_pread(rbuf, initial_read_size, key.dev_offset); - - auto* header = r_cast< const log_group_header* >(rbuf); + auto* header = r_cast< const log_group_header* >(buf->cbytes()); HS_REL_ASSERT_EQ(header->magic_word(), LOG_GROUP_HDR_MAGIC, "Log header corrupted with magic mismatch!"); HS_REL_ASSERT_EQ(header->get_version(), log_group_header::header_version, "Log header version mismatch!"); HS_REL_ASSERT_LE(header->start_idx(), key.idx, "log key offset does not match with log_idx"); @@ -257,44 +248,30 @@ log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_rec // We can only do crc match in read if we have read all the blocks. We don't want to aggressively read more data // than we need to just to compare CRC for read operation. It can be done during recovery. if (header->total_size() <= initial_read_size) { - crc32_t const crc = crc32_ieee(init_crc32, reinterpret_cast< const uint8_t* >(rbuf) + sizeof(log_group_header), + crc32_t const crc = crc32_ieee(init_crc32, (buf->cbytes() + sizeof(log_group_header)), header->total_size() - sizeof(log_group_header)); HS_REL_ASSERT_EQ(header->this_group_crc(), crc, "CRC mismatch on read data"); } - auto record_header = header->nth_record(key.idx - header->start_log_idx); uint32_t const data_offset = (record_header->offset + (record_header->get_inlined() ? 
0 : header->oob_data_offset)); - log_buffer const b = uint32_cast(record_header->size); - if ((data_offset + b.size()) < initial_read_size) { - std::memcpy(static_cast< void* >(b.bytes()), static_cast< const void* >(rbuf + data_offset), - b.size()); // Already read them enough, copy the data + sisl::byte_view ret_view; + if ((data_offset + record_header->size) < initial_read_size) { + ret_view = sisl::byte_view{buf, data_offset, record_header->size}; } else { - // Round them data offset to dma boundary in-order to make sure pread on direct io succeed. We need to skip - // the rounded portion while copying to user buffer auto const rounded_data_offset = sisl::round_down(data_offset, m_vdev->align_size()); - auto const rounded_size = sisl::round_up(b.size() + data_offset - rounded_data_offset, m_vdev->align_size()); - - // Allocate a fresh aligned buffer, if size cannot fit standard size - if (rounded_size > initial_read_size) { - rbuf = hs_utils::iobuf_alloc(rounded_size, sisl::buftag::logread, m_vdev->align_size()); - } - - /* THIS_LOGDEV_LOG(TRACE, - "Addln read as data resides outside initial_read_size={} key.idx={} - key.group_dev_offset={} " "data_offset={} size={} rounded_data_offset={} rounded_size={}", initial_read_size, - key.idx, key.dev_offset, data_offset, b.size(), rounded_data_offset, rounded_size); */ - m_vdev->sync_pread(rbuf, rounded_size, key.dev_offset + rounded_data_offset); - std::memcpy(static_cast< void* >(b.bytes()), - static_cast< const void* >(rbuf + data_offset - rounded_data_offset), b.size()); - - // Free the buffer in case we allocated above - if (rounded_size > initial_read_size) { hs_utils::iobuf_free(rbuf, sisl::buftag::logread); } + auto const rounded_size = + sisl::round_up(record_header->size + data_offset - rounded_data_offset, m_vdev->align_size()); + auto new_buf = sisl::make_byte_array(rounded_size, m_vdev->align_size(), sisl::buftag::logread); + m_vdev->sync_pread(new_buf->bytes(), rounded_size, key.dev_offset + rounded_data_offset); + ret_view = sisl::byte_view{new_buf, s_cast< uint32_t >(data_offset - rounded_data_offset), record_header->size}; } + return_record_header = serialized_log_record(record_header->size, record_header->offset, record_header->get_inlined(), record_header->store_seq_num, record_header->store_id); - return b; + + return ret_view; } logstore_id_t LogDev::reserve_store_id() { @@ -774,8 +751,8 @@ bool LogDevMetadata::resize_logdev_sb_if_needed() { logstore_superblk* sb_area = m_sb->get_logstore_superblk(); std::fill_n(sb_area, store_capacity(), logstore_superblk::default_value()); - std::memcpy(voidptr_cast(m_sb.raw_buf()->bytes), static_cast< const void* >(old_buf->bytes), - std::min(old_buf->size, m_sb.size())); + std::memcpy(voidptr_cast(m_sb.raw_buf()->bytes()), static_cast< const void* >(old_buf->cbytes()), + std::min(old_buf->size(), m_sb.size())); return true; } else { return false; @@ -859,8 +836,8 @@ bool LogDevMetadata::resize_rollback_sb_if_needed() { const auto old_buf = m_rollback_sb.raw_buf(); m_rollback_sb.create(req_sz); - std::memcpy(voidptr_cast(m_rollback_sb.raw_buf()->bytes), static_cast< const void* >(old_buf->bytes), - std::min(old_buf->size, m_rollback_sb.size())); + std::memcpy(voidptr_cast(m_rollback_sb.raw_buf()->bytes()), static_cast< const void* >(old_buf->cbytes()), + std::min(old_buf->size(), m_rollback_sb.size())); return true; } else { return false; diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp index 01f36ecce..f6d4fb606 100644 --- a/src/lib/logstore/log_dev.hpp +++ 
b/src/lib/logstore/log_dev.hpp @@ -104,11 +104,11 @@ struct log_record { log_record& operator=(log_record&&) noexcept = delete; ~log_record() = default; - size_t serialized_size() const { return sizeof(serialized_log_record) + data.size; } + size_t serialized_size() const { return sizeof(serialized_log_record) + data.size(); } bool is_inlineable(const uint64_t flush_size_multiple) const { // Need inlining if size is smaller or size/buffer is not in dma'ble boundary. - return (is_size_inlineable(data.size, flush_size_multiple) || - ((reinterpret_cast< uintptr_t >(data.bytes) % flush_size_multiple) != 0) || !data.aligned); + return (is_size_inlineable(data.size(), flush_size_multiple) || + ((r_cast< const uintptr_t >(data.cbytes()) % flush_size_multiple) != 0) || !data.is_aligned()); } static bool is_size_inlineable(const size_t sz, const uint64_t flush_size_multiple) { @@ -159,11 +159,7 @@ struct log_group_header { assert(idx - start_log_idx < n_log_records); const serialized_log_record* const lr{nth_record(start_log_idx - idx)}; - - sisl::blob b{}; - b.bytes = const_cast< uint8_t* >(lr->get_inlined() ? inline_area() : oob_area()) + lr->offset; - b.size = lr->size; - return b; + return sisl::blob{(lr->get_inlined() ? inline_area() : oob_area()) + lr->offset, lr->size}; } uint32_t magic_word() const { return magic; } @@ -252,7 +248,7 @@ class LogGroup { void stop(); void reset(const uint32_t max_records); void create_overflow_buf(const uint32_t min_needed); - bool add_record(const log_record& record, const int64_t log_idx); + bool add_record(log_record& record, const int64_t log_idx); bool can_accomodate(const log_record& record) const { return (m_nrecords <= m_max_records); } const iovec_array& finish(const crc32_t prev_crc); @@ -809,7 +805,7 @@ class LogDev { bool m_stopped{false}; // Is Logdev stopped. We don't need lock here, because it is updated under flush lock logstore_family_id_t m_family_id; // The family id this logdev is part of JournalVirtualDev* m_vdev{nullptr}; - HomeStoreSafePtr m_hs; // Back pointer to homestore + HomeStoreSafePtr m_hs; // Back pointer to homestore std::multimap< logid_t, logstore_id_t > m_garbage_store_ids; Clock::time_point m_last_flush_time; diff --git a/src/lib/logstore/log_group.cpp b/src/lib/logstore/log_group.cpp index 7c68e581d..2f54da8d8 100644 --- a/src/lib/logstore/log_group.cpp +++ b/src/lib/logstore/log_group.cpp @@ -15,8 +15,6 @@ *********************************************************************************/ #include -#include - #include #include "common/homestore_assert.hpp" #include "log_dev.hpp" @@ -74,7 +72,7 @@ void LogGroup::create_overflow_buf(const uint32_t min_needed) { m_iovecs[0].iov_base = m_cur_log_buf; } -bool LogGroup::add_record(const log_record& record, const int64_t log_idx) { +bool LogGroup::add_record(log_record& record, const int64_t log_idx) { if (m_nrecords >= m_max_records) { LOGDEBUGMOD(logstore, "Will exceed estimated records={} if we add idx={} record.
Hence stopping adding in this batch", @@ -82,9 +80,9 @@ bool LogGroup::add_record(const log_record& record, const int64_t log_idx) { return false; } - m_actual_data_size += record.data.size; - if ((m_inline_data_pos + record.data.size) >= m_cur_buf_len) { - create_overflow_buf(m_inline_data_pos + record.data.size); + m_actual_data_size += record.data.size(); + if ((m_inline_data_pos + record.data.size()) >= m_cur_buf_len) { + create_overflow_buf(m_inline_data_pos + record.data.size()); } // We use log_idx reference in the header as we expect each slot record is in order. @@ -93,22 +91,22 @@ bool LogGroup::add_record(const log_record& record, const int64_t log_idx) { // assert(header()->start_log_idx - log_idx); // Fill the slots - m_record_slots[m_nrecords].size = record.data.size; + m_record_slots[m_nrecords].size = record.data.size(); m_record_slots[m_nrecords].store_id = record.store_id; m_record_slots[m_nrecords].store_seq_num = record.seq_num; if (record.is_inlineable(m_flush_multiple_size)) { m_record_slots[m_nrecords].offset = m_inline_data_pos; m_record_slots[m_nrecords].set_inlined(true); - std::memcpy(s_cast< void* >(m_cur_log_buf + m_inline_data_pos), s_cast< const void* >(record.data.bytes), - record.data.size); - m_inline_data_pos += record.data.size; + std::memcpy(s_cast< void* >(m_cur_log_buf + m_inline_data_pos), s_cast< const void* >(record.data.cbytes()), + record.data.size()); + m_inline_data_pos += record.data.size(); m_iovecs[0].iov_len = m_inline_data_pos; } else { // We do not round it now, it will be rounded during finish m_record_slots[m_nrecords].offset = m_oob_data_pos; m_record_slots[m_nrecords].set_inlined(false); - m_iovecs.emplace_back(s_cast< void* >(record.data.bytes), record.data.size); - m_oob_data_pos += record.data.size; + m_iovecs.emplace_back(s_cast< void* >(record.data.bytes()), record.data.size()); + m_oob_data_pos += record.data.size(); } ++m_nrecords; diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp index 637374420..f51e29944 100644 --- a/src/lib/logstore/log_store.cpp +++ b/src/lib/logstore/log_store.cpp @@ -95,7 +95,7 @@ void HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) { m_records.create(req->seq_num); COUNTER_INCREMENT(m_metrics, logstore_append_count, 1); - HISTOGRAM_OBSERVE(m_metrics, logstore_record_size, req->data.size); + HISTOGRAM_OBSERVE(m_metrics, logstore_record_size, req->data.size()); m_logdev.append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req)); } diff --git a/src/lib/logstore/log_stream.cpp b/src/lib/logstore/log_stream.cpp index d7c0ce8e2..1d4b2e82c 100644 --- a/src/lib/logstore/log_stream.cpp +++ b/src/lib/logstore/log_stream.cpp @@ -13,8 +13,6 @@ * specific language governing permissions and limitations under the License. 
* *********************************************************************************/ -#include - #include "device/chunk.h" #include "common/homestore_assert.hpp" #include "common/homestore_config.hpp" @@ -22,7 +20,6 @@ #include "log_dev.hpp" #include "device/journal_vdev.hpp" - namespace homestore { SISL_LOGGING_DECL(logstore) @@ -46,7 +43,7 @@ sisl::byte_view log_stream_reader::next_group(off_t* out_dev_offset) { } HS_REL_ASSERT_GE(m_cur_log_buf.size(), m_read_size_multiple); - const auto* header = r_cast< log_group_header* >(m_cur_log_buf.bytes()); + const auto* header = r_cast< log_group_header const* >(m_cur_log_buf.bytes()); if (header->magic_word() != LOG_GROUP_HDR_MAGIC) { LOGINFOMOD(logstore, "Logdev data not seeing magic at pos {}, must have come to end of logdev", m_vdev->dev_offset(m_cur_read_bytes)); @@ -135,18 +132,13 @@ sisl::byte_view log_stream_reader::group_in_next_page() { sisl::byte_view log_stream_reader::read_next_bytes(uint64_t nbytes) { // TO DO: Might need to address alignment based on data or fast type auto out_buf = - hs_utils::create_byte_view(nbytes + m_cur_log_buf.size(), true, sisl::buftag::logread, m_vdev->align_size()); - auto ret_buf = out_buf; - if (m_cur_log_buf.size()) { - memcpy(out_buf.bytes(), m_cur_log_buf.bytes(), m_cur_log_buf.size()); - out_buf.move_forward(m_cur_log_buf.size()); - } + hs_utils::make_byte_array(nbytes + m_cur_log_buf.size(), true, sisl::buftag::logread, m_vdev->align_size()); + if (m_cur_log_buf.size()) { memcpy(out_buf->bytes(), m_cur_log_buf.bytes(), m_cur_log_buf.size()); } const auto prev_pos = m_vdev->seeked_pos(); - m_vdev->sync_next_read(out_buf.bytes(), nbytes); + m_vdev->sync_next_read(out_buf->bytes() + m_cur_log_buf.size(), nbytes); LOGINFOMOD(logstore, "LogStream read {} bytes from vdev offset {} and vdev cur offset {}", nbytes, prev_pos, m_vdev->seeked_pos()); - ret_buf.set_size(nbytes + m_cur_log_buf.size()); - return ret_buf; + return sisl::byte_view{out_buf}; } } // namespace homestore diff --git a/src/lib/meta/meta_blk_service.cpp b/src/lib/meta/meta_blk_service.cpp index 03cfd1ae7..1fcf9c65f 100644 --- a/src/lib/meta/meta_blk_service.cpp +++ b/src/lib/meta/meta_blk_service.cpp @@ -24,7 +24,6 @@ #include #include -#include #include #include @@ -637,16 +636,16 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont // TO DO: Might need to differentiate based on data or fast type const uint64_t max_dst_size = sisl::round_up(sisl::Compress::max_compress_len(sz), align_size()); if (max_dst_size <= max_compress_memory_size()) { - if (max_dst_size > m_compress_info.size) { + if (max_dst_size > m_compress_info.size()) { free_compress_buf(); alloc_compress_buf(max_dst_size); } - std::memset(voidptr_cast(m_compress_info.bytes), 0, max_dst_size); + std::memset(voidptr_cast(m_compress_info.bytes()), 0, max_dst_size); size_t compressed_size = max_dst_size; const auto ret = sisl::Compress::compress(r_cast< const char* >(context_data), - r_cast< char* >(m_compress_info.bytes), sz, &compressed_size); + r_cast< char* >(m_compress_info.bytes()), sz, &compressed_size); if (ret != 0) { LOGERROR("hs_compress_default indicates a failure trying to compress the data, ret: {}", ret); HS_REL_ASSERT(false, "failed to compress"); @@ -670,7 +669,7 @@ void MetaBlkService::write_meta_blk_internal(meta_blk* mblk, const uint8_t* cont HS_REL_ASSERT_GE(max_dst_size, uint64_cast(mblk->hdr.h.context_sz)); // point context_data to compressed data; - context_data = m_compress_info.bytes; + context_data = 
m_compress_info.cbytes(); data_sz = mblk->hdr.h.context_sz; } else { // back off compression if compress ratio doesn't meet criteria. @@ -1019,7 +1018,7 @@ sisl::byte_array MetaBlkService::read_sub_sb_internal(const meta_blk* mblk) cons hs_utils::make_byte_array(mblk->hdr.h.context_sz, false /* aligned */, sisl::buftag::metablk, align_size()); HS_DBG_ASSERT_EQ(mblk->hdr.h.ovf_bid.is_valid(), false, "[type={}], unexpected ovf_bid: {}", mblk->hdr.h.type, mblk->hdr.h.ovf_bid.to_string()); - std::memcpy(buf->bytes, mblk->get_context_data(), mblk->hdr.h.context_sz); + std::memcpy(buf->bytes(), mblk->get_context_data(), mblk->hdr.h.context_sz); } else { // // read through the ovf blk chain to get the buffer; @@ -1053,7 +1052,7 @@ sisl::byte_array MetaBlkService::read_sub_sb_internal(const meta_blk* mblk) cons } // TO DO: Might need to differentiate based on data or fast type - read(data_bid[i], buf->bytes + read_offset, sisl::round_up(read_sz_per_db, align_size())); + read(data_bid[i], buf->bytes() + read_offset, sisl::round_up(read_sz_per_db, align_size())); read_offset_in_this_ovf += read_sz_per_db; read_offset += read_sz_per_db; @@ -1120,7 +1119,7 @@ void MetaBlkService::recover_meta_block(meta_blk* mblk) { if (itr != std::end(m_sub_info)) { // if subsystem registered crc protection, verify crc before sending to subsystem; if (itr->second.do_crc) { - const auto crc = crc32_ieee(init_crc32, s_cast< const uint8_t* >(buf->bytes), mblk->hdr.h.context_sz); + const auto crc = crc32_ieee(init_crc32, buf->cbytes(), mblk->hdr.h.context_sz); HS_REL_ASSERT_EQ(crc, uint32_cast(mblk->hdr.h.crc), "[type={}], CRC mismatch: {}/{}, on mblk bid: {}, context_sz: {}", mblk->hdr.h.type, crc, @@ -1140,8 +1139,8 @@ void MetaBlkService::recover_meta_block(meta_blk* mblk) { auto decompressed_buf{hs_utils::make_byte_array(mblk->hdr.h.src_context_sz, true /* aligned */, sisl::buftag::compression, align_size())}; size_t decompressed_size = mblk->hdr.h.src_context_sz; - const auto ret{sisl::Compress::decompress(r_cast< const char* >(buf->bytes), - r_cast< char* >(decompressed_buf->bytes), + const auto ret{sisl::Compress::decompress(r_cast< const char* >(buf->cbytes()), + r_cast< char* >(decompressed_buf->bytes()), mblk->hdr.h.compressed_sz, &decompressed_size)}; if (ret != 0) { LOGERROR("[type={}], negative result: {} from decompress trying to decompress the " @@ -1256,13 +1255,12 @@ bool MetaBlkService::is_aligned_buf_needed(size_t size) const { return (size <= bool MetaBlkService::s_self_recover{false}; -void MetaBlkService::free_compress_buf() { hs_utils::iobuf_free(m_compress_info.bytes, sisl::buftag::compression); } +void MetaBlkService::free_compress_buf() { hs_utils::iobuf_free(m_compress_info.bytes(), sisl::buftag::compression); } void MetaBlkService::alloc_compress_buf(size_t size) { - m_compress_info.size = size; - m_compress_info.bytes = hs_utils::iobuf_alloc(size, sisl::buftag::compression, align_size()); - - HS_REL_ASSERT_NOTNULL(m_compress_info.bytes, "fail to allocate iobuf for compression of size: {}", size); + m_compress_info = + sisl::blob{hs_utils::iobuf_alloc(size, sisl::buftag::compression, align_size()), uint32_cast(size)}; + HS_REL_ASSERT_NOTNULL(m_compress_info.cbytes(), "fail to allocate iobuf for compression of size: {}", size); } uint64_t MetaBlkService::meta_blk_context_sz() const { return block_size() - META_BLK_HDR_MAX_SZ; } @@ -1507,24 +1505,24 @@ nlohmann::json MetaBlkService::populate_json(int log_level, meta_blk_map_t& meta } sisl::byte_array buf = read_sub_sb_internal(it->second); - if 
(free_space < buf->size) { + if (free_space < buf->size()) { j[x.first]["meta_bids"][std::to_string(bid_cnt)] = "Not_able_to_dump_to_file_exceeding_allowed_space"; HS_LOG_EVERY_N(WARN, metablk, 100, "[type={}] Skip dumping to file, exceeding allowed space: {}, " "requested_size: {}, " "total_free: {}, free_fs_percent: {}", - x.first, free_space, buf->size, total_free, + x.first, free_space, buf->size(), total_free, HS_DYNAMIC_CONFIG(metablk.percent_of_free_space)); continue; } const std::string file_path = fmt::format("{}/{}_{}", dump_dir, x.first, bid_cnt); std::ofstream f{file_path}; - f.write(r_cast< const char* >(buf->bytes), buf->size); + f.write(r_cast< const char* >(buf->bytes()), buf->size()); j[x.first]["meta_bids"][std::to_string(bid_cnt)] = file_path; - free_space -= buf->size; + free_space -= buf->size(); } ++bid_cnt; diff --git a/src/lib/replication/CMakeLists.txt b/src/lib/replication/CMakeLists.txt index c71bb4516..448d14adc 100644 --- a/src/lib/replication/CMakeLists.txt +++ b/src/lib/replication/CMakeLists.txt @@ -3,9 +3,27 @@ include (${CMAKE_SOURCE_DIR}/cmake/test_mode.cmake) include_directories (BEFORE ..) include_directories (BEFORE .) +list(APPEND SCHEMA_FLAGS "--scoped-enums" "--gen-name-strings" "--cpp-std=c++17" "--cpp-static-reflection" "--reflect-names") + +flatbuffers_generate_headers( + TARGET hs_replication_fb + SCHEMAS push_data_rpc.fbs + FLAGS ${SCHEMA_FLAGS} +) + add_library(hs_replication OBJECT) target_sources(hs_replication PRIVATE - service/repl_service_impl.cpp + service/generic_repl_svc.cpp + service/raft_repl_service.cpp repl_dev/solo_repl_dev.cpp + repl_dev/common.cpp + repl_dev/raft_repl_dev.cpp + repl_dev/raft_state_machine.cpp + log_store/repl_log_store.cpp + log_store/home_raft_log_store.cpp ) -target_link_libraries(hs_replication ${COMMON_DEPS}) +target_link_libraries(hs_replication PRIVATE ${COMMON_DEPS} hs_common hs_replication_fb) + +#set(FLATBUFFERS_FLATC_EXECUTABLE ${flatbuffers_LIB_DIRS}/../bin/flatc) +#flatbuffer_gen_cpp(${FLATBUFFERS_FLATC_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/generated/ hs_replication rpc/push_data_rpc.fbs rpc/fetch_data_rpc.fbs) + diff --git a/src/lib/replication/fetch_data_rpc.fbs b/src/lib/replication/fetch_data_rpc.fbs new file mode 100644 index 000000000..d73d4dd1f --- /dev/null +++ b/src/lib/replication/fetch_data_rpc.fbs @@ -0,0 +1,34 @@ +namespace homestore; + +table RequestEntry { + lsn : int64; // LSN of the raft log if known + raft_term : uint64; // Raft term number + dsn : uint64; // Data Sequence number + user_header: [ubyte]; // User header bytes + user_key : [ubyte]; // User key data + blkid_originator : int32; // Originally which replica's blkid is this + remote_blkid : [ubyte]; // Serialized remote blkid +} + +table FetchDataRequest { + entries : [RequestEntry]; // Array of request entries +} + +table ResponseEntry { + lsn : [int64]; // LSN of the raft log if known + dsn : uint64; // Data Sequence number + raft_term : uint64; // Raft term number + data_size : uint32; // Size of the data, which is sent separately (not inside the flatbuffer) +} + +table FetchDataResponse { + issuer_replica_id : int32; // Replica id of the issuer + entries : [ResponseEntry]; // Array of response entries +} + +table FetchData { + request : FetchDataRequest; + response : FetchDataResponse; +} + +root_type FetchData; \ No newline at end of file diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp new file mode 100644 index 000000000..bb3b7d2fb --- /dev/null +++
b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -0,0 +1,267 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ + +#include "home_raft_log_store.h" +#include "storage_engine_buffer.h" +#include + +using namespace homestore; + +SISL_LOGGING_DECL(replication) + +#define REPL_STORE_LOG(level, msg, ...) \ + LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("replstore", m_logstore_id)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + msg, ##__VA_ARGS__); + +namespace homestore { +static constexpr store_lsn_t to_store_lsn(uint64_t raft_lsn) { return s_cast< store_lsn_t >(raft_lsn) - 1; } +static constexpr store_lsn_t to_store_lsn(repl_lsn_t repl_lsn) { return repl_lsn - 1; } +static constexpr repl_lsn_t to_repl_lsn(store_lsn_t store_lsn) { return store_lsn + 1; } + +static nuraft::ptr< nuraft::log_entry > to_nuraft_log_entry(const log_buffer& log_bytes) { + uint8_t const* raw_ptr = log_bytes.bytes(); + uint64_t term = *r_cast< uint64_t const* >(raw_ptr); + raw_ptr += sizeof(uint64_t); + nuraft::log_val_type type = static_cast< nuraft::log_val_type >(*raw_ptr); + raw_ptr += sizeof(nuraft::log_val_type); + + size_t data_len = log_bytes.size() - sizeof(uint64_t) - sizeof(nuraft::log_val_type); + auto nb = nuraft::buffer::alloc(data_len); + nb->put_raw(raw_ptr, data_len); + return nuraft::cs_new< nuraft::log_entry >(term, nb, type); +} + +static uint64_t extract_term(const log_buffer& log_bytes) { + uint8_t const* raw_ptr = log_bytes.bytes(); + return (*r_cast< uint64_t const* >(raw_ptr)); +} + +HomeRaftLogStore::HomeRaftLogStore(logstore_id_t logstore_id) { + m_dummy_log_entry = nuraft::cs_new< nuraft::log_entry >(0, nuraft::buffer::alloc(0), nuraft::log_val_type::app_log); + + if (logstore_id == UINT32_MAX) { + m_log_store = logstore_service().create_new_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, true); + if (!m_log_store) { throw std::runtime_error("Failed to create log store"); } + m_logstore_id = m_log_store->get_store_id(); + LOGDEBUGMOD(replication, "Opened new home log store id={}", m_logstore_id); + } else { + m_logstore_id = logstore_id; + LOGDEBUGMOD(replication, "Opening existing home log store id={}", logstore_id); + logstore_service().open_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, logstore_id, true, + [this](shared< HomeLogStore > log_store) { + m_log_store = std::move(log_store); + DEBUG_ASSERT_EQ(m_logstore_id, m_log_store->get_store_id(), + "Mismatch in 
passed and created logstore id"); + REPL_STORE_LOG(DEBUG, "Home Log store created/opened successfully"); + }); + } +} + +void HomeRaftLogStore::remove_store() { + REPL_STORE_LOG(DEBUG, "Logstore is being physically removed"); + logstore_service().remove_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, m_logstore_id); + m_log_store.reset(); +} + +ulong HomeRaftLogStore::next_slot() const { + uint64_t next_slot = to_repl_lsn(m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn)) + 1; + REPL_STORE_LOG(DEBUG, "next_slot()={}", next_slot); + return next_slot; +} + +ulong HomeRaftLogStore::start_index() const { + // start_index starts from 1. + ulong start_index = std::max((repl_lsn_t)1, to_repl_lsn(m_log_store->truncated_upto()) + 1); + REPL_STORE_LOG(DEBUG, "start_index()={}", start_index); + return start_index; +} + +nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::last_entry() const { + store_lsn_t max_seq = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + REPL_STORE_LOG(DEBUG, "last_entry() store seqnum={}", max_seq); + if (max_seq < 0) { return m_dummy_log_entry; } + + nuraft::ptr< nuraft::log_entry > nle; + try { + auto log_bytes = m_log_store->read_sync(max_seq); + nle = to_nuraft_log_entry(log_bytes); + } catch (const std::exception& e) { + REPL_STORE_LOG(ERROR, "last_entry() out_of_range={}", max_seq); + throw e; + } + + return nle; +} + +ulong HomeRaftLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { + REPL_STORE_LOG(TRACE, "append entry term={}, log_val_type={} size={}", entry->get_term(), + static_cast< uint32_t >(entry->get_val_type()), entry->get_buf().size()); + auto buf = entry->serialize(); + return append(buf); +} + +ulong HomeRaftLogStore::append(raft_buf_ptr_t& buffer) { + auto next_seq = m_log_store->append_async( + sisl::io_blob{buffer->data_begin(), uint32_cast(buffer->size()), false /* is_aligned */}, nullptr /* cookie */, + [buffer](int64_t, sisl::io_blob&, logdev_key, void*) {}); + return to_repl_lsn(next_seq); +} + +void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) { + auto buf = entry->serialize(); + write_at(index, buf); +} + +void HomeRaftLogStore::write_at(ulong index, raft_buf_ptr_t& buffer) { + m_log_store->rollback_async(to_store_lsn(index) - 1, nullptr); + // We need to reset the durable lsn: it is ok to set it to a lower number, since it will be updated on the next + // flush calls, but it is dangerous to set it to a higher number.
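+ // Rolling back to to_store_lsn(index) - 1 retains entries only up to repl lsn (index - 1), so the + // append() below lands exactly at `index`.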
+ m_last_durable_lsn = -1; + append(buffer); +} + +void HomeRaftLogStore::end_of_append_batch(ulong start, ulong cnt) { + store_lsn_t end_lsn = to_store_lsn(start + cnt - 1); + m_log_store->flush_sync(end_lsn); + m_last_durable_lsn = end_lsn; +} + +nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore::log_entries(ulong start, ulong end) { + auto out_vec = std::make_shared< std::vector< nuraft::ptr< nuraft::log_entry > > >(); + m_log_store->foreach (to_store_lsn(start), [end, &out_vec](store_lsn_t cur, const log_buffer& entry) -> bool { + bool ret = (cur < to_store_lsn(end) - 1); + if (cur < to_store_lsn(end)) { out_vec->emplace_back(to_nuraft_log_entry(entry)); } + return ret; + }); + return out_vec; +} + +nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { + nuraft::ptr< nuraft::log_entry > nle; + try { + auto log_bytes = m_log_store->read_sync(to_store_lsn(index)); + nle = to_nuraft_log_entry(log_bytes); + } catch (const std::exception& e) { + REPL_STORE_LOG(ERROR, "entry_at({}) index out_of_range", index); + throw e; + } + return nle; +} + +ulong HomeRaftLogStore::term_at(ulong index) { + ulong term; + try { + auto log_bytes = m_log_store->read_sync(to_store_lsn(index)); + term = extract_term(log_bytes); + } catch (const std::exception& e) { + REPL_STORE_LOG(ERROR, "term_at({}) index out_of_range", index); + throw e; + } + return term; +} + +raft_buf_ptr_t HomeRaftLogStore::pack(ulong index, int32_t cnt) { + static constexpr size_t estimated_record_size = 128; + size_t estimated_size = cnt * estimated_record_size + sizeof(uint32_t); + + // << Format >> + // # records (N) 4 bytes + // +--- + // | log length (X) 4 bytes + // | log data X bytes + // +--- repeat N + raft_buf_ptr_t out_buf = nuraft::buffer::alloc(estimated_size); + out_buf->put(cnt); + + int32_t remain_cnt = cnt; + m_log_store->foreach ( + to_store_lsn(index), + [this, &out_buf, &remain_cnt]([[maybe_unused]] store_lsn_t cur, const log_buffer& entry) mutable -> bool { + if (remain_cnt-- > 0) { + size_t avail_size = out_buf->size() - out_buf->pos(); + if (avail_size < entry.size()) { + avail_size += std::max(out_buf->size() * 2, (size_t)entry.size()); + out_buf = nuraft::buffer::expand(*out_buf, avail_size); + } + REPL_STORE_LOG(TRACE, "packing lsn={} of size={}, avail_size in buffer={}", to_repl_lsn(cur), + entry.size(), avail_size); + out_buf->put(entry.bytes(), entry.size()); + } + return (remain_cnt > 0); + }); + return out_buf; +} + +void HomeRaftLogStore::apply_pack(ulong index, nuraft::buffer& pack) { + pack.pos(0); + auto num_entries = pack.get_int(); + + auto slot = next_slot(); + if (index < slot) { + // We are asked to apply/insert data behind the next slot, so we must roll back to just before index and then + // append + m_log_store->rollback_async(to_store_lsn(index) - 1, nullptr); + } else if (index > slot) { + // We are asked to apply/insert data after the next slot, so we need to fill in dummy entries from the current + // slot up to `index` before appending the entries + REPL_STORE_LOG(WARN, + "RaftLogStore is asked to apply pack on lsn={}, but current lsn={} is behind, will be filling " + "with dummy data to make it functional, however, this could result in inconsistent data", + index, to_store_lsn(slot)); + while (slot++ < index) { + append(m_dummy_log_entry); + } + } + + for (int i{0}; i < num_entries; ++i) { + size_t entry_len; + auto* entry = const_cast< nuraft::byte* >(pack.get_bytes(entry_len)); + [[maybe_unused]] auto store_sn = + m_log_store->append_async(sisl::io_blob{entry,
uint32_cast(entry_len), false}, nullptr, nullptr); + REPL_STORE_LOG(TRACE, "unpacking nth_entry={} of size={}, lsn={}", i + 1, entry_len, to_repl_lsn(store_sn)); + } + m_log_store->flush_sync(to_store_lsn(index) + num_entries - 1); +} + +bool HomeRaftLogStore::compact(ulong compact_lsn) { + auto cur_max_lsn = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); + if (cur_max_lsn < to_store_lsn(compact_lsn)) { + // We need to fill the remaining entries with dummy data. + for (auto lsn{cur_max_lsn + 1}; lsn <= to_store_lsn(compact_lsn); ++lsn) { + append(m_dummy_log_entry); + } + } + m_log_store->flush_sync(to_store_lsn(compact_lsn)); + m_log_store->truncate(to_store_lsn(compact_lsn)); + return true; +} + +bool HomeRaftLogStore::flush() { + m_log_store->flush_sync(); + return true; +} + +ulong HomeRaftLogStore::last_durable_index() { + m_last_durable_lsn = m_log_store->get_contiguous_completed_seq_num(m_last_durable_lsn); + return to_repl_lsn(m_last_durable_lsn); +} +} // namespace homestore diff --git a/src/lib/replication/log_store/home_raft_log_store.h b/src/lib/replication/log_store/home_raft_log_store.h new file mode 100644 index 000000000..c49cef310 --- /dev/null +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -0,0 +1,180 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +#include +#include + +#if defined __clang__ or defined __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#endif +#include +#if defined __clang__ or defined __GNUC__ +#pragma GCC diagnostic pop +#endif +#undef auto_lock + +namespace homestore { + +using store_lsn_t = int64_t; +using repl_lsn_t = int64_t; +using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >; + +class HomeRaftLogStore : public nuraft::log_store { +public: + HomeRaftLogStore(homestore::logstore_id_t logstore_id = UINT32_MAX); + virtual ~HomeRaftLogStore() = default; + + void remove_store(); + + /** + * The first available slot of the store, starts with 1. + * + * @return Last log index number + 1 + */ + virtual ulong next_slot() const override; + + /** + * The start index of the log store. At the very beginning it must be 1; + * however, after some compact actions, it could be anything + * greater than or equal to one. + * + * @return Starting log index number. + */ + virtual ulong start_index() const override; + + /** + * The last log entry in store. + * + * @return If no log entry exists: a dummy constant entry with + * value set to null and term set to zero. + */ + virtual nuraft::ptr< nuraft::log_entry > last_entry() const override; + + /** + * Append a log entry to the store. + * + * @param entry Log entry + * @return Log index number.
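+ * + * Note: the buffer handed to the logstore is the nuraft serialization [term | log_val_type | data]; + * to_nuraft_log_entry() in the .cpp performs the inverse transformation on read.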
+ */ + virtual ulong append(nuraft::ptr< nuraft::log_entry >& entry) override; + + // An alternate method on entries already serialized into the raft buffer + ulong append(raft_buf_ptr_t& buffer); + + /** + * Overwrite a log entry at the given `index`. + * + * @param index Log index number to overwrite. + * @param entry New log entry to overwrite. + */ + virtual void write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) override; + + // An alternate method on entries already serialized into the raft buffer + void write_at(ulong index, raft_buf_ptr_t& buffer); + + /** + * Invoked after a batch of logs is written as a part of + * a single append_entries request. + * + * @param start The start log index number (inclusive) + * @param cnt The number of log entries written. + */ + virtual void end_of_append_batch(ulong start, ulong cnt) override; + + /** + * Get log entries with index [start, end). + * + * @param start The start log index number (inclusive). + * @param end The end log index number (exclusive). + * @return The log entries between [start, end). + */ + virtual nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > log_entries(ulong start, ulong end) override; + + /** + * Get the log entry at the specified log index number. + * + * @param index Should be equal to or greater than 1. + * @return The log entry or null if index >= this->next_slot(). + */ + virtual nuraft::ptr< nuraft::log_entry > entry_at(ulong index) override; + + /** + * Get the term for the log entry at the specified index. + * It is suggested to stop the system if index >= this->next_slot(). + * + * @param index Should be equal to or greater than 1. + * @return The term for the specified log entry, or + * 0 if index < this->start_index(). + */ + virtual ulong term_at(ulong index) override; + + /** + * Pack `cnt` log items starting from `index`. + * + * @param index The start log index number (inclusive). + * @param cnt The number of logs to pack. + * @return log pack + */ + virtual raft_buf_ptr_t pack(ulong index, int32_t cnt) override; + + /** + * Apply the log pack to the current log store, starting from index. + * + * @param index The start log index number (inclusive). + * @param pack Log pack created by pack(). + */ + virtual void apply_pack(ulong index, nuraft::buffer& pack); + + /** + * Compact the log store by purging all log entries, + * including the log at the last_log_index. + * + * If current max log idx is smaller than given `last_log_index`, + * set start log idx to `last_log_index + 1`. + * + * @param last_log_index Log index number that will be purged up to (inclusive). + * @return True on success. + */ + virtual bool compact(ulong last_log_index) override; + + /** + * Synchronously flush all log entries in this log store to the backing storage + * so that all log entries are guaranteed to be durable upon process crash. + * + * @return `true` on success. + */ + virtual bool flush() override; + + /** + * This API is used only when `raft_params::parallel_log_appending_` flag is set. + * Please refer to the comment of the flag. + * + * NOTE: In homestore replication use cases, we use this even when the parallel_log_appending_ flag is not set + * + * @return The last durable log index.
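+ * + * Note: the implementation refreshes m_last_durable_lsn from the logstore's contiguous completed + * sequence number and returns it converted to a repl lsn.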
+ */ + virtual ulong last_durable_index() override; + + logstore_id_t logstore_id() const { return m_logstore_id; } + +private: + logstore_id_t m_logstore_id; + shared< HomeLogStore > m_log_store; + nuraft::ptr< nuraft::log_entry > m_dummy_log_entry; + store_lsn_t m_last_durable_lsn{-1}; +}; +} // namespace homestore \ No newline at end of file diff --git a/src/lib/replication/log_store/repl_log_store.cpp b/src/lib/replication/log_store/repl_log_store.cpp new file mode 100644 index 000000000..84a12925d --- /dev/null +++ b/src/lib/replication/log_store/repl_log_store.cpp @@ -0,0 +1,70 @@ +#include +#include "replication/log_store/repl_log_store.h" +#include "replication/repl_dev/raft_state_machine.h" +#include "replication/repl_dev/raft_repl_dev.h" +#include "replication/repl_dev/common.h" + +namespace homestore { + +uint64_t ReplLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { + repl_req_ptr_t rreq = m_sm.transform_journal_entry(entry); + ulong lsn; + if (rreq) { + lsn = HomeRaftLogStore::append(rreq->raft_journal_buf()); + m_sm.link_lsn_to_req(rreq, int64_cast(lsn)); + RD_LOG(INFO, "Raft Channel: Received log entry rreq=[{}]", rreq->to_compact_string()); + } else { + lsn = HomeRaftLogStore::append(entry); + } + return lsn; +} + +void ReplLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) { + repl_req_ptr_t rreq = m_sm.transform_journal_entry(entry); + if (rreq) { + HomeRaftLogStore::write_at(index, rreq->raft_journal_buf()); + m_sm.link_lsn_to_req(rreq, int64_cast(index)); + RD_LOG(INFO, "Raft Channel: Received log entry rreq=[{}]", rreq->to_compact_string()); + } else { + HomeRaftLogStore::write_at(index, entry); + } +} + +void ReplLogStore::end_of_append_batch(ulong start_lsn, ulong count) { + // Skip this call on the leader, since this method synchronously flushes the data, which is not required there. + // The leader flushes as part of commit after receiving quorum, by which time the log entry is most likely + // already flushed. + if (!m_rd.is_leader()) { + int64_t end_lsn = int64_cast(start_lsn + count - 1); + + // Start fetching the batch of data for this lsn range from the remote if it is not available yet. + auto reqs = sisl::VectorPool< repl_req_ptr_t >::alloc(); + for (int64_t lsn = int64_cast(start_lsn); lsn <= end_lsn; ++lsn) { + reqs->emplace_back(m_sm.lsn_to_req(lsn)); + } + + // Check the map whether data corresponding to all of these requests has been received and written. If not, + // schedule a fetch and write. Once all requests are completed and written, these requests are popped out of + // the map and the future will be ready. + auto fut = m_rd.notify_after_data_written(reqs); + + // Meanwhile, we can flush the journal for this lsn batch. It is ok to flush the log entries before the + // actual data is written, because even if we have the log, the data is not committed until the state + // machine reports it. This way the flush and fetch can run in parallel. + HomeRaftLogStore::end_of_append_batch(start_lsn, count); + + // Wait for the fetch and write to be completed successfully.
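+ // This blocks until the data channel has received and written every payload in this batch.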
+ std::move(fut).get(); + + // Mark all the pbas also completely written + for (auto const& rreq : *reqs) { + if (rreq) { rreq->state.fetch_or(uint32_cast(repl_req_state_t::LOG_FLUSHED)); } + } + + sisl::VectorPool< repl_req_ptr_t >::free(reqs); + } +} + +std::string ReplLogStore::rdev_name() const { return m_rd.rdev_name(); } + +} // namespace homestore diff --git a/src/lib/replication/log_store/repl_log_store.h b/src/lib/replication/log_store/repl_log_store.h new file mode 100644 index 000000000..c2fb615f2 --- /dev/null +++ b/src/lib/replication/log_store/repl_log_store.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include "replication/log_store/home_raft_log_store.h" + +namespace homestore { + +class RaftReplDev; +class RaftStateMachine; + +class ReplLogStore : public HomeRaftLogStore { +private: + RaftReplDev& m_rd; + RaftStateMachine& m_sm; + std::mutex m_batch_mtx; + std::condition_variable m_batch_cv; + int64_t m_batch_lsn{0}; + +public: + template < typename... Args > + ReplLogStore(RaftReplDev& rd, RaftStateMachine& sm, Args&&... args) : + HomeRaftLogStore{std::forward< Args >(args)...}, m_rd{rd}, m_sm{sm} {} + + uint64_t append(nuraft::ptr< nuraft::log_entry >& entry) override; + void write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) override; + void end_of_append_batch(ulong start_lsn, ulong count) override; + +private: + std::string rdev_name() const; +}; + +} // namespace homestore diff --git a/src/lib/replication/log_store/storage_engine_buffer.h b/src/lib/replication/log_store/storage_engine_buffer.h new file mode 100644 index 000000000..87bd90b11 --- /dev/null +++ b/src/lib/replication/log_store/storage_engine_buffer.h @@ -0,0 +1,251 @@ +#pragma once + +#if defined(WIN32) || defined(_WIN32) +#error "Unsupported platform, POSIX only!" +#endif + +extern "C" { +#include +} + +#include +#include +#include +#include +#include + +// Copied from NuKV JungleDB project written by Jung-Sang Ahn and modified by Harihara Kadayam + +namespace homestore { + +struct SEBuf { + /** + * Empty buffer. + */ + SEBuf() = default; + + /** + * Reference to given address. + */ + SEBuf(size_t _len, const void* _buf) : len(_len), buf((void*)_buf) {} + + /** + * Reference to given string object. + */ + SEBuf(const std::string& str) : len(str.size()), buf((void*)str.data()) {} + + /** + * Allocate own memory. + * If given length is 0, it will return an empty buffer. + */ + static SEBuf alloc(size_t _len) { + if (!_len) return SEBuf(); + return SEBuf(_len, malloc(_len)); + } + + /** + * Free own memory. + */ + inline void free() { + ::free(buf); + clear(); + } + + /** + * Clear internal pointer without free. + * User is responsible for managing memory to avoid memory leak. + */ + inline void clear() { + buf = nullptr; + len = 0; + } + + /** + * Return `true` if this buffer is empty. + */ + inline bool empty() const { return (buf == nullptr); } + + /** + * Return the size of this buffer. + */ + inline size_t size() const { return len; } + + /** + * Return the pointer to the data of this buffer. + */ + inline void* data() const { return buf; } + + /** + * Create a std::string object that is clone of this buffer. + */ + inline std::string toString() const { return std::string((const char*)buf, len); } + + /** + * Return a string replacing non-readable character with `.`. + * The max length of string will be upto given `limit`. + */ + std::string rStr(size_t limit = 16) const; + + /** + * Move ownership of data to given buffer `dst`. 
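+ * The source buffer is cleared afterwards, so only `dst` owns the memory.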
+ */ + inline void moveTo(SEBuf& dst) { + dst = *this; + clear(); + } + + /** + * Make a copy of data and set it to given buffer `dst`. + */ + inline void copyTo(SEBuf& dst) const { + dst = alloc(len); + if (len) { memcpy(dst.buf, buf, len); } + } + + size_t len{0}; + void* buf{nullptr}; + + /** + * To easily free buffer (to avoid memory leak by mistake), + * similar to `std::lock_guard`. + */ + struct AutoFree { + AutoFree(SEBuf& buf) : bufToHold(buf) {} + ~AutoFree() { bufToHold.free(); } + SEBuf& bufToHold; + }; +}; +using SEBufHolder = SEBuf::AutoFree; + +struct SEBufSerializer { + SEBufSerializer(const SEBuf& _buf) : buf(_buf), offset(0), errHappened(false) {} + + inline bool isValid(size_t len) { + if (errHappened || len + pos() > buf.len) { + errHappened = true; + return false; + } + return true; + } + + inline bool ok() const { return !errHappened; } + + inline void pos(size_t _pos) { + assert(_pos <= buf.len); + offset = _pos; + } + + inline size_t pos() const { return offset; } + + inline void clearError() { errHappened = false; } + + inline void* data() { + uint8_t* ptr = (uint8_t*)buf.buf; + return ptr + pos(); + } + + inline void putU64(uint64_t val) { + if (!isValid(sizeof(val))) return; + uint64_t u64 = htobe64(val); + memcpy(data(), &u64, sizeof(u64)); + pos(pos() + sizeof(u64)); + } + + inline void putU32(uint32_t val) { + if (!isValid(sizeof(val))) return; + uint32_t u32 = htobe32(val); + memcpy(data(), &u32, sizeof(u32)); + pos(pos() + sizeof(u32)); + } + + inline void putU16(uint16_t val) { + if (!isValid(sizeof(val))) return; + uint16_t u16 = htobe16(val); + memcpy(data(), &u16, sizeof(u16)); + pos(pos() + sizeof(u16)); + } + + inline void putU8(uint8_t val) { + if (!isValid(sizeof(val))) return; + memcpy(data(), &val, sizeof(val)); + pos(pos() + sizeof(val)); + } + + inline void putRaw(size_t len, const void* src) { + memcpy(data(), src, len); + pos(pos() + len); + } + + inline void put(size_t len, const void* src) { + putU32(len); + if (!isValid(len)) return; + putRaw(len, src); + } + + inline void putString(const std::string& str) { put(str.size(), str.data()); } + + inline void putSEBuf(const SEBuf& buf) { put(buf.len, buf.buf); } + + inline uint64_t getU64() { + if (!isValid(sizeof(uint64_t))) return 0; + uint64_t u64; + memcpy(&u64, data(), sizeof(u64)); + pos(pos() + sizeof(u64)); + return be64toh(u64); + } + + inline uint32_t getU32() { + if (!isValid(sizeof(uint32_t))) return 0; + uint32_t u32; + memcpy(&u32, data(), sizeof(u32)); + pos(pos() + sizeof(u32)); + return be32toh(u32); + } + + inline uint16_t getU16() { + if (!isValid(sizeof(uint16_t))) return 0; + uint16_t u16; + memcpy(&u16, data(), sizeof(u16)); + pos(pos() + sizeof(u16)); + return be16toh(u16); + } + + inline uint8_t getU8() { + if (!isValid(sizeof(uint8_t))) return 0; + uint8_t u8; + memcpy(&u8, data(), sizeof(u8)); + pos(pos() + sizeof(u8)); + return u8; + } + + inline void* getRaw(size_t len) { + void* _data = data(); + pos(pos() + len); + return _data; + } + + inline void* get(size_t& len) { + len = getU32(); + if (!isValid(len)) return nullptr; + return getRaw(len); + } + + inline std::string getString() { + size_t _len; + void* _data = get(_len); + if (!_data) return std::string(); + return std::string((const char*)_data, _len); + } + + inline SEBuf getSEBuf() { + size_t _len; + void* _data = get(_len); + return SEBuf(_len, _data); + } + + const SEBuf& buf; + size_t offset; + bool errHappened; +}; + +} // namespace homestore diff --git a/src/lib/replication/push_data_rpc.fbs 
b/src/lib/replication/push_data_rpc.fbs new file mode 100644 index 000000000..0bf4ce896 --- /dev/null +++ b/src/lib/replication/push_data_rpc.fbs @@ -0,0 +1,13 @@ +native_include "boost/uuid/uuid.hpp"; +namespace homestore; + +table PushDataRequest { + issuer_replica_id : int32; // Replica id of the issuer + raft_term : uint64; // Raft term number + dsn : uint64; // Data Sequence number + user_header: [ubyte]; // User header bytes + user_key : [ubyte]; // User key data + data_size : uint32; // Data size, actual data is sent as separate blob not by flatbuffer +} + +root_type PushDataRequest; \ No newline at end of file diff --git a/src/lib/replication/repl_dev/common.cpp b/src/lib/replication/repl_dev/common.cpp new file mode 100644 index 000000000..db5540d61 --- /dev/null +++ b/src/lib/replication/repl_dev/common.cpp @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include "replication/repl_dev/common.h" +#include + +namespace homestore { + +void repl_req_ctx::alloc_journal_entry(uint32_t size, bool is_raft_buf) { + if (is_raft_buf) { + journal_buf = nuraft::buffer::alloc(size); + journal_entry = new (raft_journal_buf()->data_begin()) repl_journal_entry(); + } else { + journal_buf = std::unique_ptr< uint8_t[] >(new uint8_t[size]); + journal_entry = new (raw_journal_buf()) repl_journal_entry(); + } +} + +repl_req_ctx::~repl_req_ctx() { + if (journal_entry) { journal_entry->~repl_journal_entry(); } +} + +raft_buf_ptr_t& repl_req_ctx::raft_journal_buf() { return std::get< raft_buf_ptr_t >(journal_buf); } +uint8_t* repl_req_ctx::raw_journal_buf() { return std::get< std::unique_ptr< uint8_t[] > >(journal_buf).get(); } + +static std::string req_state_name(uint32_t state) { + if (state == (uint32_t)repl_req_state_t::INIT) { return "INIT"; } + + std::string ret; + if (state & (uint32_t)repl_req_state_t::BLK_ALLOCATED) { ret += "BLK_ALLOCATED | "; } + if (state & (uint32_t)repl_req_state_t::DATA_RECEIVED) { ret += "DATA_RECEIVED | "; } + if (state & (uint32_t)repl_req_state_t::DATA_WRITTEN) { ret += "DATA_WRITTEN | "; } + if (state & (uint32_t)repl_req_state_t::LOG_RECEIVED) { ret += "LOG_RECEIVED | "; } + if (state & (uint32_t)repl_req_state_t::LOG_FLUSHED) { ret += "LOG_FLUSHED"; } + return ret; +} + +std::string repl_req_ctx::to_string() const { + return fmt::format( + "repl_key=[{}], lsn={} state=[{}] header_size={} key_size={} is_proposer={} local_blkid={} remote_blkid={}", + rkey.to_string(), lsn, req_state_name(state.load()), header.size(), key.size(), is_proposer, + local_blkid.to_string(), remote_blkid.blkid.to_string()); +} + +std::string repl_req_ctx::to_compact_string() const { + return fmt::format("dsn={} term={} lsn={} state={} ref={}", rkey.dsn, rkey.term, lsn, req_state_name(state.load()), + this->use_count()); +} + +} // namespace homestore \ No newline at end of file diff --git a/src/lib/replication/repl_dev/common.h b/src/lib/replication/repl_dev/common.h new file mode 100644 index 000000000..aa6935581 --- /dev/null +++ b/src/lib/replication/repl_dev/common.h @@ -0,0 +1,89 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once + +#include + +#include +#include +#include +#include + +namespace homestore { +VENUM(journal_type_t, uint16_t, HS_LARGE_DATA = 0, HS_HEADER_ONLY = 1) + +struct repl_journal_entry { + static constexpr uint16_t JOURNAL_ENTRY_MAJOR = 1; + static constexpr uint16_t JOURNAL_ENTRY_MINOR = 1; + + // Major and minor version. For each major version underlying structures could change. Minor versions can only add + // fields, not change any existing fields. + uint16_t major_version{JOURNAL_ENTRY_MAJOR}; + uint16_t minor_version{JOURNAL_ENTRY_MINOR}; + + journal_type_t code; + int32_t server_id; + uint64_t dsn; // Data seq number + uint32_t user_header_size; + uint32_t key_size; + uint32_t value_size; + // Followed by user_header, then key, then MultiBlkId/value + + std::string to_string() const { + return fmt::format("version={}.{}, code={}, server_id={}, dsn={}, header_size={}, key_size={}, value_size={}", + major_version, minor_version, enum_name(code), server_id, dsn, user_header_size, key_size, + value_size); + } + + std::string to_compact_string() const { + return fmt::format("dsn={}, header_size={}, key_size={}, value_size={}", dsn, user_header_size, key_size, + value_size); + } +}; + +#pragma pack(1) +struct repl_dev_superblk { + static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; + static constexpr uint32_t REPL_DEV_SB_VERSION = 1; + + uint64_t magic{REPL_DEV_SB_MAGIC}; + uint32_t version{REPL_DEV_SB_VERSION}; + uuid_t group_id; // group_id of this replica set + logstore_id_t data_journal_id; // Logstore id for the data journal + int64_t commit_lsn; // LSN up to which this replica has committed + int64_t checkpoint_lsn; // LSN up to which this replica has checkpointed the data + uint64_t group_ordinal; // Ordinal number which will be used to indicate the rdevXYZ for debugging + + uint64_t get_magic() const { return magic; } + uint32_t get_version() const { return version; } +}; +#pragma pack() + +template < class V = folly::Unit > +auto make_async_error(ReplServiceError err) { + return folly::makeSemiFuture< ReplResult< V > >(folly::makeUnexpected(err)); +} + +template < class V > +auto make_async_success(V v) { + return folly::makeSemiFuture< ReplResult< V > >(std::move(v)); +} + +template < class V = folly::Unit > +auto make_async_success() { + return folly::makeSemiFuture< ReplResult< folly::Unit > >(folly::Unit{}); +} + +} // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_repl_dev.cpp b/src/lib/replication/repl_dev/raft_repl_dev.cpp new file mode 100644 index 000000000..0ee5885e8 --- /dev/null +++ b/src/lib/replication/repl_dev/raft_repl_dev.cpp @@ -0,0 +1,452 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "common/homestore_assert.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" +#include "push_data_rpc_generated.h" + +namespace homestore { +std::atomic< uint64_t >
RaftReplDev::s_next_group_ordinal{1}; + +RaftReplDev::RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >&& rd_sb, bool load_existing) : + m_repl_svc{svc}, + m_msg_mgr{svc.msg_manager()}, + m_group_id{rd_sb->group_id}, + m_my_repl_id{svc.get_my_repl_uuid()}, + m_raft_server_id{nuraft_mesg::to_server_id(m_my_repl_id)}, + m_rd_sb{std::move(rd_sb)} { + m_state_machine = std::make_shared< RaftStateMachine >(*this); + + if (load_existing) { + m_data_journal = std::make_shared< ReplLogStore >(*this, *m_state_machine, m_rd_sb->data_journal_id); + m_next_dsn = m_rd_sb->last_applied_dsn + 1; + m_commit_upto_lsn = m_rd_sb->commit_lsn; + m_last_flushed_commit_lsn = m_commit_upto_lsn; + m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + + // It's ok not to do a compare-exchange, because loading is always single-threaded as of now + if (m_rd_sb->group_ordinal >= s_next_group_ordinal.load()) { + s_next_group_ordinal.store(m_rd_sb->group_ordinal + 1); + } + + if (m_rd_sb->is_timeline_consistent) { + logstore_service().open_log_store(LogStoreService::CTRL_LOG_FAMILY_IDX, m_rd_sb->free_blks_journal_id, + false, [this](shared< HomeLogStore > log_store) { + m_free_blks_journal = std::move(log_store); + m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); + }); + } + } else { + m_data_journal = std::make_shared< ReplLogStore >(*this, *m_state_machine); + m_rd_sb->data_journal_id = m_data_journal->logstore_id(); + m_rd_sb->last_applied_dsn = 0; + m_rd_sb->group_ordinal = s_next_group_ordinal.fetch_add(1); + m_rdev_name = fmt::format("rdev{}", m_rd_sb->group_ordinal); + + if (m_rd_sb->is_timeline_consistent) { + m_free_blks_journal = + logstore_service().create_new_log_store(LogStoreService::CTRL_LOG_FAMILY_IDX, false /* append_mode */); + m_rd_sb->free_blks_journal_id = m_free_blks_journal->get_store_id(); + } + m_rd_sb.write(); + } + + RD_LOG(INFO, "Started {} RaftReplDev group_id={}, replica_id={}, raft_server_id={} committed_lsn={} next_dsn={}", + (load_existing ?
"Existing" : "New"), group_id_str(), my_replica_id_str(), m_raft_server_id, + m_commit_upto_lsn.load(), m_next_dsn.load()); + m_msg_mgr.bind_data_service_request(PUSH_DATA, m_group_id, bind_this(RaftReplDev::on_push_data_received, 1)); + // m_msg_mgr.bind_data_service_request(FETCH_DATA, m_group_id, bind_this(RaftReplDev::on_fetch_data_received, 2)); +} + +void RaftReplDev::use_config(json_superblk raft_config_sb) { m_raft_config_sb = std::move(raft_config_sb); } + +void RaftReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, + repl_req_ptr_t rreq) { + if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } + rreq->header = header; + rreq->key = key; + rreq->value = value; + + // If it is header only entry, directly propose to the raft + if (rreq->value.size) { + rreq->rkey = + repl_key{.server_id = server_id(), .term = raft_server()->get_term(), .dsn = m_next_dsn.fetch_add(1)}; + push_data_to_all_followers(rreq); + + // Step 1: Alloc Blkid + auto status = data_service().alloc_blks(uint32_cast(rreq->value.size), + m_listener->get_blk_alloc_hints(rreq->header, rreq->value.size), + rreq->local_blkid); + HS_REL_ASSERT_EQ(status, BlkAllocStatus::SUCCESS); + + // Write the data + data_service().async_write(rreq->value, rreq->local_blkid).thenValue([this, rreq](auto&& err) { + HS_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener + rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_WRITTEN)); + m_state_machine->propose_to_raft(std::move(rreq)); + }); + } else { + RD_LOG(INFO, "Skipping data channel send since value size is 0"); + rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_WRITTEN)); + m_state_machine->propose_to_raft(std::move(rreq)); + } +} + +void RaftReplDev::push_data_to_all_followers(repl_req_ptr_t rreq) { + auto& builder = rreq->fb_builder; + + // Prepare the rpc request packet with all repl_reqs details + builder.FinishSizePrefixed(CreatePushDataRequest(builder, server_id(), rreq->rkey.term, rreq->rkey.dsn, + builder.CreateVector(rreq->header.cbytes(), rreq->header.size()), + builder.CreateVector(rreq->key.cbytes(), rreq->key.size()), + rreq->value.size)); + + rreq->pkts = sisl::io_blob::sg_list_to_ioblob_list(rreq->value); + rreq->pkts.insert(rreq->pkts.begin(), sisl::io_blob{builder.GetBufferPointer(), builder.GetSize(), false}); + + /*RD_LOG(INFO, "Data Channel: Pushing data to all followers: rreq=[{}] data=[{}]", rreq->to_string(), + flatbuffers::FlatBufferToString(builder.GetBufferPointer() + sizeof(flatbuffers::uoffset_t), + PushDataRequestTypeTable()));*/ + + RD_LOG(INFO, "Data Channel: Pushing data to all followers: rreq=[{}]", rreq->to_compact_string()); + + group_msg_service() + ->data_service_request_unidirectional(nuraft_mesg::role_regex::ALL, PUSH_DATA, rreq->pkts) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, rreq = std::move(rreq)](auto e) { + // Release the buffer which holds the packets + RD_LOG(INFO, "Data Channel: Data push completed for rreq=[{}]", rreq->to_compact_string()); + rreq->fb_builder.Release(); + rreq->pkts.clear(); + }); +} + +void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data) { + auto const& incoming_buf = rpc_data->request_blob(); + auto const fb_size = + flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t); + auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes()); + sisl::blob header = 
+void RaftReplDev::on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data) {
+    auto const& incoming_buf = rpc_data->request_blob();
+    auto const fb_size =
+        flatbuffers::ReadScalar< flatbuffers::uoffset_t >(incoming_buf.cbytes()) + sizeof(flatbuffers::uoffset_t);
+    auto push_req = GetSizePrefixedPushDataRequest(incoming_buf.cbytes());
+    sisl::blob header = sisl::blob{push_req->user_header()->Data(), push_req->user_header()->size()};
+    sisl::blob key = sisl::blob{push_req->user_key()->Data(), push_req->user_key()->size()};
+
+    RD_LOG(TRACE, "PushData received on data channel: {}",
+           flatbuffers::FlatBufferToString(incoming_buf.cbytes() + sizeof(flatbuffers::uoffset_t),
+                                           PushDataRequestTypeTable()));
+
+    auto rreq = follower_create_req(
+        repl_key{.server_id = push_req->issuer_replica_id(), .term = push_req->raft_term(), .dsn = push_req->dsn()},
+        header, key, push_req->data_size());
+    rreq->rpc_data = rpc_data;
+
+    RD_LOG(INFO, "Data Channel: Received data rreq=[{}]", rreq->to_compact_string());
+
+    if (rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_RECEIVED)) &
+        uint32_cast(repl_req_state_t::DATA_RECEIVED)) {
+        // We already received the data before, just ignore this data
+        // TODO: Should we forcibly overwrite the data with new data?
+        return;
+    }
+
+    // Get the data portion from the buffer
+    HS_DBG_ASSERT_EQ(fb_size + push_req->data_size(), incoming_buf.size(), "Size mismatch of data size vs buffer size");
+    uint8_t const* data = incoming_buf.cbytes() + fb_size;
+
+    if (((uintptr_t)data % data_service().get_align_size()) != 0) {
+        // Unaligned buffer, create a new buffer and copy the entire buf
+        rreq->buf_for_unaligned_data =
+            std::move(sisl::io_blob_safe(push_req->data_size(), data_service().get_align_size()));
+        std::memcpy(rreq->buf_for_unaligned_data.bytes(), data, push_req->data_size());
+        data = rreq->buf_for_unaligned_data.cbytes();
+    }
+
+    // Schedule a write and upon completion, mark the data as written.
+    data_service()
+        .async_write(r_cast< const char* >(data), push_req->data_size(), rreq->local_blkid)
+        .thenValue([this, rreq](auto&& err) {
+            RD_REL_ASSERT(!err, "Error in writing data"); // TODO: Find a way to return error to the Listener
+            rreq->state.fetch_or(uint32_cast(repl_req_state_t::DATA_WRITTEN));
+            rreq->data_written_promise.setValue();
+            RD_LOG(INFO, "Data Channel: Data Write completed rreq=[{}]", rreq->to_compact_string());
+        });
+}
+
+static bool blob_equals(sisl::blob const& a, sisl::blob const& b) {
+    if (a.size() != b.size()) { return false; }
+    return (std::memcmp(a.cbytes(), b.cbytes(), a.size()) == 0);
+}
+
+static MultiBlkId do_alloc_blk(uint32_t size, blk_alloc_hints const& hints) {
+    MultiBlkId blkid;
+    auto const status = data_service().alloc_blks(sisl::round_up(size, data_service().get_blk_size()), hints, blkid);
+    RELEASE_ASSERT_EQ(status, BlkAllocStatus::SUCCESS, "alloc_blks failed, no space left!");
+    return blkid;
+}
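
The duplicate-push guard above works because fetch_or returns the state bits as they were before the or: only the first arrival of a given request sees DATA_RECEIVED clear and proceeds to write. The same idiom in isolation:

```cpp
#include <atomic>
#include <cstdint>

enum : uint32_t { DATA_RECEIVED = 1u << 0, DATA_WRITTEN = 1u << 1 };

// Returns true exactly once per request, no matter how many duplicates arrive:
// fetch_or yields the *previous* bits, so only the first caller sees the bit clear.
bool try_accept_push(std::atomic<uint32_t>& state) {
    return (state.fetch_or(DATA_RECEIVED) & DATA_RECEIVED) == 0;
}
```
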
+repl_req_ptr_t RaftReplDev::follower_create_req(repl_key const& rkey, sisl::blob const& user_header,
+                                                sisl::blob const& user_key, uint32_t data_size) {
+    auto const [it, happened] = m_repl_key_req_map.try_emplace(rkey, repl_req_ptr_t(new repl_req_ctx()));
+    RD_DBG_ASSERT((it != m_repl_key_req_map.end()), "Unexpected error in map_repl_key_to_req");
+    auto rreq = it->second;
+
+    if (!happened) {
+        // We already have an entry in the map; if the blk has already been allocated by a previous caller, we can
+        // return that req.
+        if (rreq->state.load() & uint32_cast(repl_req_state_t::BLK_ALLOCATED)) {
+            // Do validation if we have the correct mapping
+            RD_REL_ASSERT(blob_equals(user_header, rreq->header), "User header mismatch for repl_key={}",
+                          rkey.to_string());
+            RD_REL_ASSERT(blob_equals(user_key, rreq->key), "User key mismatch for repl_key={}", rkey.to_string());
+            RD_LOG(INFO, "Repl_key=[{}] already received", rkey.to_string());
+            return rreq;
+        }
+    }
+
+    // We need to allocate the block, since either the entry doesn't exist or, if it exists, two threads are trying to
+    // do the same thing. So take the state mutex and allocate the blk
+    std::unique_lock< std::mutex > lg(rreq->state_mtx);
+    if (rreq->state.load() & uint32_cast(repl_req_state_t::BLK_ALLOCATED)) { return rreq; }
+    rreq->rkey = rkey;
+    rreq->header = user_header;
+    rreq->key = user_key;
+    rreq->local_blkid = do_alloc_blk(data_size, m_listener->get_blk_alloc_hints(user_header, data_size));
+    rreq->state.fetch_or(uint32_cast(repl_req_state_t::BLK_ALLOCATED));
+
+    return rreq;
+}
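
follower_create_req above is a get-or-create with double-checked locking: try_emplace dedups by repl_key, a lock-free state check handles the common already-allocated case, and a per-request mutex serializes the rare race where the data channel and the raft log both create the request. A simplified sketch, substituting std::map plus a mutex for folly::ConcurrentHashMap and the atomic state word:

```cpp
#include <atomic>
#include <cstdint>
#include <map>
#include <memory>
#include <mutex>

struct req_ctx {
    std::mutex mtx;
    std::atomic<bool> blk_allocated{false};
};

std::mutex map_mtx;
std::map<uint64_t, std::shared_ptr<req_ctx>> req_map; // keyed by a repl_key stand-in

std::shared_ptr<req_ctx> get_or_create(uint64_t key) {
    std::shared_ptr<req_ctx> r;
    {
        std::lock_guard<std::mutex> g(map_mtx);
        r = req_map.try_emplace(key, std::make_shared<req_ctx>()).first->second;
    }
    if (r->blk_allocated.load()) { return r; } // fast path: a previous caller already allocated

    std::lock_guard<std::mutex> g(r->mtx); // slow path: at most one thread allocates
    if (!r->blk_allocated.load()) {
        // ... allocate the blk and fill in the request here ...
        r->blk_allocated.store(true);
    }
    return r;
}
```
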
+AsyncNotify RaftReplDev::notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs) {
+    std::vector< folly::SemiFuture< folly::Unit > > futs;
+    futs.reserve(rreqs->size());
+
+    // Pop any entries that are already completed - from the entries list as well as from the map
+    rreqs->erase(std::remove_if(
+                     rreqs->begin(), rreqs->end(),
+                     [this, &futs](repl_req_ptr_t const& rreq) {
+                         if (rreq == nullptr) { return true; }
+
+                         if (rreq->state.load() & uint32_cast(repl_req_state_t::DATA_WRITTEN)) {
+                             m_repl_key_req_map.erase(rreq->rkey); // Remove from the map as well, since it is completed
+                             RD_LOG(INFO,
+                                    "Raft Channel: Data write completed and blkid mapped, removing from map: rreq=[{}]",
+                                    rreq->to_compact_string());
+                             return true; // Remove from the pending list
+                         } else {
+                             futs.emplace_back(rreq->data_written_promise.getSemiFuture());
+                             return false;
+                         }
+                     }),
+                 rreqs->end());
+
+    // All the entries are done already, no need to wait
+    if (rreqs->size() == 0) { return folly::makeFuture< folly::Unit >(folly::Unit{}); }
+
+#if 0
+    // We are yet to support reactive fetch from remote.
+    if (m_resync_mode) {
+        // if in resync mode, fetch data from remote immediately;
+        check_and_fetch_remote_data(std::move(rreqs));
+    } else {
+        // some blkids are not in completed state, let's schedule a timer to check it again;
+        // we wait for the data channel to fill in the data. Still, if it's not done, we trigger a fetch from remote;
+        m_wait_blkid_write_timer_hdl = iomanager.schedule_thread_timer( // timer wakes up in current thread;
+            HS_DYNAMIC_CONFIG(repl->wait_blkid_write_timer_sec) * 1000 * 1000 * 1000, false /* recurring */,
+            nullptr /* cookie */, [this, rreqs = std::move(rreqs)](auto) {
+                check_and_fetch_remote_data(std::move(rreqs));
+            });
+    }
+    return ret;
+#endif
+
+    return folly::collectAll(futs).deferValue([this, rreqs](auto&& e) {
+        for (auto const& rreq : *rreqs) {
+            HS_DBG_ASSERT(rreq->state.load() & uint32_cast(repl_req_state_t::DATA_WRITTEN),
+                          "Data written promise raised without updating DATA_WRITTEN state for rkey={}",
+                          rreq->rkey.to_string());
+            RD_LOG(INFO, "Raft Channel: Data write completed and blkid mapped, removing from map: rreq=[{}]",
+                   rreq->to_compact_string());
+            m_repl_key_req_map.erase(rreq->rkey); // Remove from the map as well, since it is completed
+        }
+        return folly::makeSemiFuture< folly::Unit >(folly::Unit{});
+    });
+}
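
notify_after_data_written above is a generic "wait for the laggards" shape: finished requests are dropped from the list, one future per remaining request is collected, and the continuation runs only when all of them fire. The same shape sketched with std::promise/std::future in place of folly futures:

```cpp
#include <algorithm>
#include <future>
#include <vector>

struct pending_req {
    bool data_written{false};
    std::promise<void> done; // fired by the data-write completion path
};

// Drop requests that already finished, then block until the rest complete
void wait_for_data_writes(std::vector<pending_req*>& reqs) {
    std::vector<std::future<void>> futs;
    reqs.erase(std::remove_if(reqs.begin(), reqs.end(),
                              [&futs](pending_req* r) {
                                  if (r->data_written) { return true; } // nothing to wait for
                                  futs.push_back(r->done.get_future());
                                  return false;
                              }),
               reqs.end());
    for (auto& f : futs) { f.wait(); }
}
```
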
+folly::Future< std::error_code > RaftReplDev::async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size,
+                                                         bool part_of_batch) {
+    return data_service().async_read(bid, sgs, size, part_of_batch);
+}
+
+void RaftReplDev::async_free_blks(int64_t, MultiBlkId const& bid) {
+    // TODO: When timeline consistency is required, we should retain the blkid that is changed and write that to
+    // another journal.
+    data_service().async_free_blk(bid);
+}
+
+bool RaftReplDev::is_leader() const { return m_repl_svc_ctx->is_raft_leader(); }
+
+uint32_t RaftReplDev::get_blk_size() const { return data_service().get_blk_size(); }
+
+nuraft_mesg::repl_service_ctx* RaftReplDev::group_msg_service() { return m_repl_svc_ctx.get(); }
+nuraft::raft_server* RaftReplDev::raft_server() { return m_repl_svc_ctx->_server; }
+
+/////////////////////////////////// Config Serialize/Deserialize Section ////////////////////////////////////
+static nlohmann::json serialize_server_config(std::list< nuraft::ptr< nuraft::srv_config > > const& server_list) {
+    auto servers = nlohmann::json::array();
+    for (auto const& server_conf : server_list) {
+        if (!server_conf) { continue; }
+        servers.push_back(nlohmann::json{{"id", server_conf->get_id()},
+                                         {"dc_id", server_conf->get_dc_id()},
+                                         {"endpoint", server_conf->get_endpoint()},
+                                         {"aux", server_conf->get_aux()},
+                                         {"learner", server_conf->is_learner()},
+                                         {"priority", server_conf->get_priority()}});
+    }
+    return servers;
+}
+
+static nlohmann::json serialize_cluster_config(const nuraft::cluster_config& config) {
+    return nlohmann::json{{"log_idx", config.get_log_idx()},
+                          {"prev_log_idx", config.get_prev_log_idx()},
+                          {"eventual_consistency", config.is_async_replication()},
+                          {"user_ctx", config.get_user_ctx()},
+                          {"servers", serialize_server_config(config.get_servers())}};
+}
+
+static nuraft::ptr< nuraft::srv_config > deserialize_server_config(nlohmann::json const& server) {
+    DEBUG_ASSERT(server.contains("id"), "Missing field")
+    auto const id = static_cast< int32_t >(server["id"]);
+    DEBUG_ASSERT(server.contains("dc_id"), "Missing field")
+    auto const dc_id = static_cast< int32_t >(server["dc_id"]);
+    DEBUG_ASSERT(server.contains("endpoint"), "Missing field")
+    auto const endpoint = server["endpoint"];
+    DEBUG_ASSERT(server.contains("aux"), "Missing field")
+    auto const aux = server["aux"];
+    DEBUG_ASSERT(server.contains("learner"), "Missing field")
+    auto const learner = server["learner"];
+    DEBUG_ASSERT(server.contains("priority"), "Missing field")
+    auto const prior = static_cast< int32_t >(server["priority"]);
+    return nuraft::cs_new< nuraft::srv_config >(id, dc_id, endpoint, aux, learner, prior);
+}
+
+static void deserialize_server_list(nlohmann::json const& servers,
+                                    std::list< nuraft::ptr< nuraft::srv_config > >& server_list) {
+    for (auto const& server_conf : servers) {
+        server_list.push_back(deserialize_server_config(server_conf));
+    }
+}
+
+nuraft::ptr< nuraft::cluster_config > deserialize_cluster_config(nlohmann::json const& cluster_config) {
+    DEBUG_ASSERT(cluster_config.contains("log_idx"), "Missing field")
+    auto const& log_idx = cluster_config["log_idx"];
+    DEBUG_ASSERT(cluster_config.contains("prev_log_idx"), "Missing field")
+    auto const& prev_log_idx = cluster_config["prev_log_idx"];
+    DEBUG_ASSERT(cluster_config.contains("eventual_consistency"), "Missing field")
+    auto const& eventual = cluster_config["eventual_consistency"];
+
+    auto raft_config = nuraft::cs_new< nuraft::cluster_config >(log_idx, prev_log_idx, eventual);
+    DEBUG_ASSERT(cluster_config.contains("user_ctx"), "Missing field")
+    raft_config->set_user_ctx(cluster_config["user_ctx"]);
+    DEBUG_ASSERT(cluster_config.contains("servers"), "Missing field")
+    deserialize_server_list(cluster_config["servers"], raft_config->get_servers());
+    return raft_config;
+}
+
+nuraft::ptr< nuraft::cluster_config > RaftReplDev::load_config() {
+    std::unique_lock lg{m_config_mtx};
+    auto& js = *m_raft_config_sb;
+
+    if (!js.contains("config")) {
+        auto cluster_conf = nuraft::cs_new< nuraft::cluster_config >();
+        cluster_conf->get_servers().push_back(
+            nuraft::cs_new< nuraft::srv_config >(m_raft_server_id, my_replica_id_str()));
+        js["config"] = serialize_cluster_config(*cluster_conf);
+    }
+    return deserialize_cluster_config(js["config"]);
+}
+
+void RaftReplDev::save_config(const nuraft::cluster_config& config) {
+    std::unique_lock lg{m_config_mtx};
+    (*m_raft_config_sb)["config"] = serialize_cluster_config(config);
+    m_raft_config_sb.write();
+}
+
+void RaftReplDev::save_state(const nuraft::srv_state& state) {
+    std::unique_lock lg{m_config_mtx};
+    (*m_raft_config_sb)["state"] = nlohmann::json{{"term", state.get_term()}, {"voted_for", state.get_voted_for()}};
+    m_raft_config_sb.write();
+}
+
+nuraft::ptr< nuraft::srv_state > RaftReplDev::read_state() {
+    std::unique_lock lg{m_config_mtx};
+    auto& js = *m_raft_config_sb;
+    auto state = nuraft::cs_new< nuraft::srv_state >();
+    if (js["state"].empty()) {
+        js["state"] = nlohmann::json{{"term", state->get_term()}, {"voted_for", state->get_voted_for()}};
+    } else {
+        try {
+            state->set_term(uint64_cast(js["state"]["term"]));
+            state->set_voted_for(static_cast< int >(js["state"]["voted_for"]));
+        } catch (std::out_of_range const&) {
+            LOGWARN("State data was not in the expected format [group_id={}]!", m_group_id)
+        }
+    }
+    return state;
+}
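
The config persistence above is a plain nlohmann::json round trip over the nuraft objects. A minimal stand-in showing the same shape (server_cfg is illustrative, not the nuraft::srv_config API; at() throws where the real code uses DEBUG_ASSERT):

```cpp
#include <cstdint>
#include <string>
#include <nlohmann/json.hpp>

struct server_cfg { // illustrative stand-in for nuraft::srv_config
    int32_t id;
    std::string endpoint;
    bool learner;
    int32_t priority;
};

nlohmann::json to_json(server_cfg const& s) {
    return nlohmann::json{{"id", s.id}, {"endpoint", s.endpoint}, {"learner", s.learner}, {"priority", s.priority}};
}

server_cfg from_json(nlohmann::json const& j) {
    return server_cfg{j.at("id").get<int32_t>(), j.at("endpoint").get<std::string>(), j.at("learner").get<bool>(),
                      j.at("priority").get<int32_t>()};
}
```
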
+nuraft::ptr< nuraft::log_store > RaftReplDev::load_log_store() { return m_data_journal; }
+
+int32_t RaftReplDev::server_id() { return m_raft_server_id; }
+
+/////////////////////////////////// nuraft_mesg::mesg_state_mgr overrides ////////////////////////////////////
+uint32_t RaftReplDev::get_logstore_id() const { return m_data_journal->logstore_id(); }
+
+std::shared_ptr< nuraft::state_machine > RaftReplDev::get_state_machine() { return m_state_machine; }
+
+void RaftReplDev::permanent_destroy() {
+    // TODO: Implement this
+}
+void RaftReplDev::leave() {
+    // TODO: Implement this
+}
+
+/////////////////////////////////// Private methods ////////////////////////////////////
+void RaftReplDev::report_committed(repl_req_ptr_t rreq) {
+    auto prev_lsn = m_commit_upto_lsn.exchange(rreq->lsn);
+    RD_DBG_ASSERT_GT(rreq->lsn, prev_lsn, "Out of order commit of lsns, it is not expected in RaftReplDev");
+
+    RD_LOG(INFO, "Raft channel: Commit rreq=[{}]", rreq->to_compact_string());
+    m_listener->on_commit(rreq->lsn, rreq->header, rreq->key, rreq->local_blkid, rreq);
+
+    if (!rreq->is_proposer) {
+        rreq->header = sisl::blob{};
+        rreq->key = sisl::blob{};
+        rreq->pkts = sisl::io_blob_list_t{};
+        if (rreq->rpc_data) {
+            rreq->rpc_data->send_response();
+            rreq->rpc_data = nullptr;
+        }
+    }
+}
+
+void RaftReplDev::cp_flush(CP*) {
+    auto lsn = m_commit_upto_lsn.load();
+    if (lsn == m_last_flushed_commit_lsn) {
+        // Not dirtied since last flush, ignore
+        return;
+    }
+
+    m_rd_sb->commit_lsn = lsn;
+    m_rd_sb->checkpoint_lsn = lsn;
+    m_rd_sb.write();
+    m_last_flushed_commit_lsn = lsn;
+}
+
+void RaftReplDev::cp_cleanup(CP*) {}
+} // namespace homestore
diff --git a/src/lib/replication/repl_dev/raft_repl_dev.h b/src/lib/replication/repl_dev/raft_repl_dev.h
new file mode 100644
index 000000000..64336d9a9
--- /dev/null
+++ b/src/lib/replication/repl_dev/raft_repl_dev.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "replication/repl_dev/common.h"
+#include "replication/repl_dev/raft_state_machine.h"
+#include "replication/log_store/repl_log_store.h"
+
+namespace homestore {
+
+#pragma pack(1)
+struct raft_repl_dev_superblk : public repl_dev_superblk {
+    static constexpr uint32_t RAFT_REPL_DEV_SB_VERSION = 1;
+
+    uint32_t raft_sb_version{RAFT_REPL_DEV_SB_VERSION};
+    logstore_id_t free_blks_journal_id; // Logstore id for storing free blkid records
+    uint8_t is_timeline_consistent;     // Flag to indicate whether the recovery of followers needs to be timeline consistent
+    uint64_t last_applied_dsn;          // Last applied data sequence number
+
+    uint32_t get_raft_sb_version() const { return raft_sb_version; }
+};
+#pragma pack()
+
+using raft_buf_ptr_t = nuraft::ptr< nuraft::buffer >;
+
+class RaftReplService;
+class CP;
+class RaftReplDev : public ReplDev, public nuraft_mesg::mesg_state_mgr {
+private:
+    shared< RaftStateMachine > m_state_machine;
+    RaftReplService& m_repl_svc;
+    folly::ConcurrentHashMap< repl_key, repl_req_ptr_t, repl_key::Hasher > m_repl_key_req_map;
+    nuraft_mesg::Manager& m_msg_mgr;
+    group_id_t m_group_id;     // Replication Group id
+    std::string m_rdev_name;   // Short name for the group for easy debugging
+    replica_id_t m_my_repl_id; // This replica's uuid
+    int32_t m_raft_server_id;  // Server ID used by raft (unique within raft group)
+    shared< ReplLogStore > m_data_journal;
+    shared< HomeLogStore > m_free_blks_journal;
+
+    std::mutex m_config_mtx;
+    superblk< raft_repl_dev_superblk > m_rd_sb;        // Superblk where we store the state machine etc
+    json_superblk m_raft_config_sb;                    // Raft Context and Config data information stored
+    mutable folly::SharedMutexWritePriority m_sb_lock; // Lock to protect staged sb and persisting sb
+    raft_repl_dev_superblk m_sb_in_mem;                // Cached version which is used to read and for staging
+
+    std::atomic< repl_lsn_t > m_commit_upto_lsn{0}; // LSN which was last written, to track flushes
+    repl_lsn_t m_last_flushed_commit_lsn{0};        // LSN up to which it was flushed to persistent store
+    iomgr::timer_handle_t m_sb_flush_timer_hdl;
+
+    std::atomic< uint64_t > m_next_dsn{0}; // Data Sequence Number that will keep incrementing for each data entry
+
+    static std::atomic< uint64_t > s_next_group_ordinal;
+
+public:
+    friend class
RaftStateMachine; + + RaftReplDev(RaftReplService& svc, superblk< raft_repl_dev_superblk >&& rd_sb, bool load_existing); + virtual ~RaftReplDev() = default; + + void destroy(); + + //////////////// All ReplDev overrides/implementation /////////////////////// + void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, + repl_req_ptr_t ctx) override; + folly::Future< std::error_code > async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch = false) override; + void async_free_blks(int64_t lsn, MultiBlkId const& blkid) override; + bool is_leader() const override; + group_id_t group_id() const override { return m_group_id; } + std::string group_id_str() const { return boost::uuids::to_string(m_group_id); } + std::string rdev_name() const { return m_rdev_name; } + std::string my_replica_id_str() const { return boost::uuids::to_string(m_my_repl_id); } + uint32_t get_blk_size() const override; + repl_lsn_t get_last_commit_lsn() const { return m_commit_upto_lsn.load(); } + + //////////////// Accessor/shortcut methods /////////////////////// + nuraft_mesg::repl_service_ctx* group_msg_service(); + nuraft::raft_server* raft_server(); + + //////////////// Methods needed for other Raft classes to access ///////////////// + void use_config(json_superblk raft_config_sb); + void report_committed(repl_req_ptr_t rreq); + repl_req_ptr_t follower_create_req(repl_key const& rkey, sisl::blob const& user_header, sisl::blob const& user_key, + uint32_t data_size); + AsyncNotify notify_after_data_written(std::vector< repl_req_ptr_t >* rreqs); + void cp_flush(CP* cp); + void cp_cleanup(CP* cp); + +protected: + //////////////// All nuraft::state_mgr overrides /////////////////////// + nuraft::ptr< nuraft::cluster_config > load_config() override; + void save_config(const nuraft::cluster_config& config) override; + void save_state(const nuraft::srv_state& state) override; + nuraft::ptr< nuraft::srv_state > read_state() override; + nuraft::ptr< nuraft::log_store > load_log_store() override; + int32_t server_id() override; + void system_exit(const int exit_code) override { LOGINFO("System exiting with code [{}]", exit_code); } + + //////////////// All nuraft_mesg::mesg_state_mgr overrides /////////////////////// + uint32_t get_logstore_id() const override; + std::shared_ptr< nuraft::state_machine > get_state_machine() override; + void permanent_destroy() override; + void leave() override; + +private: + shared< nuraft::log_store > data_journal() { return m_data_journal; } + void push_data_to_all_followers(repl_req_ptr_t rreq); + void on_push_data_received(intrusive< sisl::GenericRpcData >& rpc_data); +}; + +} // namespace homestore diff --git a/src/lib/replication/repl_dev/raft_state_machine.cpp b/src/lib/replication/repl_dev/raft_state_machine.cpp new file mode 100644 index 000000000..e25f965f3 --- /dev/null +++ b/src/lib/replication/repl_dev/raft_state_machine.cpp @@ -0,0 +1,184 @@ +#include +#include +#include +#include + +#include "repl_dev/raft_state_machine.h" +#include "repl_dev/raft_repl_dev.h" + +SISL_LOGGING_DECL(replication) + +namespace homestore { + +RaftStateMachine::RaftStateMachine(RaftReplDev& rd) : m_rd{rd} { + m_success_ptr = nuraft::buffer::alloc(sizeof(int)); + m_success_ptr->put(0); +} + +raft_buf_ptr_t RaftStateMachine::pre_commit_ext(nuraft::state_machine::ext_op_params const& params) { + // Leader precommit is processed in next callback, because this callback doesn't provide a way to stick a context + // which could 
contain the req structure in it. + if (!m_rd.is_leader()) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + raft_buf_ptr_t data = params.data; + + repl_req_ptr_t rreq = lsn_to_req(lsn); + RD_LOG(INFO, "Raft channel: Precommit rreq=[{}]", rreq->to_compact_string()); + m_rd.m_listener->on_pre_commit(rreq->lsn, rreq->header, rreq->key, rreq); + } + return m_success_ptr; +} + +void RaftStateMachine::after_precommit_in_leader(nuraft::raft_server::req_ext_cb_params const& params) { + repl_req_ptr_t rreq = repl_req_ptr_t(r_cast< repl_req_ctx* >(params.context)); + link_lsn_to_req(rreq, int64_cast(params.log_idx)); + + RD_LOG(INFO, "Raft Channel: Proposed rreq=[{}]", rreq->to_compact_string()); + m_rd.m_listener->on_pre_commit(rreq->lsn, rreq->header, rreq->key, rreq); +} + +raft_buf_ptr_t RaftStateMachine::commit_ext(nuraft::state_machine::ext_op_params const& params) { + int64_t lsn = s_cast< int64_t >(params.log_idx); + raft_buf_ptr_t data = params.data; + + repl_req_ptr_t rreq = lsn_to_req(lsn); + if (rreq == nullptr) { return m_success_ptr; } + + RD_LOG(INFO, "Raft channel: Received Commit message rreq=[{}]", rreq->to_compact_string()); + if (m_rd.is_leader()) { + // This is the time to ensure flushing of journal happens in leader + if (m_rd.m_data_journal->last_durable_index() < uint64_cast(lsn)) { m_rd.m_data_journal->flush(); } + rreq->state.fetch_or(uint32_cast(repl_req_state_t::LOG_FLUSHED)); + } + if (rreq->state.load() & uint32_cast(repl_req_state_t::DATA_WRITTEN)) { + m_lsn_req_map.erase(rreq->lsn); + m_rd.report_committed(rreq); + } + return m_success_ptr; +} + +uint64_t RaftStateMachine::last_commit_index() { return uint64_cast(m_rd.get_last_commit_lsn()); } + +void RaftStateMachine::propose_to_raft(repl_req_ptr_t rreq) { + uint32_t val_size = rreq->value.size ? rreq->local_blkid.serialized_size() : 0; + uint32_t entry_size = sizeof(repl_journal_entry) + rreq->header.size() + rreq->key.size() + val_size; + rreq->alloc_journal_entry(entry_size, true /* raft_buf */); + rreq->journal_entry->code = (rreq->value.size) ? 
journal_type_t::HS_LARGE_DATA : journal_type_t::HS_HEADER_ONLY; + rreq->journal_entry->server_id = m_rd.server_id(); + rreq->journal_entry->dsn = rreq->dsn(); + rreq->journal_entry->user_header_size = rreq->header.size(); + rreq->journal_entry->key_size = rreq->key.size(); + rreq->journal_entry->value_size = val_size; + + rreq->is_proposer = true; + uint8_t* raw_ptr = uintptr_cast(rreq->journal_entry) + sizeof(repl_journal_entry); + if (rreq->header.size()) { + std::memcpy(raw_ptr, rreq->header.cbytes(), rreq->header.size()); + raw_ptr += rreq->header.size(); + } + + if (rreq->key.size()) { + std::memcpy(raw_ptr, rreq->key.cbytes(), rreq->key.size()); + raw_ptr += rreq->key.size(); + } + + if (rreq->value.size) { + auto const b = rreq->local_blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); + } + + auto* vec = sisl::VectorPool< raft_buf_ptr_t >::alloc(); + vec->push_back(rreq->raft_journal_buf()); + + nuraft::raft_server::req_ext_params param; + param.after_precommit_ = bind_this(RaftStateMachine::after_precommit_in_leader, 1); + param.expected_term_ = 0; + param.context_ = voidptr_cast(rreq.get()); + + RD_LOG(TRACE, "Raft Channel: journal_entry=[{}] ", rreq->journal_entry->to_string()); + + m_rd.raft_server()->append_entries_ext(*vec, param); + sisl::VectorPool< raft_buf_ptr_t >::free(vec); +} + +repl_req_ptr_t RaftStateMachine::transform_journal_entry(nuraft::ptr< nuraft::log_entry >& lentry) { + // Leader has nothing to transform or process + if (m_rd.is_leader()) { return nullptr; } + + // We don't want to transform anything that is not an app log + if (lentry->get_val_type() != nuraft::log_val_type::app_log) { return nullptr; } + + repl_journal_entry* jentry = r_cast< repl_journal_entry* >(lentry->get_buf().data_begin()); + RELEASE_ASSERT_EQ(jentry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, + "Mismatched version of journal entry received from RAFT peer"); + + RD_LOG(TRACE, "Received Raft log_entry=[term={}], journal_entry=[{}] ", lentry->get_term(), jentry->to_string()); + + // For inline data we don't need to transform anything + if (jentry->code != journal_type_t::HS_LARGE_DATA) { return nullptr; } + + sisl::blob const header = sisl::blob{uintptr_cast(jentry) + sizeof(repl_journal_entry), jentry->user_header_size}; + sisl::blob const key = sisl::blob{header.cbytes() + header.size(), jentry->key_size}; + DEBUG_ASSERT_GT(jentry->value_size, 0, "Entry marked as large data, but value size is notified as 0"); + + // From the repl_key, get the repl_req. In cases where log stream got here first, this method will create a new + // repl_req and return that back. 
Fill in all of the required journal entry fields inside the repl_req
+    auto rreq = m_rd.follower_create_req(
+        repl_key{.server_id = jentry->server_id, .term = lentry->get_term(), .dsn = jentry->dsn}, header, key,
+        jentry->value_size);
+    rreq->journal_buf = lentry->serialize();
+
+    MultiBlkId entry_blkid;
+    entry_blkid.deserialize(sisl::blob{key.cbytes() + key.size(), jentry->value_size}, true /* copy */);
+    rreq->remote_blkid = RemoteBlkId{jentry->server_id, entry_blkid};
+
+    auto const local_size = rreq->local_blkid.serialized_size();
+    auto const remote_size = entry_blkid.serialized_size();
+    uint8_t* blkid_location;
+    if (local_size > remote_size) {
+        // We need to copy the entire log_entry to accommodate the local blkid
+        auto new_buf = nuraft::buffer::expand(*rreq->raft_journal_buf(),
+                                              rreq->raft_journal_buf()->size() + local_size - remote_size);
+        blkid_location = uintptr_cast(new_buf->data_begin()) + rreq->raft_journal_buf()->size() - jentry->value_size;
+        rreq->journal_buf = std::move(new_buf);
+    } else {
+        // Can do in-place replace of remote blkid with local blkid.
+        blkid_location = uintptr_cast(rreq->raft_journal_buf()->data_begin()) + rreq->raft_journal_buf()->size() -
+            jentry->value_size;
+    }
+    std::memcpy(blkid_location, rreq->local_blkid.serialize().cbytes(), local_size);
+    rreq->journal_entry = r_cast< repl_journal_entry* >(rreq->raft_journal_buf()->data_begin());
+
+    return rreq;
+}
+
+void RaftStateMachine::link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn) {
+    rreq->lsn = lsn;
+    rreq->state.fetch_or(uint32_cast(repl_req_state_t::LOG_RECEIVED));
+    [[maybe_unused]] auto r = m_lsn_req_map.insert(lsn, std::move(rreq));
+    RD_DBG_ASSERT_EQ(r.second, true, "lsn={} already in precommit list", lsn);
+}
+
+repl_req_ptr_t RaftStateMachine::lsn_to_req(int64_t lsn) {
+    // Pull the req from the lsn
+    auto const it = m_lsn_req_map.find(lsn);
+    // RD_DBG_ASSERT(it != m_lsn_req_map.cend(), "lsn req map missing lsn={}", lsn);
+    if (it == m_lsn_req_map.cend()) { return nullptr; }
+
+    repl_req_ptr_t rreq = it->second;
+    RD_DBG_ASSERT_EQ(lsn, rreq->lsn, "lsn req map mismatch");
+    return rreq;
+}
+
+nuraft_mesg::repl_service_ctx* RaftStateMachine::group_msg_service() { return m_rd.group_msg_service(); }
+
+void RaftStateMachine::create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) {
+    RD_LOG(DEBUG, "create_snapshot {}/{}", s.get_last_log_idx(), s.get_last_log_term());
+    auto null_except = std::shared_ptr< std::exception >();
+    auto ret_val{false};
+    if (when_done) when_done(ret_val, null_except);
+}
+
+std::string RaftStateMachine::rdev_name() const { return m_rd.rdev_name(); }
+} // namespace homestore
diff --git a/src/lib/replication/repl_dev/raft_state_machine.h b/src/lib/replication/repl_dev/raft_state_machine.h
new file mode 100644
index 000000000..c341ebd3b
--- /dev/null
+++ b/src/lib/replication/repl_dev/raft_state_machine.h
@@ -0,0 +1,123 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "replication/repl_dev/common.h"
+
+#if defined __clang__ or defined __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
+#include
+#if defined __clang__ or defined __GNUC__
+#pragma GCC diagnostic pop
+#endif
+#undef auto_lock
+
+namespace homestore {
+class ReplicaSetImpl;
+class StateMachineStore;
+
+#define RD_LOG(level, msg, ...) \
+    LOG##level##MOD_FMT(replication, ([&](fmt::memory_buffer& buf, const char* msgcb, auto&&...
args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + msg, ##__VA_ARGS__); + +#define RD_ASSERT_CMP(assert_type, val1, cmp, val2, ...) \ + { \ + assert_type##_ASSERT_CMP( \ + val1, cmp, val2, \ + [&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}:{}] "}, \ + fmt::make_format_args(file_name(__FILE__), __LINE__)); \ + sisl::logging::default_cmp_assert_formatter(buf, msgcb, std::forward< decltype(args) >(args)...); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("rd", rdev_name())); \ + return true; \ + }, \ + ##__VA_ARGS__); \ + } +#define RD_ASSERT(assert_type, cond, ...) \ + { \ + assert_type##_ASSERT_FMT(cond, \ + ([&](fmt::memory_buffer& buf, const char* const msgcb, auto&&... args) -> bool { \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{"[{}={}] "}, \ + fmt::make_format_args("rd", rdev_name())); \ + fmt::vformat_to(fmt::appender{buf}, fmt::string_view{msgcb}, \ + fmt::make_format_args(std::forward< decltype(args) >(args)...)); \ + return true; \ + }), \ + ##__VA_ARGS__); \ + } + +#define RD_DBG_ASSERT(cond, ...) RD_ASSERT(DEBUG, cond, ##__VA_ARGS__) +#define RD_DBG_ASSERT_EQ(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, ==, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_NE(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, !=, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_LT(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, <, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_LE(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, <=, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, >, val2, ##__VA_ARGS__) +#define RD_DBG_ASSERT_GE(val1, val2, ...) RD_ASSERT_CMP(DEBUG, val1, >=, val2, ##__VA_ARGS__) + +#define RD_REL_ASSERT(cond, ...) RD_ASSERT(RELEASE, cond, ##__VA_ARGS__) +#define RD_REL_ASSERT_EQ(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, ==, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_NE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, !=, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_LT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, <, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_LE(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, <=, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_GT(val1, val2, ...) RD_ASSERT_CMP(RELEASE, val1, >, val2, ##__VA_ARGS__) +#define RD_REL_ASSERT_GE(val1, val2, ...) 
RD_ASSERT_CMP(RELEASE, val1, >=, val2, ##__VA_ARGS__) + +using AsyncNotify = folly::SemiFuture< folly::Unit >; +using AsyncNotifier = folly::Promise< folly::Unit >; + +class RaftReplDev; +class RaftStateMachine : public nuraft::state_machine { +private: + folly::ConcurrentHashMap< int64_t, repl_req_ptr_t > m_lsn_req_map; + RaftReplDev& m_rd; + nuraft::ptr< nuraft::buffer > m_success_ptr; // Preallocate the success return to raft + // iomgr::timer_handle_t m_wait_blkid_write_timer_hdl{iomgr::null_timer_handle}; + bool m_resync_mode{false}; + +public: + RaftStateMachine(RaftReplDev& rd); + ~RaftStateMachine() override = default; + RaftStateMachine(RaftStateMachine const&) = delete; + RaftStateMachine& operator=(RaftStateMachine const&) = delete; + + /// NuRaft overrides + uint64_t last_commit_index() override; + raft_buf_ptr_t pre_commit_ext(const nuraft::state_machine::ext_op_params& params) override; + raft_buf_ptr_t commit_ext(const nuraft::state_machine::ext_op_params& params) override; + void rollback(uint64_t lsn, nuraft::buffer&) override { LOGCRITICAL("Unimplemented rollback on: [{}]", lsn); } + + bool apply_snapshot(nuraft::snapshot&) override { return false; } + void create_snapshot(nuraft::snapshot& s, nuraft::async_result< bool >::handler_type& when_done) override; + nuraft::ptr< nuraft::snapshot > last_snapshot() override { return nullptr; } + + ////////// APIs outside of nuraft::state_machine requirements //////////////////// + void propose_to_raft(repl_req_ptr_t rreq); + repl_req_ptr_t transform_journal_entry(nuraft::ptr< nuraft::log_entry >& lentry); + void link_lsn_to_req(repl_req_ptr_t rreq, int64_t lsn); + repl_req_ptr_t lsn_to_req(int64_t lsn); + nuraft_mesg::repl_service_ctx* group_msg_service(); + + std::string rdev_name() const; + +private: + void after_precommit_in_leader(const nuraft::raft_server::req_ext_cb_params& params); +}; + +} // namespace homestore diff --git a/src/lib/replication/repl_dev/solo_repl_dev.cpp b/src/lib/replication/repl_dev/solo_repl_dev.cpp index 35d3e6931..57aa63def 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.cpp +++ b/src/lib/replication/repl_dev/solo_repl_dev.cpp @@ -1,12 +1,14 @@ +#include +#include "replication/repl_dev/solo_repl_dev.h" +#include "replication/repl_dev/common.h" #include #include #include #include "common/homestore_assert.hpp" -#include "replication/repl_dev/solo_repl_dev.h" namespace homestore { SoloReplDev::SoloReplDev(superblk< repl_dev_superblk >&& rd_sb, bool load_existing) : - m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->gid} { + m_rd_sb{std::move(rd_sb)}, m_group_id{m_rd_sb->group_id} { if (load_existing) { logstore_service().open_log_store(LogStoreService::DATA_LOG_FAMILY_IDX, m_rd_sb->data_journal_id, true, bind_this(SoloReplDev::on_data_journal_created, 1)); @@ -25,8 +27,8 @@ void SoloReplDev::on_data_journal_created(shared< HomeLogStore > log_store) { } void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - intrusive< repl_req_ctx > rreq) { - if (!rreq) { auto rreq = intrusive< repl_req_ctx >(new repl_req_ctx{}); } + repl_req_ptr_t rreq) { + if (!rreq) { auto rreq = repl_req_ptr_t(new repl_req_ctx{}); } rreq->header = header; rreq->key = key; rreq->value = std::move(value); @@ -35,7 +37,8 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& if (rreq->value.size) { // Step 1: Alloc Blkid auto status = data_service().alloc_blks(uint32_cast(rreq->value.size), - m_listener->get_blk_alloc_hints(rreq->header, rreq), 
rreq->local_blkid); + m_listener->get_blk_alloc_hints(rreq->header, rreq->value.size), + rreq->local_blkid); HS_REL_ASSERT_EQ(status, BlkAllocStatus::SUCCESS); // Write the data @@ -50,32 +53,32 @@ void SoloReplDev::async_alloc_write(sisl::blob const& header, sisl::blob const& } } -void SoloReplDev::write_journal(intrusive< repl_req_ctx > rreq) { - uint32_t entry_size = sizeof(repl_journal_entry) + rreq->header.size + rreq->key.size + +void SoloReplDev::write_journal(repl_req_ptr_t rreq) { + uint32_t entry_size = sizeof(repl_journal_entry) + rreq->header.size() + rreq->key.size() + (rreq->value.size ? rreq->local_blkid.serialized_size() : 0); - rreq->alloc_journal_entry(entry_size); - rreq->journal_entry->code = journal_type_t::HS_DATA; - rreq->journal_entry->user_header_size = rreq->header.size; - rreq->journal_entry->key_size = rreq->key.size; + rreq->alloc_journal_entry(entry_size, false /* is_raft_buf */); + rreq->journal_entry->code = journal_type_t::HS_LARGE_DATA; + rreq->journal_entry->user_header_size = rreq->header.size(); + rreq->journal_entry->key_size = rreq->key.size(); uint8_t* raw_ptr = uintptr_cast(rreq->journal_entry) + sizeof(repl_journal_entry); - if (rreq->header.size) { - std::memcpy(raw_ptr, rreq->header.bytes, rreq->header.size); - raw_ptr += rreq->header.size; + if (rreq->header.size()) { + std::memcpy(raw_ptr, rreq->header.cbytes(), rreq->header.size()); + raw_ptr += rreq->header.size(); } - if (rreq->key.size) { - std::memcpy(raw_ptr, rreq->key.bytes, rreq->key.size); - raw_ptr += rreq->key.size; + if (rreq->key.size()) { + std::memcpy(raw_ptr, rreq->key.cbytes(), rreq->key.size()); + raw_ptr += rreq->key.size(); } if (rreq->value.size) { - auto b = rreq->local_blkid.serialize(); - std::memcpy(raw_ptr, b.bytes, b.size); - raw_ptr += b.size; + auto const b = rreq->local_blkid.serialize(); + std::memcpy(raw_ptr, b.cbytes(), b.size()); + raw_ptr += b.size(); } - m_data_journal->append_async(sisl::io_blob{rreq->journal_buf.get(), entry_size, false /* is_aligned */}, + m_data_journal->append_async(sisl::io_blob{rreq->raw_journal_buf(), entry_size, false /* is_aligned */}, nullptr /* cookie */, [this, rreq](int64_t lsn, sisl::io_blob&, homestore::logdev_key, void*) mutable { rreq->lsn = lsn; @@ -90,13 +93,13 @@ void SoloReplDev::write_journal(intrusive< repl_req_ctx > rreq) { } void SoloReplDev::on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx) { - repl_journal_entry* entry = r_cast< repl_journal_entry* >(buf.bytes()); + repl_journal_entry const* entry = r_cast< repl_journal_entry const* >(buf.bytes()); uint32_t remain_size = buf.size() - sizeof(repl_journal_entry); HS_REL_ASSERT_EQ(entry->major_version, repl_journal_entry::JOURNAL_ENTRY_MAJOR, "Mismatched version of journal entry found"); - HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_DATA, "Found a journal entry which is not data"); + HS_REL_ASSERT_EQ(entry->code, journal_type_t::HS_LARGE_DATA, "Found a journal entry which is not data"); - uint8_t* raw_ptr = r_cast< uint8_t* >(entry) + sizeof(repl_journal_entry); + uint8_t const* raw_ptr = r_cast< uint8_t const* >(entry) + sizeof(repl_journal_entry); sisl::blob header{raw_ptr, entry->user_header_size}; HS_REL_ASSERT_GE(remain_size, entry->user_header_size, "Invalid journal entry, header_size mismatch"); raw_ptr += entry->user_header_size; @@ -137,12 +140,5 @@ void SoloReplDev::cp_flush(CP*) { void SoloReplDev::cp_cleanup(CP*) { m_data_journal->truncate(m_rd_sb->checkpoint_lsn); } -void repl_req_ctx::alloc_journal_entry(uint32_t size) { - journal_buf 
= std::unique_ptr< uint8_t[] >(new uint8_t[size]); - journal_entry = new (journal_buf.get()) repl_journal_entry(); -} - -repl_req_ctx::~repl_req_ctx() { - if (journal_entry) { journal_entry->~repl_journal_entry(); } -} } // namespace homestore + diff --git a/src/lib/replication/repl_dev/solo_repl_dev.h b/src/lib/replication/repl_dev/solo_repl_dev.h index 1ea2367b1..331003f4a 100644 --- a/src/lib/replication/repl_dev/solo_repl_dev.h +++ b/src/lib/replication/repl_dev/solo_repl_dev.h @@ -14,6 +14,7 @@ *********************************************************************************/ #pragma once +#include #include #include @@ -21,45 +22,9 @@ #include #include -namespace homestore { -#pragma pack(1) -struct repl_dev_superblk { - static constexpr uint64_t REPL_DEV_SB_MAGIC = 0xABCDF00D; - static constexpr uint32_t REPL_DEV_SB_VERSION = 1; - - uint64_t magic{REPL_DEV_SB_MAGIC}; - uint32_t version{REPL_DEV_SB_VERSION}; - uuid_t gid; // gid of this replica set - logstore_id_t data_journal_id; // Logstore id for the data journal - int64_t commit_lsn; // LSN upto which this replica has committed - int64_t checkpoint_lsn; // LSN upto which this replica have checkpointed the data - -#if 0 - logstore_id_t free_pba_store_id; // Logstore id for storing free pba records -#endif - - uint64_t get_magic() const { return magic; } - uint32_t get_version() const { return version; } -}; -#pragma pack() - -VENUM(journal_type_t, uint16_t, HS_DATA = 0) -struct repl_journal_entry { - static constexpr uint16_t JOURNAL_ENTRY_MAJOR = 1; - static constexpr uint16_t JOURNAL_ENTRY_MINOR = 1; - - // Major and minor version. For each major version underlying structures could change. Minor versions can only add - // fields, not change any existing fields. - uint16_t major_version{JOURNAL_ENTRY_MAJOR}; - uint16_t minor_version{JOURNAL_ENTRY_MINOR}; - - journal_type_t code; - uint32_t replica_id; - uint32_t user_header_size; - uint32_t key_size; - // Followed by user_header, then key, then MultiBlkId -}; +#include "replication/repl_dev/common.h" +namespace homestore { class CP; class SoloReplDev : public ReplDev { @@ -74,7 +39,7 @@ class SoloReplDev : public ReplDev { virtual ~SoloReplDev() = default; void async_alloc_write(sisl::blob const& header, sisl::blob const& key, sisl::sg_list const& value, - intrusive< repl_req_ctx > ctx) override; + repl_req_ptr_t ctx) override; folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false) override; @@ -92,7 +57,7 @@ class SoloReplDev : public ReplDev { private: void on_data_journal_created(shared< HomeLogStore > log_store); - void write_journal(intrusive< repl_req_ctx > rreq); + void write_journal(repl_req_ptr_t rreq); void on_log_found(logstore_seq_num_t lsn, log_buffer buf, void* ctx); }; diff --git a/src/lib/replication/service/generic_repl_svc.cpp b/src/lib/replication/service/generic_repl_svc.cpp new file mode 100644 index 000000000..d169d4ce2 --- /dev/null +++ b/src/lib/replication/service/generic_repl_svc.cpp @@ -0,0 +1,150 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include +#include +#include +#include "common/homestore_assert.hpp" +#include "replication/service/generic_repl_svc.h" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/solo_repl_dev.h" + +namespace homestore { +ReplicationService& repl_service() { return hs()->repl_service(); } + +std::shared_ptr< GenericReplService > GenericReplService::create(cshared< ReplApplication >& repl_app) { + auto impl_type = repl_app->get_impl_type(); + if (impl_type == repl_impl_type::solo) { + return std::make_shared< SoloReplService >(repl_app); + } else if (impl_type == repl_impl_type::server_side) { + return std::make_shared< RaftReplService >(repl_app); + } else { + return nullptr; + } +} + +GenericReplService::GenericReplService(cshared< ReplApplication >& repl_app) : + m_repl_app{repl_app}, m_my_uuid{repl_app->get_my_repl_id()} { + meta_service().register_handler( + get_meta_blk_name(), + [this](meta_blk* mblk, sisl::byte_view buf, size_t) { load_repl_dev(std::move(buf), voidptr_cast(mblk)); }, + nullptr); +} + +void GenericReplService::stop() { + std::unique_lock lg{m_rd_map_mtx}; + m_rd_map.clear(); +} + +ReplResult< shared< ReplDev > > GenericReplService::get_repl_dev(group_id_t group_id) const { + std::shared_lock lg(m_rd_map_mtx); + if (auto it = m_rd_map.find(group_id); it != m_rd_map.end()) { return it->second; } + return folly::makeUnexpected(ReplServiceError::SERVER_NOT_FOUND); +} + +void GenericReplService::iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) { + std::shared_lock lg(m_rd_map_mtx); + for (const auto& [uuid, rd] : m_rd_map) { + cb(rd); + } +} + +void GenericReplService::add_repl_dev(group_id_t group_id, shared< ReplDev > rdev) { + std::unique_lock lg(m_rd_map_mtx); + [[maybe_unused]] auto [it, happened] = m_rd_map.emplace(std::pair{group_id, rdev}); + HS_DBG_ASSERT(happened, "Unable to put the repl_dev in rd map for group_id={}, duplicated add?", group_id); +} + +hs_stats GenericReplService::get_cap_stats() const { + hs_stats stats; + stats.total_capacity = data_service().get_total_capacity(); + stats.used_capacity = data_service().get_used_capacity(); + return stats; +} + +///////////////////// SoloReplService specializations and CP Callbacks ///////////////////////////// +SoloReplService::SoloReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} {} + +void SoloReplService::start() { + // Register to CP to flush the super blk and truncate the logstore + hs()->cp_mgr().register_consumer(cp_consumer_t::REPLICATION_SVC, std::make_unique< SoloReplServiceCPHandler >()); +} + +AsyncReplResult< shared< ReplDev > > SoloReplService::create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) { + superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()}; + rd_sb.create(); + rd_sb->group_id = group_id; + auto rdev = std::make_shared< SoloReplDev >(std::move(rd_sb), false /* load_existing */); + + auto listener = m_repl_app->create_repl_dev_listener(group_id); + 
listener->set_repl_dev(rdev.get());
+    rdev->attach_listener(std::move(listener));
+
+    {
+        std::unique_lock lg(m_rd_map_mtx);
+        auto [it, happened] = m_rd_map.emplace(group_id, rdev);
+        if (!happened) {
+            // We should never reach here: a create for this group_id found no existing repl_dev above, so the
+            // emplace into the map must succeed
+            DEBUG_ASSERT(false, "Unable to put the repl_dev in rd map");
+            return make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_ALREADY_EXISTS);
+        }
+    }
+
+    return make_async_success< shared< ReplDev > >(rdev);
+}
+
+void SoloReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) {
+    superblk< repl_dev_superblk > rd_sb{get_meta_blk_name()};
+    rd_sb.load(buf, meta_cookie);
+    HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch");
+    HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk");
+    group_id_t group_id = rd_sb->group_id;
+    auto rdev = std::make_shared< SoloReplDev >(std::move(rd_sb), true /* load_existing */);
+
+    auto listener = m_repl_app->create_repl_dev_listener(group_id);
+    listener->set_repl_dev(rdev.get());
+    rdev->attach_listener(std::move(listener));
+
+    {
+        std::unique_lock lg(m_rd_map_mtx);
+        auto [_, happened] = m_rd_map.emplace(group_id, rdev);
+        (void)happened;
+        HS_DBG_ASSERT(happened, "Unable to put the repl_dev in rd map for group_id={}", group_id);
+    }
+}
+
+AsyncReplResult<> SoloReplService::replace_member(group_id_t group_id, replica_id_t member_out,
+                                                  replica_id_t member_in) const {
+    return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED);
+}
+
+std::unique_ptr< CPContext > SoloReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; }
+
+folly::Future< bool > SoloReplServiceCPHandler::cp_flush(CP* cp) {
+    repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) {
+        if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); }
+    });
+    return folly::makeFuture< bool >(true);
+}
+
+void SoloReplServiceCPHandler::cp_cleanup(CP* cp) {
+    repl_service().iterate_repl_devs([cp](cshared< ReplDev >& repl_dev) {
+        if (repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); }
+    });
+}
+
+int SoloReplServiceCPHandler::cp_progress_percent() { return 100; }
+
+} // namespace homestore
diff --git a/src/lib/replication/service/generic_repl_svc.h b/src/lib/replication/service/generic_repl_svc.h
new file mode 100644
index 000000000..64b8ea47a
--- /dev/null
+++ b/src/lib/replication/service/generic_repl_svc.h
@@ -0,0 +1,88 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ * + *********************************************************************************/ +#pragma once +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace homestore { + +static std::string const PUSH_DATA{"push_data"}; +static std::string const FETCH_DATA{"fetch_data"}; + +struct repl_dev_superblk; +class GenericReplService : public ReplicationService { +protected: + shared< ReplApplication > m_repl_app; + std::shared_mutex m_rd_map_mtx; + std::map< group_id_t, shared< ReplDev > > m_rd_map; + replica_id_t m_my_uuid; + +public: + static std::shared_ptr< GenericReplService > create(cshared< ReplApplication >& repl_app); + + GenericReplService(cshared< ReplApplication >& repl_app); + virtual void start() = 0; + virtual void stop(); + meta_sub_type get_meta_blk_name() const override { return "repl_dev"; } + + ReplResult< shared< ReplDev > > get_repl_dev(group_id_t group_id) const override; + void iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) override; + + hs_stats get_cap_stats() const override; + replica_id_t get_my_repl_uuid() const { return m_my_uuid; } + +protected: + virtual void add_repl_dev(group_id_t group_id, shared< ReplDev > rdev); + virtual void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) = 0; +}; + +class SoloReplService : public GenericReplService { +public: + SoloReplService(cshared< ReplApplication >& repl_app); + void start() override; + + AsyncReplResult< shared< ReplDev > > create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) override; + void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; + AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, + replica_id_t member_in) const override; +}; + +class SoloReplServiceCPHandler : public CPCallbacks { +public: + SoloReplServiceCPHandler() = default; + virtual ~SoloReplServiceCPHandler() = default; + + std::unique_ptr< CPContext > on_switchover_cp(CP* cur_cp, CP* new_cp) override; + folly::Future< bool > cp_flush(CP* cp) override; + void cp_cleanup(CP* cp) override; + int cp_progress_percent() override; +}; + +extern ReplicationService& repl_service(); +} // namespace homestore diff --git a/src/lib/replication/service/raft_repl_service.cpp b/src/lib/replication/service/raft_repl_service.cpp new file mode 100644 index 000000000..d25b49bc6 --- /dev/null +++ b/src/lib/replication/service/raft_repl_service.cpp @@ -0,0 +1,240 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ * + *********************************************************************************/ +#include +#include +#include + +#include +#include "common/homestore_config.hpp" +#include "common/homestore_assert.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" + +namespace homestore { +ReplServiceError RaftReplService::to_repl_error(nuraft::cmd_result_code code) { + ReplServiceError ret; + if (code == nuraft::cmd_result_code::OK) { + ret = ReplServiceError::OK; + } else if (code == nuraft::cmd_result_code::CANCELLED) { + ret = ReplServiceError::CANCELLED; + } else if (code == nuraft::cmd_result_code::TIMEOUT) { + ret = ReplServiceError::TIMEOUT; + } else if (code == nuraft::cmd_result_code::NOT_LEADER) { + ret = ReplServiceError::NOT_LEADER; + } else if (code == nuraft::cmd_result_code::BAD_REQUEST) { + ret = ReplServiceError::BAD_REQUEST; + } else if (code == nuraft::cmd_result_code::SERVER_ALREADY_EXISTS) { + ret = ReplServiceError::SERVER_ALREADY_EXISTS; + } else if (code == nuraft::cmd_result_code::CONFIG_CHANGING) { + ret = ReplServiceError::CONFIG_CHANGING; + } else if (code == nuraft::cmd_result_code::SERVER_IS_JOINING) { + ret = ReplServiceError::SERVER_IS_JOINING; + } else if (code == nuraft::cmd_result_code::SERVER_NOT_FOUND) { + ret = ReplServiceError::SERVER_NOT_FOUND; + } else if (code == nuraft::cmd_result_code::CANNOT_REMOVE_LEADER) { + ret = ReplServiceError::CANNOT_REMOVE_LEADER; + } else if (code == nuraft::cmd_result_code::SERVER_IS_LEAVING) { + ret = ReplServiceError::SERVER_IS_LEAVING; + } else if (code == nuraft::cmd_result_code::TERM_MISMATCH) { + ret = ReplServiceError::TERM_MISMATCH; + } else if (code == nuraft::cmd_result_code::RESULT_NOT_EXIST_YET) { + ret = ReplServiceError::RESULT_NOT_EXIST_YET; + } else { + ret = ReplServiceError::FAILED; + } + return ret; +} + +RaftReplService::RaftReplService(cshared< ReplApplication >& repl_app) : GenericReplService{repl_app} { + meta_service().register_handler( + get_meta_blk_name() + "_raft_config", + [this](meta_blk* mblk, sisl::byte_view buf, size_t) { + raft_group_config_found(std::move(buf), voidptr_cast(mblk)); + }, + nullptr, false, std::optional< meta_subtype_vec_t >({get_meta_blk_name()})); +} + +void RaftReplService::start() { + /*auto params = nuraft_mesg::Manager::Params{ + .server_uuid_ = m_my_uuid, + .mesg_port_ = m_repl_app->lookup_peer(m_my_uuid).second, + .default_group_type_ = "homestore_replication", + .ssl_key_ = ioenvironment.get_ssl_key(), + .ssl_cert_ = ioenvironment.get_ssl_cert(), + .token_verifier_ = std::dynamic_pointer_cast< sisl::GrpcTokenVerifier >(ioenvironment.get_token_verifier()), + .token_client_ = std::dynamic_pointer_cast< sisl::GrpcTokenClient >(ioenvironment.get_token_client())};*/ + auto params = nuraft_mesg::Manager::Params(); + params.server_uuid_ = m_my_uuid; + params.mesg_port_ = m_repl_app->lookup_peer(m_my_uuid).second; + params.default_group_type_ = "homestore_replication"; + m_msg_mgr = nuraft_mesg::init_messaging(params, weak_from_this(), true /* with_data_channel */); + + LOGINFOMOD(replication, "Starting RaftReplService with server_uuid={} port={}", + boost::uuids::to_string(params.server_uuid_), params.mesg_port_); + + auto r_params = nuraft::raft_params() + .with_election_timeout_lower(HS_DYNAMIC_CONFIG(consensus.elect_to_low_ms)) + .with_election_timeout_upper(HS_DYNAMIC_CONFIG(consensus.elect_to_high_ms)) + .with_rpc_failure_backoff(HS_DYNAMIC_CONFIG(consensus.rpc_backoff_ms)) + 
.with_hb_interval(HS_DYNAMIC_CONFIG(consensus.heartbeat_period_ms))
+                        .with_max_append_size(HS_DYNAMIC_CONFIG(consensus.max_append_batch_size))
+                        .with_log_sync_batch_size(HS_DYNAMIC_CONFIG(consensus.log_sync_batch_size))
+                        .with_log_sync_stopping_gap(HS_DYNAMIC_CONFIG(consensus.min_log_gap_to_join))
+                        .with_stale_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_hi_threshold))
+                        .with_fresh_log_gap(HS_DYNAMIC_CONFIG(consensus.stale_log_gap_lo_threshold))
+                        .with_snapshot_enabled(HS_DYNAMIC_CONFIG(consensus.snapshot_freq_distance))
+                        .with_reserved_log_items(0) // In reality ReplLogStore retains much more than this
+                        .with_auto_forwarding(false);
+    r_params.return_method_ = nuraft::raft_params::async_handler;
+    m_msg_mgr->register_mgr_type(params.default_group_type_, r_params);
+
+    hs()->cp_mgr().register_consumer(cp_consumer_t::REPLICATION_SVC, std::make_unique< RaftReplServiceCPHandler >());
+}
+
+void RaftReplService::raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie) {
+    json_superblk group_config;
+    auto& js = group_config.load(buf, meta_cookie);
+    std::string gid_str = js["group_id"];
+    RELEASE_ASSERT(!gid_str.empty(), "Invalid raft_group config found");
+
+    boost::uuids::string_generator gen;
+    uuid_t uuid = gen(gid_str);
+
+    auto v = get_repl_dev(uuid);
+    RELEASE_ASSERT(bool(v), "Could not find a repl_dev for this group_id; has the repl_dev superblk not been loaded yet?");
+
+    (std::dynamic_pointer_cast< RaftReplDev >(*v))->use_config(std::move(group_config));
+}
+
+std::string RaftReplService::lookup_peer(nuraft_mesg::peer_id_t const& peer) {
+    auto const p = m_repl_app->lookup_peer(peer);
+    return p.first + ":" + std::to_string(p.second);
+}
+
+shared< nuraft_mesg::mesg_state_mgr > RaftReplService::create_state_mgr(int32_t srv_id,
+                                                                        nuraft_mesg::group_id_t const& group_id) {
+    auto result = get_repl_dev(group_id);
+    if (result) { return std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(result.value()); }
+
+    // Create a new raft superblk
+    superblk< raft_repl_dev_superblk > rd_sb{get_meta_blk_name()};
+    rd_sb.create();
+    rd_sb->group_id = group_id;
+    rd_sb->is_timeline_consistent = m_repl_app->need_timeline_consistency();
+
+    // Create a new instance of Raft ReplDev (which is the state manager this method is looking for)
+    auto rdev = std::make_shared< RaftReplDev >(*this, std::move(rd_sb), false /* load_existing */);
+    rdev->use_config(json_superblk{get_meta_blk_name() + "_raft_config"});
+
+    // Attach the listener to the raft repl dev
+    auto listener = m_repl_app->create_repl_dev_listener(group_id);
+    listener->set_repl_dev(rdev.get());
+    rdev->attach_listener(std::move(listener));
+
+    // Add the repl dev to the map
+    add_repl_dev(group_id, rdev);
+    return std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(rdev);
+}
+
+AsyncReplResult< shared< ReplDev > > RaftReplService::create_repl_dev(group_id_t group_id,
+                                                                      std::set< replica_id_t > const& members) {
+    // TODO: All operations are made sync here for the caller's convenience. However, we should attempt to make this
+    // async and deferValue to a separate dedicated hs thread for these kinds of operations, waking up the caller on
+    // completion. It probably needs the iomanager executor for deferValue.
+    if (members.size() > 0) {
+        // Create a new RAFT group and add all members. create_group() will call create_state_mgr(), which will
+        // create the repl_dev instance and add it to the map.
+        if (auto const status = m_msg_mgr->create_group(group_id, "homestore_replication").get(); !status) {
+            return make_async_error< shared< ReplDev > >(to_repl_error(status.error()));
+        }
+
+        auto my_id = m_repl_app->get_my_repl_id();
+        for (auto& member : members) {
+            if (member == my_id) { continue; } // Skip myself
+            do {
+                auto const result = m_msg_mgr->add_member(group_id, member).get();
+                if (result) {
+                    LOGINFO("group_id={}, new member={} added", boost::uuids::to_string(group_id),
+                            boost::uuids::to_string(member));
+                    break;
+                } else if (result.error() != nuraft::CONFIG_CHANGING) {
+                    return make_async_error< shared< ReplDev > >(to_repl_error(result.error()));
+                } else {
+                    LOGWARN("Config is changing for group_id={} while adding member={}, retrying operation in a second",
+                            boost::uuids::to_string(group_id), boost::uuids::to_string(member));
+                    std::this_thread::sleep_for(std::chrono::seconds(1));
+                }
+            } while (true);
+        }
+    }
+
+    auto result = get_repl_dev(group_id);
+    return result ? make_async_success< shared< ReplDev > >(result.value())
+                  : make_async_error< shared< ReplDev > >(ReplServiceError::SERVER_NOT_FOUND);
+}
+
+void RaftReplService::load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) {
+    // Load the superblk
+    superblk< raft_repl_dev_superblk > rd_sb{get_meta_blk_name()};
+    rd_sb.load(buf, meta_cookie);
+    HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch");
+    HS_DBG_ASSERT_EQ(rd_sb->get_raft_sb_version(), raft_repl_dev_superblk::RAFT_REPL_DEV_SB_VERSION,
+                     "Invalid version of raft rdev metablk");
+    group_id_t group_id = rd_sb->group_id;
+
+    // Check whether the repl_dev for this group is already loaded.
+    auto rdev_result = get_repl_dev(group_id);
+    if (rdev_result) {
+        HS_DBG_ASSERT(false, "Group ID={} already loaded and added to repl_dev list, duplicate load?",
+                      boost::uuids::to_string(group_id).c_str());
+        return;
+    }
+
+    // Create an instance of ReplDev from the loaded superblk
+    auto rdev = std::make_shared< RaftReplDev >(*this, std::move(rd_sb), true /* load_existing */);
+
+    // Try to join the RAFT group
+    auto raft_result = m_msg_mgr->join_group(group_id, "homestore_replication",
+                                             std::dynamic_pointer_cast< nuraft_mesg::mesg_state_mgr >(rdev));
+    if (!raft_result) {
+        HS_DBG_ASSERT(false, "Unable to join the group_id={} with error={}", boost::uuids::to_string(group_id).c_str(),
+                      raft_result.error());
+    }
+
+    // Add the RaftReplDev to the list of repl_devs
+    add_repl_dev(group_id, rdev);
+}
+
+AsyncReplResult<> RaftReplService::replace_member(group_id_t group_id, replica_id_t member_out,
+                                                  replica_id_t member_in) const {
+    return make_async_error<>(ReplServiceError::NOT_IMPLEMENTED);
+}
+
+///////////////////// RaftReplService CP Callbacks /////////////////////////////
+std::unique_ptr< CPContext > RaftReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; }
+
+folly::Future< bool > RaftReplServiceCPHandler::cp_flush(CP* cp) {
+    repl_service().iterate_repl_devs(
+        [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_flush(cp); });
+    return folly::makeFuture< bool >(true);
+}
+
+void RaftReplServiceCPHandler::cp_cleanup(CP* cp) {
+    repl_service().iterate_repl_devs(
+        [cp](cshared< ReplDev >& repl_dev) { std::static_pointer_cast< RaftReplDev >(repl_dev)->cp_cleanup(cp); });
+}
+
+int RaftReplServiceCPHandler::cp_progress_percent() { return 100; }
+} // namespace homestore
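The add_member loop in create_repl_dev() above retries once per second for as long as NuRaft reports CONFIG_CHANGING and fails fast on any other error. A standalone sketch of that retry shape, generalized with a bounded attempt count (the service itself retries indefinitely); the helper name and the cap are invented:

```cpp
// Sketch only: generalizes the add_member retry above. Op is any callable
// returning a nuraft_mesg-style result with operator bool() and error(),
// e.g. [&] { return msg_mgr->add_member(group_id, member).get(); }.
#include <chrono>
#include <cstdint>
#include <thread>

template < typename Op >
auto retry_while_config_changing(Op&& op, uint32_t max_attempts = 30) {
    auto result = op();
    for (uint32_t attempt = 1;
         !result && (result.error() == nuraft::CONFIG_CHANGING) && (attempt < max_attempts); ++attempt) {
        std::this_thread::sleep_for(std::chrono::seconds(1)); // config change still in flight
        result = op();
    }
    return result; // caller maps any remaining failure via to_repl_error()
}
```

diff --git a/src/lib/replication/service/raft_repl_service.h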
b/src/lib/replication/service/raft_repl_service.h new file mode 100644 index 000000000..fa12cd07e --- /dev/null +++ b/src/lib/replication/service/raft_repl_service.h @@ -0,0 +1,77 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#pragma once +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include "replication/service/generic_repl_svc.h" + +namespace homestore { + +struct repl_dev_superblk; +class RaftReplService : public GenericReplService, + public nuraft_mesg::MessagingApplication, + public std::enable_shared_from_this< RaftReplService > { +private: + shared< nuraft_mesg::Manager > m_msg_mgr; + json_superblk m_config_sb; + +public: + RaftReplService(cshared< ReplApplication >& repl_app); + + static ReplServiceError to_repl_error(nuraft::cmd_result_code code); + + ///////////////////// Overrides of nuraft_mesg::MessagingApplication //////////////////// + std::string lookup_peer(nuraft_mesg::peer_id_t const&) override; + std::shared_ptr< nuraft_mesg::mesg_state_mgr > create_state_mgr(int32_t srv_id, + nuraft_mesg::group_id_t const& group_id) override; + nuraft_mesg::Manager& msg_manager() { return *m_msg_mgr; } + +protected: + ///////////////////// Overrides of GenericReplService //////////////////// + void start() override; + AsyncReplResult< shared< ReplDev > > create_repl_dev(group_id_t group_id, + std::set< replica_id_t > const& members) override; + void load_repl_dev(sisl::byte_view const& buf, void* meta_cookie) override; + AsyncReplResult<> replace_member(group_id_t group_id, replica_id_t member_out, + replica_id_t member_in) const override; + +private: + void raft_group_config_found(sisl::byte_view const& buf, void* meta_cookie); +}; + +class RaftReplServiceCPHandler : public CPCallbacks { +public: + RaftReplServiceCPHandler() = default; + virtual ~RaftReplServiceCPHandler() = default; + +public: + std::unique_ptr< CPContext > on_switchover_cp(CP* cur_cp, CP* new_cp) override; + folly::Future< bool > cp_flush(CP* cp) override; + void cp_cleanup(CP* cp) override; + int cp_progress_percent() override; +}; + +} // namespace homestore diff --git a/src/lib/replication/service/repl_service_impl.cpp b/src/lib/replication/service/repl_service_impl.cpp deleted file mode 100644 index e439488ca..000000000 --- a/src/lib/replication/service/repl_service_impl.cpp +++ /dev/null @@ -1,179 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. - * - *********************************************************************************/ -#include -#include -#include "common/homestore_assert.hpp" -#include "replication/service/repl_service_impl.h" -#include "replication/repl_dev/solo_repl_dev.h" -#include "homestore/blkdata_service.hpp" -#include "homestore/homestore.hpp" - -namespace homestore { -ReplicationService& repl_service() { return hs()->repl_service(); } - -ReplicationServiceImpl::ReplicationServiceImpl(repl_impl_type impl_type) : m_repl_type{impl_type} { - meta_service().register_handler( - "repl_dev", - [this](meta_blk* mblk, sisl::byte_view buf, size_t) { rd_super_blk_found(std::move(buf), voidptr_cast(mblk)); }, - nullptr); -} - -void ReplicationServiceImpl::start() { - // Register to CP to flush the super blk and truncate the logstore - hs()->cp_mgr().register_consumer(cp_consumer_t::REPLICATION_SVC, std::make_unique< ReplServiceCPHandler >()); - - { - std::shared_lock lg{m_rd_map_mtx}; - for (auto const& [gid, info] : m_pending_open) { - // info.dev_promise.setValue(folly::makeUnexpected(ReplServiceError::SERVER_NOT_FOUND)); - } - } - m_rd_map_loaded = true; -} - -void ReplicationServiceImpl::stop() { - std::unique_lock lg{m_rd_map_mtx}; - m_rd_map.clear(); -} - -hs_stats ReplicationServiceImpl::get_cap_stats() const { - hs_stats stats; - - stats.total_capacity = data_service().get_total_capacity(); - stats.used_capacity = data_service().get_used_capacity(); - return stats; -} - -AsyncReplResult< shared< ReplDev > > -ReplicationServiceImpl::create_repl_dev(uuid_t group_id, std::set< std::string, std::less<> >&& members, - std::unique_ptr< ReplDevListener > listener) { - superblk< repl_dev_superblk > rd_sb{"repl_dev"}; - rd_sb.create(sizeof(repl_dev_superblk)); - rd_sb->gid = group_id; - - shared< ReplDev > repl_dev = create_repl_dev_instance(std::move(rd_sb), false /* load_existing */); - listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(listener)); - return make_async_success(std::move(repl_dev)); -} - -AsyncReplResult< shared< ReplDev > > -ReplicationServiceImpl::open_repl_dev(uuid_t group_id, std::unique_ptr< ReplDevListener > listener) { - if (m_rd_map_loaded) { - // We have already loaded all repl_dev and open_repl_dev is called after that, we don't support dynamically - // opening the repl_dev. 
Return an error - LOGERROR("Opening group_id={} after services are started, which is not supported", - boost::uuids::to_string(group_id)); - return make_async_error< shared< ReplDev > >(ReplServiceError::BAD_REQUEST); - } - - std::unique_lock lg(m_rd_map_mtx); - auto it = m_rd_map.find(group_id); - if (it != m_rd_map.end()) { - // We already loaded the ReplDev, just call the group_id and attach the listener - auto repl_dev = it->second; - listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(listener)); - return make_async_success< shared< ReplDev > >(std::move(repl_dev)); - } else { - auto [pending_it, inserted] = - m_pending_open.insert_or_assign(group_id, listener_info{.listener = std::move(listener)}); - DEBUG_ASSERT(inserted, "Duplicate open_replica_dev called for group_id = {}", - boost::uuids::to_string(group_id)); - return pending_it->second.dev_promise.getFuture(); - } -} - -ReplResult< shared< ReplDev > > ReplicationServiceImpl::get_repl_dev(uuid_t group_id) const { - std::shared_lock lg(m_rd_map_mtx); - if (auto it = m_rd_map.find(group_id); it != m_rd_map.end()) { return it->second; } - return folly::makeUnexpected(ReplServiceError::SERVER_NOT_FOUND); -} - -void ReplicationServiceImpl::iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) { - std::shared_lock lg(m_rd_map_mtx); - for (const auto& [uuid, rd] : m_rd_map) { - cb(rd); - } -} - -folly::Future< ReplServiceError > ReplicationServiceImpl::replace_member(uuid_t group_id, std::string const& member_out, - std::string const& member_in) const { - return folly::makeFuture< ReplServiceError >(ReplServiceError::NOT_IMPLEMENTED); -} - -shared< ReplDev > ReplicationServiceImpl::create_repl_dev_instance(superblk< repl_dev_superblk >&& rd_sb, - bool load_existing) { - auto it = m_rd_map.end(); - bool happened = false; - - { - std::unique_lock lg(m_rd_map_mtx); - std::tie(it, happened) = m_rd_map.emplace(std::make_pair(rd_sb->gid, nullptr)); - } - DEBUG_ASSERT(m_rd_map.end() != it, "Could not insert into map!"); - if (!happened) { return it->second; } - - shared< ReplDev > repl_dev; - if (m_repl_type == repl_impl_type::solo) { - repl_dev = std::make_shared< SoloReplDev >(std::move(rd_sb), load_existing); - } else { - HS_REL_ASSERT(false, "Repl impl type = {} is not supported yet", enum_name(m_repl_type)); - } - it->second = repl_dev; - - return repl_dev; -} - -void ReplicationServiceImpl::rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie) { - superblk< repl_dev_superblk > rd_sb; - rd_sb.load(buf, meta_cookie); - HS_DBG_ASSERT_EQ(rd_sb->get_magic(), repl_dev_superblk::REPL_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); - HS_DBG_ASSERT_EQ(rd_sb->get_version(), repl_dev_superblk::REPL_DEV_SB_VERSION, "Invalid version of rdev metablk"); - auto rd_sb_gid = rd_sb->gid; - shared< ReplDev > repl_dev = create_repl_dev_instance(std::move(rd_sb), true /* load_existing */); - { - std::unique_lock lg(m_rd_map_mtx); - auto it = m_pending_open.find(rd_sb_gid); - if (it != m_pending_open.end()) { - auto& li_info = it->second; - // Someone waiting for this repl dev to open, call them to attach the listener and provide the value - li_info.listener->set_repl_dev(repl_dev.get()); - repl_dev->attach_listener(std::move(li_info.listener)); - li_info.dev_promise.setValue(repl_dev); - m_pending_open.erase(it); - } - } -} - -///////////////////// CP Callbacks for Repl Service ////////////// -ReplServiceCPHandler::ReplServiceCPHandler() {} - -std::unique_ptr< CPContext > 
ReplServiceCPHandler::on_switchover_cp(CP* cur_cp, CP* new_cp) { return nullptr; } - -folly::Future< bool > ReplServiceCPHandler::cp_flush(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_flush(cp); }); - return folly::makeFuture< bool >(true); -} - -void ReplServiceCPHandler::cp_cleanup(CP* cp) { - repl_service().iterate_repl_devs( - [cp](cshared< ReplDev >& repl_dev) { std::dynamic_pointer_cast< SoloReplDev >(repl_dev)->cp_cleanup(cp); }); -} - -int ReplServiceCPHandler::cp_progress_percent() { return 100; } - -} // namespace homestore diff --git a/src/lib/replication/service/repl_service_impl.h b/src/lib/replication/service/repl_service_impl.h deleted file mode 100644 index 3a0e9493d..000000000 --- a/src/lib/replication/service/repl_service_impl.h +++ /dev/null @@ -1,93 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. - * - *********************************************************************************/ -#pragma once -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace homestore { - -struct repl_dev_superblk; -class ReplicationServiceImpl : public ReplicationService { - struct listener_info { - folly::Promise< folly::Expected< shared< ReplDev >, ReplServiceError > > dev_promise{}; - std::unique_ptr< ReplDevListener > listener; - }; - - template < class V > - auto make_async_error(ReplServiceError err) { - return folly::makeFuture< ReplResult< V > >(folly::makeUnexpected(err)); - } - - template < class V > - auto make_async_success(V&& v) { - return folly::makeFuture< ReplResult< V > >(std::move(v)); - } - -protected: - repl_impl_type m_repl_type; - std::shared_mutex m_rd_map_mtx; - std::map< uuid_t, shared< ReplDev > > m_rd_map; - std::map< uuid_t, listener_info > m_pending_open; - std::atomic< bool > m_rd_map_loaded{false}; - -public: - ReplicationServiceImpl(repl_impl_type impl_type); - void start(); - void stop(); - AsyncReplResult< shared< ReplDev > > create_repl_dev(uuid_t group_id, - std::set< std::string, std::less<> >&& members, - std::unique_ptr< ReplDevListener > listener) override; - AsyncReplResult< shared< ReplDev > > open_repl_dev(uuid_t group_id, - std::unique_ptr< ReplDevListener > listener) override; - ReplResult< shared< ReplDev > > get_repl_dev(uuid_t group_id) const override; - void iterate_repl_devs(std::function< void(cshared< ReplDev >&) > const& cb) override; - - folly::Future< ReplServiceError > replace_member(uuid_t group_id, std::string const& member_out, - std::string const& member_in) const override; - hs_stats get_cap_stats() const override; - - -private: - shared< ReplDev > create_repl_dev_instance(superblk< repl_dev_superblk > &&rd_sb, bool load_existing); - void rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie); -}; - 
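The listener_info machinery in the class above shows how the removed service parked early open_repl_dev() callers on a folly::Promise keyed by group id, fulfilling it from rd_super_blk_found() once recovery discovered the matching superblock. A minimal sketch of that rendezvous pattern, with simplified, hypothetical types:

```cpp
// Minimal sketch of the promise/future rendezvous the deleted code used:
// callers that arrive before recovery finds the superblock get a Future that
// is fulfilled later from the metablk scan. Types here are invented.
#include <folly/futures/Future.h>
#include <map>
#include <memory>
#include <mutex>

struct Dev {};

class Rendezvous {
    std::mutex mtx_;
    std::map< int, std::shared_ptr< Dev > > ready_;
    std::map< int, folly::Promise< std::shared_ptr< Dev > > > pending_;

public:
    folly::Future< std::shared_ptr< Dev > > open(int gid) {
        std::lock_guard lg{mtx_};
        if (auto it = ready_.find(gid); it != ready_.end()) {
            return folly::makeFuture(it->second); // already loaded
        }
        return pending_[gid].getFuture(); // park the caller
    }

    void found(int gid, std::shared_ptr< Dev > dev) { // called from the superblk scan
        std::lock_guard lg{mtx_};
        ready_[gid] = dev;
        if (auto it = pending_.find(gid); it != pending_.end()) {
            it->second.setValue(std::move(dev)); // wake the parked caller
            pending_.erase(it);
        }
    }
};
```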
-class ReplServiceCPHandler : public CPCallbacks { -public: - ReplServiceCPHandler(); - virtual ~ReplServiceCPHandler() = default; - -public: - std::unique_ptr< CPContext > on_switchover_cp(CP* cur_cp, CP* new_cp) override; - folly::Future< bool > cp_flush(CP* cp) override; - void cp_cleanup(CP* cp) override; - int cp_progress_percent() override; -}; - -extern ReplicationService& repl_service(); -} // namespace homestore diff --git a/src/test_common/bits_generator.hpp b/src/test_common/bits_generator.hpp deleted file mode 100644 index 97fb035d5..000000000 --- a/src/test_common/bits_generator.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/********************************************************************************* - * Modifications Copyright 2017-2019 eBay Inc. - * - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software distributed - * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR - * CONDITIONS OF ANY KIND, either express or implied. See the License for the - * specific language governing permissions and limitations under the License. - * - *********************************************************************************/ -#pragma once -#include -#include -#include -#include - -namespace homestore { - -class BitsGenerator { -public: - static void gen_random_bits(size_t size, uint8_t* buf) { - std::random_device rd; - std::default_random_engine g(rd()); - std::uniform_int_distribution< unsigned long long > dis(std::numeric_limits< std::uint8_t >::min(), - std::numeric_limits< std::uint8_t >::max()); - for (size_t i = 0; i < size; ++i) { - buf[i] = dis(g); - } - } - - static void gen_random_bits(sisl::blob& b) { gen_random_bits(b.size, b.bytes); } -}; - -}; // namespace homestore diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index decd98b22..708aa161b 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.13) include (${CMAKE_SOURCE_DIR}/cmake/test_mode.cmake) include_directories (BEFORE ../include/) include_directories (BEFORE ../lib/) +include_directories (BEFORE ../tests/) include_directories (BEFORE .) 
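A side note on the BitsGenerator removed above (it reappears under src/tests/test_common later in this diff): it draws from uniform_int_distribution< unsigned long long > rather than < uint8_t > because the standard only defines uniform_int_distribution for short, int, long, long long and their unsigned counterparts; instantiating it with a char-sized type is undefined. A self-contained restatement of that workaround, with a hypothetical helper name:

```cpp
// Draw a wider integer bounded to [0, 255] and narrow it: the portable way
// to fill a byte buffer, since uniform_int_distribution<uint8_t> is UB.
#include <cstdint>
#include <limits>
#include <random>

inline void fill_random_bytes(uint8_t* buf, size_t size) {
    static thread_local std::default_random_engine eng{std::random_device{}()};
    std::uniform_int_distribution< unsigned long long > dist{
        std::numeric_limits< uint8_t >::min(), std::numeric_limits< uint8_t >::max()};
    for (size_t i = 0; i < size; ++i) {
        buf[i] = static_cast< uint8_t >(dist(eng));
    }
}
```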
add_subdirectory(test_scripts) @@ -15,7 +16,7 @@ if (${build_nonio_tests}) add_executable(test_blkalloc) target_sources(test_blkalloc PRIVATE test_blkalloc.cpp $) target_link_libraries(test_blkalloc homestore ${COMMON_TEST_DEPS} ) - add_test(NAME BlkAlloc COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_blkalloc) + add_test(NAME BlkAlloc COMMAND ${CMAKE_BINARY_DIR}/bin/test_blkalloc) add_executable(test_blk_cache_queue) target_sources(test_blk_cache_queue PRIVATE test_blk_cache_queue.cpp ../lib/blkalloc/blk_cache_queue.cpp) @@ -25,7 +26,7 @@ if (${build_nonio_tests}) set(TEST_JOURNAL_VDEV_SOURCES test_journal_vdev.cpp) add_executable(test_journal_vdev ${TEST_JOURNAL_VDEV_SOURCES}) target_link_libraries(test_journal_vdev homestore ${COMMON_TEST_DEPS} GTest::gmock) - add_test(NAME JournalVDev COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_journal_vdev) + add_test(NAME JournalVDev COMMAND ${CMAKE_BINARY_DIR}/bin/test_journal_vdev) set(TEST_BTREENODE_SOURCE_FILES test_btree_node.cpp) add_executable(test_btree_node ${TEST_BTREENODE_SOURCE_FILES}) @@ -36,6 +37,7 @@ if (${build_nonio_tests}) add_executable(test_mem_btree ${TEST_MEMBTREE_SOURCE_FILES}) target_link_libraries(test_mem_btree ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME MemBtree COMMAND test_mem_btree) + set_tests_properties(MemBtree PROPERTIES TIMEOUT 180) add_executable(test_blk_read_tracker) target_sources(test_blk_read_tracker PRIVATE test_blk_read_tracker.cpp ../lib/blkdata_svc/blk_read_tracker.cpp ../lib/blkalloc/blk.cpp) @@ -94,20 +96,32 @@ if (${io_tests}) target_sources(test_solo_repl_dev PRIVATE test_solo_repl_dev.cpp) target_link_libraries(test_solo_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + add_executable(test_home_raft_logstore) + target_sources(test_home_raft_logstore PRIVATE test_home_raft_logstore.cpp) + target_link_libraries(test_home_raft_logstore homestore ${COMMON_TEST_DEPS} GTest::gmock) + + add_executable(test_raft_repl_dev) + target_sources(test_raft_repl_dev PRIVATE test_raft_repl_dev.cpp) + target_link_libraries(test_raft_repl_dev homestore ${COMMON_TEST_DEPS} GTest::gmock) + can_build_epoll_io_tests(epoll_tests) if(${epoll_tests}) - add_test(NAME LogStore-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_log_store) - add_test(NAME MetaBlkMgr-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr) - add_test(NAME DataService-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_data_service) - add_test(NAME SoloReplDev-Epoll COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev) + add_test(NAME LogStore-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_log_store) + add_test(NAME MetaBlkMgr-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr) + add_test(NAME DataService-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_data_service) + add_test(NAME SoloReplDev-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev) + add_test(NAME HomeRaftLogStore-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_home_raft_logstore) + add_test(NAME RaftReplDev-Epoll COMMAND ${CMAKE_BINARY_DIR}/bin/test_raft_repl_dev) endif() can_build_spdk_io_tests(spdk_tests) if(${spdk_tests}) - add_test(NAME LogStore-Spdk COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_log_store -- --spdk "true") - add_test(NAME MetaBlkMgr-Spdk COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr -- --spdk "true") - add_test(NAME DataSerice-Spdk COMMAND 
${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_data_service -- --spdk "true")
-      add_test(NAME SoloReplDev-Spdk COMMAND ${CMAKE_SOURCE_DIR}/test_wrap.sh ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev -- --spdk "true")
+      add_test(NAME LogStore-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_log_store -- --spdk "true")
+      add_test(NAME MetaBlkMgr-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_meta_blk_mgr -- --spdk "true")
+      add_test(NAME DataService-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_data_service -- --spdk "true")
+      add_test(NAME SoloReplDev-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_solo_repl_dev -- --spdk "true")
+      add_test(NAME HomeRaftLogStore-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_home_raft_logstore -- --spdk "true")
+      add_test(NAME RaftReplDev-Spdk COMMAND ${CMAKE_BINARY_DIR}/bin/test_raft_repl_dev -- --spdk "true")
       if(${epoll_tests})
         SET_TESTS_PROPERTIES(MetaBlkMgr-Spdk PROPERTIES DEPENDS LogStore-Spdk)
         SET_TESTS_PROPERTIES(DataService-Spdk PROPERTIES DEPENDS MetaBlkMgr-Spdk)
diff --git a/src/tests/btree_helpers/btree_test_helper.hpp b/src/tests/btree_helpers/btree_test_helper.hpp
index e9e93b491..be5cc4e14 100644
--- a/src/tests/btree_helpers/btree_test_helper.hpp
+++ b/src/tests/btree_helpers/btree_test_helper.hpp
@@ -381,9 +381,9 @@ struct BtreeTestHelper {
         // Construct a weighted distribution based on the input frequencies
         std::discrete_distribution< uint32_t > s_rand_op_generator(weights.begin(), weights.end());
         auto m_start_time = Clock::now();
-        auto time_to_stop = [this, m_start_time]() {
-            return (get_elapsed_time_sec(m_start_time) > m_run_time);
-        };
+
+        auto time_to_stop = [this, m_start_time]() { return (get_elapsed_time_sec(m_start_time) > m_run_time); };
+
         for (uint32_t i = 0; i < num_iters_per_thread && !time_to_stop(); i++) {
             uint32_t op_idx = s_rand_op_generator(re);
             (this->m_operations[op_list[op_idx].first])();
diff --git a/src/tests/btree_helpers/btree_test_kvs.hpp b/src/tests/btree_helpers/btree_test_kvs.hpp
index f13aa1d5d..4d4481136 100644
--- a/src/tests/btree_helpers/btree_test_kvs.hpp
+++ b/src/tests/btree_helpers/btree_test_kvs.hpp
@@ -72,7 +72,7 @@ class TestFixedKey : public BtreeKey {
     TestFixedKey(uint64_t k) : m_key{k} {}
     TestFixedKey(const TestFixedKey& other) : TestFixedKey(other.serialize(), true) {}
     TestFixedKey(const BtreeKey& other) : TestFixedKey(other.serialize(), true) {}
-    TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.bytes))} {}
+    TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.cbytes()))} {}
     TestFixedKey& operator=(const TestFixedKey& other) = default;
     TestFixedKey& operator=(BtreeKey const& other) {
         m_key = s_cast< TestFixedKey const& >(other).m_key;
@@ -114,7 +114,7 @@ class TestFixedKey : public BtreeKey {
     static uint32_t get_fixed_size() { return (sizeof(uint64_t)); }
     std::string to_string() const { return fmt::format("{}", m_key); }

-    void deserialize(const sisl::blob& b, bool copy) override { m_key = *(r_cast< const uint64_t* >(b.bytes)); }
+    void deserialize(const sisl::blob& b, bool copy) override { m_key = *(r_cast< const uint64_t* >(b.cbytes())); }
     static uint32_t get_max_size() { return get_fixed_size(); }

     friend std::ostream& operator<<(std::ostream& os, const TestFixedKey& k) {
@@ -182,7 +182,7 @@ class TestVarLenKey : public BtreeKey {
     }

     void deserialize(const sisl::blob& b, bool copy) {
-        std::string data{r_cast< const char* >(b.bytes), b.size};
+        std::string data{r_cast< const char* >(b.cbytes()), b.size()};
         std::stringstream
ss; ss << std::hex << data.substr(0, 8); ss >> m_key; @@ -255,7 +255,7 @@ class TestIntervalKey : public BtreeIntervalKey { TestIntervalKey(const TestIntervalKey& other) = default; TestIntervalKey(const BtreeKey& other) : TestIntervalKey(other.serialize(), true) {} TestIntervalKey(const sisl::blob& b, bool copy) : BtreeIntervalKey() { - TestIntervalKey* other = r_cast< TestIntervalKey* >(b.bytes); + TestIntervalKey const* other = r_cast< TestIntervalKey const* >(b.cbytes()); m_base = other->m_base; m_offset = other->m_offset; } @@ -290,8 +290,8 @@ class TestIntervalKey : public BtreeIntervalKey { uint32_t serialized_size() const override { return sizeof(TestIntervalKey); } void deserialize(sisl::blob const& b, bool copy) override { - assert(b.size == sizeof(TestIntervalKey)); - TestIntervalKey* other = r_cast< TestIntervalKey* >(b.bytes); + assert(b.size() == sizeof(TestIntervalKey)); + TestIntervalKey const* other = r_cast< TestIntervalKey const* >(b.cbytes()); m_base = other->m_base; m_offset = other->m_offset; } @@ -329,12 +329,12 @@ class TestIntervalKey : public BtreeIntervalKey { uint32_t serialized_suffix_size() const override { return uint32_cast(sizeof(uint32_t)); }; void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) { - DEBUG_ASSERT_EQ(prefix.size, sizeof(uint32_t), "Invalid prefix size on deserialize"); - DEBUG_ASSERT_EQ(suffix.size, sizeof(uint32_t), "Invalid suffix size on deserialize"); - uint32_t* other_p = r_cast< uint32_t* >(prefix.bytes); + DEBUG_ASSERT_EQ(prefix.size(), sizeof(uint32_t), "Invalid prefix size on deserialize"); + DEBUG_ASSERT_EQ(suffix.size(), sizeof(uint32_t), "Invalid suffix size on deserialize"); + uint32_t const* other_p = r_cast< uint32_t const* >(prefix.cbytes()); m_base = *other_p; - uint32_t* other_s = r_cast< uint32_t* >(suffix.bytes); + uint32_t const* other_s = r_cast< uint32_t const* >(suffix.cbytes()); m_offset = *other_s; } @@ -364,7 +364,7 @@ class TestFixedValue : public BtreeValue { TestFixedValue(uint32_t val) : BtreeValue() { m_val = val; } TestFixedValue() : TestFixedValue((uint32_t)-1) {} TestFixedValue(const TestFixedValue& other) : BtreeValue() { m_val = other.m_val; }; - TestFixedValue(const sisl::blob& b, bool copy) : BtreeValue() { m_val = *(r_cast< uint32_t* >(b.bytes)); } + TestFixedValue(const sisl::blob& b, bool copy) : BtreeValue() { m_val = *(r_cast< uint32_t const* >(b.cbytes())); } virtual ~TestFixedValue() = default; static TestFixedValue generate_rand() { return TestFixedValue{g_randval_generator(g_re)}; } @@ -375,15 +375,13 @@ class TestFixedValue : public BtreeValue { } sisl::blob serialize() const override { - sisl::blob b; - b.bytes = uintptr_cast(const_cast< uint32_t* >(&m_val)); - b.size = sizeof(m_val); + sisl::blob b{r_cast< uint8_t const* >(&m_val), uint32_cast(sizeof(m_val))}; return b; } uint32_t serialized_size() const override { return sizeof(m_val); } static uint32_t get_fixed_size() { return sizeof(m_val); } - void deserialize(const sisl::blob& b, bool copy) { m_val = *(r_cast< uint32_t* >(b.bytes)); } + void deserialize(const sisl::blob& b, bool copy) { m_val = *(r_cast< uint32_t const* >(b.cbytes())); } std::string to_string() const override { return fmt::format("{}", m_val); } @@ -412,7 +410,8 @@ class TestVarLenValue : public BtreeValue { TestVarLenValue(const std::string& val) : BtreeValue(), m_val{val} {} TestVarLenValue() = default; TestVarLenValue(const TestVarLenValue& other) : BtreeValue() { m_val = other.m_val; }; - TestVarLenValue(const sisl::blob& b, bool copy) : 
BtreeValue(), m_val{std::string((const char*)b.bytes, b.size)} {} + TestVarLenValue(const sisl::blob& b, bool copy) : + BtreeValue(), m_val{std::string((const char*)b.cbytes(), b.size())} {} virtual ~TestVarLenValue() = default; TestVarLenValue& operator=(const TestVarLenValue& other) { @@ -423,16 +422,14 @@ class TestVarLenValue : public BtreeValue { static TestVarLenValue generate_rand() { return TestVarLenValue{gen_random_string(rand_val_size())}; } sisl::blob serialize() const override { - sisl::blob b; - b.bytes = uintptr_cast(const_cast< char* >(m_val.c_str())); - b.size = m_val.size(); + sisl::blob b{r_cast< const uint8_t* >(m_val.c_str()), uint32_cast(m_val.size())}; return b; } uint32_t serialized_size() const override { return (uint32_t)m_val.size(); } static uint32_t get_fixed_size() { return 0; } - void deserialize(const sisl::blob& b, bool copy) { m_val = std::string((const char*)b.bytes, b.size); } + void deserialize(const sisl::blob& b, bool copy) { m_val = std::string((const char*)b.cbytes(), b.size()); } std::string to_string() const override { return fmt::format("{}", m_val); } @@ -473,16 +470,14 @@ class TestIntervalValue : public BtreeIntervalValue { ///////////////////////////// Overriding methods of BtreeValue ////////////////////////// TestIntervalValue& operator=(const TestIntervalValue& other) = default; sisl::blob serialize() const override { - sisl::blob b; - b.bytes = uintptr_cast(const_cast< TestIntervalValue* >(this)); - b.size = sizeof(TestIntervalValue); + sisl::blob b{r_cast< uint8_t const* >(this), sizeof(TestIntervalValue)}; return b; } uint32_t serialized_size() const override { return sizeof(TestIntervalValue); } static uint32_t get_fixed_size() { return sizeof(TestIntervalValue); } void deserialize(const sisl::blob& b, bool) { - TestIntervalValue const* other = r_cast< TestIntervalValue const* >(b.bytes); + TestIntervalValue const* other = r_cast< TestIntervalValue const* >(b.cbytes()); m_base_val = other->m_base_val; m_offset = other->m_offset; } @@ -507,10 +502,10 @@ class TestIntervalValue : public BtreeIntervalValue { uint32_t serialized_suffix_size() const override { return uint32_cast(sizeof(uint16_t)); } void deserialize(sisl::blob const& prefix, sisl::blob const& suffix, bool) override { - DEBUG_ASSERT_EQ(prefix.size, sizeof(uint32_t), "Invalid prefix size on deserialize"); - DEBUG_ASSERT_EQ(suffix.size, sizeof(uint16_t), "Invalid suffix size on deserialize"); - m_base_val = *(r_cast< uint32_t* >(prefix.bytes)); - m_offset = *(r_cast< uint16_t* >(suffix.bytes)); + DEBUG_ASSERT_EQ(prefix.size(), sizeof(uint32_t), "Invalid prefix size on deserialize"); + DEBUG_ASSERT_EQ(suffix.size(), sizeof(uint16_t), "Invalid suffix size on deserialize"); + m_base_val = *(r_cast< uint32_t const* >(prefix.cbytes())); + m_offset = *(r_cast< uint16_t const* >(suffix.cbytes())); } bool operator==(TestIntervalValue const& other) const { diff --git a/src/tests/index_btree_benchmark.cpp b/src/tests/index_btree_benchmark.cpp index b71ccac19..d36bea643 100644 --- a/src/tests/index_btree_benchmark.cpp +++ b/src/tests/index_btree_benchmark.cpp @@ -133,7 +133,7 @@ INDEX_BETREE_BENCHMARK(FixedLenBtree) INDEX_BETREE_BENCHMARK(VarKeySizeBtree) INDEX_BETREE_BENCHMARK(VarValueSizeBtree) INDEX_BETREE_BENCHMARK(VarObjSizeBtree) -INDEX_BETREE_BENCHMARK(PrefixIntervalBtree) +//INDEX_BETREE_BENCHMARK(PrefixIntervalBtree) int main(int argc, char** argv) { SISL_OPTIONS_LOAD(argc, argv, logging, index_btree_benchmark, iomgr, test_common_setup); diff --git 
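The hunks above migrate the btree test keys and values from sisl::blob's raw fields (b.bytes, b.size) to its accessor API (b.cbytes(), b.size()) and to the pointer-plus-size constructor. A compact serialize/deserialize pair in the new style; the Payload type is invented, and only the blob members exercised above are assumed:

```cpp
// Assumes sisl::blob's header is included and that it provides
// blob(const uint8_t*, uint32_t), cbytes() and size(), as used above.
#include <cassert>
#include <cstdint>
#include <cstring>

struct Payload {
    uint64_t key{0};

    // Non-owning view over this object's bytes; valid only while `this` lives.
    sisl::blob serialize() const {
        return sisl::blob{reinterpret_cast< const uint8_t* >(&key),
                          static_cast< uint32_t >(sizeof(key))};
    }

    void deserialize(const sisl::blob& b) {
        assert(b.size() == sizeof(key));
        std::memcpy(&key, b.cbytes(), sizeof(key)); // memcpy sidesteps alignment UB
    }
};
```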
a/src/tests/test_append_blkalloc.cpp b/src/tests/test_append_blkalloc.cpp index c1073691f..8deca826e 100644 --- a/src/tests/test_append_blkalloc.cpp +++ b/src/tests/test_append_blkalloc.cpp @@ -285,9 +285,7 @@ TEST_F(AppendBlkAllocatorTest, TestWriteThenRecovey) { SISL_OPTION_GROUP(test_append_blkalloc, (run_time, "", "run_time", "running time in seconds", - ::cxxopts::value< uint64_t >()->default_value("30"), "number"), - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"), - "number")); + ::cxxopts::value< uint64_t >()->default_value("30"), "number")); int main(int argc, char* argv[]) { int parsed_argc{argc}; diff --git a/src/tests/test_blk_read_tracker.cpp b/src/tests/test_blk_read_tracker.cpp index 4c656ac0b..dec5d1e4f 100644 --- a/src/tests/test_blk_read_tracker.cpp +++ b/src/tests/test_blk_read_tracker.cpp @@ -25,7 +25,7 @@ using namespace homestore; -SISL_LOGGING_INIT(test_blk_read_tracker, iomgr, flip, io_wd) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_blk_read_tracker) VENUM(op_type_t, uint8_t, insert = 0, remove = 1, wait_on = 2, max_op = 3); diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp index 435e41784..0123232be 100644 --- a/src/tests/test_blkid.cpp +++ b/src/tests/test_blkid.cpp @@ -7,7 +7,7 @@ #include -SISL_LOGGING_INIT(test_blkid, iomgr, flip, io_wd) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) SISL_OPTIONS_ENABLE(logging, test_blkid) SISL_OPTION_GROUP(test_blkid, @@ -29,7 +29,7 @@ TEST(BlkIdTest, SingleBlkIdBasic) { ASSERT_EQ(b2.is_multi(), false); sisl::blob buf = b2.serialize(); - ASSERT_EQ(buf.size, sizeof(uint64_t)); + ASSERT_EQ(buf.size(), sizeof(uint64_t)); BlkId b3; b3.deserialize(buf, true); @@ -175,4 +175,4 @@ int main(int argc, char* argv[]) { spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); return RUN_ALL_TESTS(); -} \ No newline at end of file +} diff --git a/src/tests/test_btree_node.cpp b/src/tests/test_btree_node.cpp index 7ba6f9611..1ff602f19 100644 --- a/src/tests/test_btree_node.cpp +++ b/src/tests/test_btree_node.cpp @@ -28,7 +28,8 @@ static constexpr uint32_t g_max_keys{6000}; static std::uniform_int_distribution< uint32_t > g_randkey_generator{0, g_max_keys - 1}; using namespace homestore; -SISL_LOGGING_INIT(btree, iomgr, flip, io_wd) +SISL_LOGGING_DEF(btree) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) struct FixedLenNodeTest { using NodeType = SimpleNode< TestFixedKey, TestFixedValue >; diff --git a/src/tests/test_common/bits_generator.hpp b/src/tests/test_common/bits_generator.hpp index 97fb035d5..e5d0dac7b 100644 --- a/src/tests/test_common/bits_generator.hpp +++ b/src/tests/test_common/bits_generator.hpp @@ -33,7 +33,7 @@ class BitsGenerator { } } - static void gen_random_bits(sisl::blob& b) { gen_random_bits(b.size, b.bytes); } + static void gen_random_bits(sisl::blob& b) { gen_random_bits(b.size(), b.bytes()); } }; }; // namespace homestore diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 846e1c801..277c54a24 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -50,6 +50,8 @@ SISL_OPTION_GROUP(test_common_setup, ::cxxopts::value< std::vector< std::string > >(), "path [...]"), (http_port, "", "http_port", "http port (0 for no http, -1 for random, rest specific value)", ::cxxopts::value< int >()->default_value("-1"), "number"), + (num_io, "", "num_io", "number of IO operations", + ::cxxopts::value< uint64_t >()->default_value("300"), 
"number"), (spdk, "", "spdk", "spdk", ::cxxopts::value< bool >()->default_value("false"), "true or false")); SETTINGS_INIT(iomgrcfg::IomgrSettings, iomgr_config); @@ -75,6 +77,71 @@ inline static uint32_t generate_random_http_port() { return http_port; } +struct Runner { + uint64_t total_tasks_{0}; + uint32_t qdepth_{8}; + std::atomic< uint64_t > issued_tasks_{0}; + std::atomic< uint64_t > completed_tasks_{0}; + std::function< void(void) > task_; + folly::Promise< folly::Unit > comp_promise_; + + Runner(uint64_t num_tasks, uint32_t qd = 8) : total_tasks_{num_tasks}, qdepth_{qd} { + if (total_tasks_ < (uint64_t)qdepth_) { total_tasks_ = qdepth_; } + } + Runner() : Runner{SISL_OPTIONS["num_io"].as< uint64_t >()} {} + Runner(const Runner&) = delete; + Runner& operator=(const Runner&) = delete; + + void set_num_tasks(uint64_t num_tasks) { total_tasks_ = std::max((uint64_t)qdepth_, num_tasks); } + void set_task(std::function< void(void) > f) { + issued_tasks_.store(0); + completed_tasks_.store(0); + comp_promise_ = folly::Promise< folly::Unit >{}; + task_ = std::move(f); + } + + folly::Future< folly::Unit > execute() { + for (uint32_t i{0}; i < qdepth_; ++i) { + run_task(); + } + return comp_promise_.getFuture(); + } + + void next_task() { + auto ctasks = completed_tasks_.fetch_add(1); + if ((issued_tasks_.load() < total_tasks_)) { + run_task(); + } else if ((ctasks + 1) == total_tasks_) { + comp_promise_.setValue(); + } + } + + void run_task() { + ++issued_tasks_; + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, task_); + } +}; + +struct Waiter { + std::atomic< uint64_t > expected_comp{0}; + std::atomic< uint64_t > actual_comp{0}; + folly::Promise< folly::Unit > comp_promise; + + Waiter(uint64_t num_op) : expected_comp{num_op} {} + Waiter() : Waiter{SISL_OPTIONS["num_io"].as< uint64_t >()} {} + Waiter(const Waiter&) = delete; + Waiter& operator=(const Waiter&) = delete; + + folly::Future< folly::Unit > start(std::function< void(void) > f) { + f(); + return comp_promise.getFuture(); + } + + void one_complete() { + if ((actual_comp.fetch_add(1) + 1) >= expected_comp.load()) { comp_promise.setValue(); } + } +}; + class HSTestHelper { private: static void remove_files(const std::vector< std::string >& file_paths) { @@ -100,7 +167,7 @@ class HSTestHelper { uint32_t blk_size{0}; shared< ChunkSelector > custom_chunk_selector{nullptr}; IndexServiceCallbacks* index_svc_cbs{nullptr}; - repl_impl_type repl_impl{repl_impl_type::solo}; + shared< ReplApplication > repl_app{nullptr}; chunk_num_t num_chunks{1}; }; @@ -175,7 +242,7 @@ class HSTestHelper { } else if ((svc == HS_SERVICE::LOG_REPLICATED) || (svc == HS_SERVICE::LOG_LOCAL)) { hsi->with_log_service(); } else if (svc == HS_SERVICE::REPLICATION) { - hsi->with_repl_data_service(tp.repl_impl, tp.custom_chunk_selector); + hsi->with_repl_data_service(tp.repl_app, tp.custom_chunk_selector); } } bool need_format = @@ -219,15 +286,16 @@ class HSTestHelper { } } - static void validate_data_buf(uint8_t* buf, uint64_t size, uint64_t pattern = 0) { - uint64_t* ptr = r_cast< uint64_t* >(buf); + static void validate_data_buf(uint8_t const* buf, uint64_t size, uint64_t pattern = 0) { + uint64_t const* ptr = r_cast< uint64_t const* >(buf); for (uint64_t i = 0ul; i < size / sizeof(uint64_t); ++i) { HS_REL_ASSERT_EQ(ptr[i], ((pattern == 0) ? 
i : pattern), "data_buf mismatch at offset={}", i);
         }
     }

-    static sisl::sg_list create_sgs(uint64_t io_size, uint32_t blk_size, uint32_t max_size_per_iov,
+    static sisl::sg_list create_sgs(uint64_t io_size, uint32_t max_size_per_iov,
                                     std::optional< uint64_t > fill_data_pattern = std::nullopt) {
+        auto blk_size = SISL_OPTIONS["block_size"].as< uint32_t >();
         HS_REL_ASSERT_EQ(io_size % blk_size, 0, "io_size should be a multiple of blk_size");
         HS_REL_ASSERT_EQ(max_size_per_iov % blk_size, 0, "max_size_per_iov should be a multiple of blk_size");
diff --git a/src/tests/test_common/hs_repl_test_common.hpp b/src/tests/test_common/hs_repl_test_common.hpp
new file mode 100644
index 000000000..e2e18b2b2
--- /dev/null
+++ b/src/tests/test_common/hs_repl_test_common.hpp
@@ -0,0 +1,252 @@
+/*********************************************************************************
+ * Modifications Copyright 2017-2019 eBay Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed
+ * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+ * CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
+ *
+ *********************************************************************************/
+/*
+ * Shared definitions, APIs and data structures common to the HomeStore replication test binaries
+ */
+
+#pragma once
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include "test_common/homestore_test_common.hpp"
+
+SISL_OPTION_GROUP(test_repl_common_setup,
+                  (replicas, "", "replicas", "Total number of replicas",
+                   ::cxxopts::value< uint32_t >()->default_value("3"), "number"),
+                  (base_port, "", "base_port", "Port number of first replica",
+                   ::cxxopts::value< uint16_t >()->default_value("4000"), "number"),
+                  (replica_num, "", "replica_num",
+                   "Internal replica num (used to launch multi process) - don't override",
+                   ::cxxopts::value< uint16_t >()->default_value("0"), "number"));
+
+std::vector< std::string > test_common::HSTestHelper::s_dev_names;
+
+using namespace homestore;
+namespace bip = boost::interprocess;
+
+namespace test_common {
+
+VENUM(ipc_packet_op_t, uint32_t, WAKE_UP = 0, CLEAN_EXIT = 1, UNCLEAN_EXIT = 2, PEER_GOING_DOWN = 3);
+ENUM(repl_test_phase_t, uint32_t, REGISTER, MEMBER_START, TEST_RUN, VALIDATE, CLEANUP);
+
+class HSReplTestHelper {
+protected:
+    struct IPCData {
+        bip::interprocess_mutex mtx_;
+        bip::interprocess_condition cv_;
+
+        repl_test_phase_t phase_{repl_test_phase_t::REGISTER};
+        uint32_t registered_count_{0};
+        uint32_t test_start_count_{0};
+        uint32_t verify_start_count_{0};
+        uint32_t cleanup_start_count_{0};
+        uint64_t test_dataset_size_{0};
+
+        void sync_for_member_start() { sync_for(registered_count_, repl_test_phase_t::MEMBER_START); }
+        void sync_for_test_start() { sync_for(test_start_count_, repl_test_phase_t::TEST_RUN); }
+        void sync_for_verify_start() { sync_for(verify_start_count_, repl_test_phase_t::VALIDATE); }
+        void sync_for_cleanup_start() { sync_for(cleanup_start_count_, repl_test_phase_t::CLEANUP); }
+
+    private:
+        void sync_for(uint32_t& count, repl_test_phase_t new_phase) {
+            std::unique_lock< bip::interprocess_mutex > lg(mtx_);
+            ++count;
+            if (count == SISL_OPTIONS["replicas"].as< uint32_t >()) {
+                phase_ = new_phase;
+                cv_.notify_all();
+            }
+            cv_.wait(lg, [this, new_phase]() { return (phase_ == new_phase); });
+        }
+    };
+
+public:
+    class TestReplApplication : public ReplApplication {
+    private:
+        HSReplTestHelper& helper_;
+
+    public:
+        TestReplApplication(HSReplTestHelper& h) : helper_{h} {}
+        virtual ~TestReplApplication() = default;
+
+        homestore::repl_impl_type get_impl_type() const override { return homestore::repl_impl_type::server_side; }
+        bool need_timeline_consistency() const { return false; }
+
+        std::shared_ptr< homestore::ReplDevListener >
+        create_repl_dev_listener(homestore::group_id_t group_id) override {
+            return helper_.get_listener(group_id);
+        }
+
+        std::pair< std::string, uint16_t > lookup_peer(homestore::replica_id_t replica_id) const override {
+            uint16_t port;
+            if (auto it = helper_.members_.find(replica_id); it != helper_.members_.end()) {
+                port = SISL_OPTIONS["base_port"].as< uint16_t >() + it->second;
+            } else {
+                RELEASE_ASSERT(false, "Received lookup_peer call for a non-member");
+            }
+
+            return std::make_pair(std::string("127.0.0.1"), port);
+        }
+
+        homestore::replica_id_t get_my_repl_id() const override { return helper_.my_replica_id_; }
+    };
+
+public:
+    friend class TestReplApplication;
+
+    HSReplTestHelper(std::string const& name, char** argv) : name_{name}, argv_{argv} {}
+
+    void setup() {
+        replica_num_ = SISL_OPTIONS["replica_num"].as< uint16_t >();
+        sisl::logging::SetLogger(name_ + std::string("_replica_") + std::to_string(replica_num_));
+        auto const num_replicas = SISL_OPTIONS["replicas"].as< uint32_t >();
+
+        boost::uuids::string_generator gen;
+        for (uint32_t i{0}; i < num_replicas; ++i) {
+            auto replica_id = gen(fmt::format("{:04}", i) + std::string("0123456789abcdef0123456789ab"));
+            up_members_.insert(i);
+            if (i == replica_num_) { my_replica_id_ = replica_id; }
+            members_.insert(std::pair(replica_id, i));
+        }
+
+        if (replica_num_ == 0) {
+            // Erase any previous shmem and create a new shmem holding the IPCData structure
+            bip::shared_memory_object::remove("raft_repl_test_shmem");
+            shm_ = std::make_unique< bip::shared_memory_object >(bip::create_only, "raft_repl_test_shmem",
+                                                                 bip::read_write);
+            shm_->truncate(sizeof(IPCData));
+            region_ = std::make_unique< bip::mapped_region >(*shm_, bip::read_write);
+            ipc_data_ = new (region_->get_address()) IPCData;
+
+            for (uint32_t i{1}; i < num_replicas; ++i) {
+                LOGINFO("Spawning Homestore replica={} instance", i);
+                boost::process::child c(argv_[0], "--replica_num", std::to_string(i), proc_grp_);
+                c.detach();
+            }
+        } else {
+            shm_ =
+                std::make_unique< bip::shared_memory_object >(bip::open_only, "raft_repl_test_shmem", bip::read_write);
+            region_ = std::make_unique< bip::mapped_region >(*shm_, bip::read_write);
+            ipc_data_ = static_cast< IPCData* >(region_->get_address());
+        }
+
+        int tmp_argc = 1;
+        folly_ = std::make_unique< folly::Init >(&tmp_argc, &argv_, true);
+
+        LOGINFO("Starting Homestore replica={}", replica_num_);
+        test_common::HSTestHelper::start_homestore(
+            name_ + std::to_string(replica_num_),
+            {{HS_SERVICE::META, {.size_pct = 5.0}},
+             {HS_SERVICE::REPLICATION, {.size_pct = 60.0, .repl_app = std::make_unique< TestReplApplication >(*this)}},
+             {HS_SERVICE::LOG_REPLICATED, {.size_pct = 20.0}},
+             {HS_SERVICE::LOG_LOCAL, {.size_pct = 2.0}}});
+    }
+
+    void teardown() {
+        LOGINFO("Stopping Homestore replica={}", replica_num_);
+        sisl::GrpcAsyncClientWorker::shutdown_all();
test_common::HSTestHelper::shutdown_homestore(); + } + + void reset_setup() { + teardown(); + setup(); + } + + uint16_t replica_num() const { return replica_num_; } + + Runner& runner() { return io_runner_; } + + void register_listener(std::shared_ptr< ReplDevListener > listener) { + if (replica_num_ != 0) { pending_listeners_.emplace_back(std::move(listener)); } + + ipc_data_->sync_for_member_start(); + + if (replica_num_ == 0) { + std::set< homestore::replica_id_t > members; + std::transform(members_.begin(), members_.end(), std::inserter(members, members.end()), + [](auto const& p) { return p.first; }); + group_id_t repl_group_id = hs_utils::gen_random_uuid(); + { + std::unique_lock lg(groups_mtx_); + repl_groups_.insert({repl_group_id, std::move(listener)}); + } + + auto v = hs()->repl_service().create_repl_dev(repl_group_id, members).get(); + ASSERT_EQ(v.hasValue(), true) + << "Error in creating repl dev for group_id=" << boost::uuids::to_string(repl_group_id).c_str(); + } + } + + std::shared_ptr< ReplDevListener > get_listener(homestore::group_id_t group_id) { + std::unique_lock lg(groups_mtx_); + + auto it = repl_groups_.find(group_id); + if ((it != repl_groups_.end()) && (it->second != nullptr)) { return it->second; } + + RELEASE_ASSERT(!pending_listeners_.empty(), + "Looking for listener for group_id, but register_listener was not called"); + + auto listener = std::move(pending_listeners_[0]); + repl_groups_.insert(std::pair(group_id, listener)); + pending_listeners_.erase(pending_listeners_.begin()); + return listener; + } + + void sync_for_test_start() { ipc_data_->sync_for_test_start(); } + void sync_for_verify_start() { ipc_data_->sync_for_verify_start(); } + void sync_for_cleanup_start() { ipc_data_->sync_for_cleanup_start(); } + void sync_dataset_size(uint64_t dataset_size) { ipc_data_->test_dataset_size_ = dataset_size; } + uint64_t dataset_size() const { return ipc_data_->test_dataset_size_; } + +private: + uint16_t replica_num_; + std::string name_; + char** argv_; + + boost::process::group proc_grp_; + std::unique_ptr< bip::shared_memory_object > shm_; + std::unique_ptr< bip::mapped_region > region_; + std::unique_ptr< folly::Init > folly_; + + std::mutex groups_mtx_; + std::condition_variable group_created_cv_; + std::map< homestore::group_id_t, std::shared_ptr< homestore::ReplDevListener > > repl_groups_; + std::vector< std::shared_ptr< homestore::ReplDevListener > > pending_listeners_; // pending to join raft group + std::map< homestore::replica_id_t, uint32_t > members_; + std::set< uint32_t > up_members_; + homestore::replica_id_t my_replica_id_; + + std::mutex wakeup_mtx_; + uint32_t wokenup_replicas_{0}; + std::condition_variable wakeup_cv_; + + IPCData* ipc_data_; + + Runner io_runner_; +}; +} // namespace test_common \ No newline at end of file diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index 39e8f2112..5af59445f 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -816,13 +816,13 @@ TEST_F(BlkDataServiceTest, TestRandMixIOLoad) { // Stream related test -SISL_OPTION_GROUP( - test_data_service, - (run_time, "", "run_time", "running time in seconds", ::cxxopts::value< uint64_t >()->default_value("30"), - "number"), - (min_io_size, "", "min_io_size", "mim io size", ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (max_io_size, "", "max_io_size", "max io size", ::cxxopts::value< uint32_t >()->default_value("4096"), "number"), - (num_io, "", "num_io", "number of io", 
::cxxopts::value< uint64_t >()->default_value("300"), "number"));
+SISL_OPTION_GROUP(test_data_service,
+                  (run_time, "", "run_time", "running time in seconds",
+                   ::cxxopts::value< uint64_t >()->default_value("30"), "number"),
+                  (min_io_size, "", "min_io_size", "min io size", ::cxxopts::value< uint32_t >()->default_value("4096"),
+                   "number"),
+                  (max_io_size, "", "max_io_size", "max io size", ::cxxopts::value< uint32_t >()->default_value("4096"),
+                   "number"));

 int main(int argc, char* argv[]) {
     int parsed_argc{argc};
diff --git a/src/tests/test_home_raft_logstore.cpp b/src/tests/test_home_raft_logstore.cpp
new file mode 100644
index 000000000..d9c9df4c5
--- /dev/null
+++ b/src/tests/test_home_raft_logstore.cpp
@@ -0,0 +1,275 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "test_common/homestore_test_common.hpp"
+#include "replication/log_store/home_raft_log_store.h"
+
+using namespace homestore;
+
+SISL_LOGGING_INIT(HOMESTORE_LOG_MODS)
+
+static constexpr uint32_t g_max_logsize{512};
+static std::random_device g_rd{};
+static std::default_random_engine g_re{g_rd()};
+static std::uniform_int_distribution< uint32_t > g_randlogsize_generator{2, g_max_logsize};
+std::vector< std::string > test_common::HSTestHelper::s_dev_names;
+
+static constexpr std::array< const char, 62 > alphanum{
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
+    'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'};
+
+static std::string gen_random_string(size_t len, uint64_t preamble = std::numeric_limits< uint64_t >::max()) {
+    std::string str;
+    if (preamble != std::numeric_limits< uint64_t >::max()) {
+        std::stringstream ss;
+        ss << std::setw(8) << std::setfill('0') << std::hex << preamble;
+        str += ss.str();
+    }
+
+    std::uniform_int_distribution< size_t > rand_char{0, alphanum.size() - 1};
+    for (size_t i{0}; i < len; ++i) {
+        str += alphanum[rand_char(g_re)];
+    }
+    str += '\0';
+    return str;
+}
+
+struct pack_result_t {
+    raft_buf_ptr_t actual_data;
+    std::vector< std::string > exp_data;
+};
+
+class RaftLogStoreClient {
+public:
+    friend class TestRaftLogStore;
+
+    void append_read_test(uint32_t num_entries) {
+        ASSERT_EQ(m_rls->next_slot(), m_next_lsn);
+        ASSERT_EQ(m_rls->start_index(), m_start_lsn);
+
+        auto max_lsn_this_iter = uint64_cast(m_next_lsn) + num_entries;
+        for (uint64_t lsn = m_next_lsn; lsn <= max_lsn_this_iter; ++lsn) {
+            auto le = make_log(m_cur_term, lsn);
+            int64_t const store_sn = m_rls->append(le);
+
+            ASSERT_EQ(lsn, store_sn);
+            ASSERT_EQ(m_rls->next_slot(), lsn + 1);
+            validate_log(m_rls->last_entry(), lsn);
+
+            ++m_next_lsn;
+        }
+        m_rls->flush();
+        ASSERT_EQ(m_rls->start_index(), m_start_lsn) << "Start index not expected to be updated after insertion";
+    }
+
+    void rollback_test() {
+        m_next_lsn = (m_next_lsn - m_start_lsn) / 2; // Rollback half of the current logs
+        ++m_cur_term;
+        auto le = make_log(m_cur_term, m_next_lsn);
+        m_rls->write_at(m_next_lsn, le); // Rollback and write with the next term
+        m_shadow_log.erase(m_shadow_log.begin() + m_next_lsn, m_shadow_log.end());
+        ++m_next_lsn;
+
+        ASSERT_EQ(m_rls->next_slot(), m_next_lsn) << "Post rollback, next slot doesn't have expected value";
+        validate_log(m_rls->last_entry(), m_next_lsn - 1);
+        validate_all_logs();
+    }
+
+    void compact_test(uint32_t num_records) {
+        uint64_t compact_upto = m_start_lsn + num_records - 1;
num_records - 1; + + // Mirror the logstore's expected behavior: if we compact beyond the next insertion index, we should + // reset the next insertion slot; the logstore is expected to create holes and fill them with dummy entries. + if (compact_upto >= uint64_cast(m_next_lsn)) { m_next_lsn = compact_upto + 1; } + + m_start_lsn = compact_upto + 1; + m_rls->compact(compact_upto); + ASSERT_EQ(m_rls->start_index(), m_start_lsn) << "Post compaction, start_index is invalid"; + validate_all_logs(); + } + + void pack_test(uint64_t from, int32_t cnt, pack_result_t& out_pack) { + out_pack.actual_data = m_rls->pack(from, cnt); + ASSERT_NE(out_pack.actual_data.get(), nullptr); + out_pack.exp_data.assign(m_shadow_log.begin() + from - 1, m_shadow_log.begin() + from + cnt - 1); + } + + pack_result_t pack_test() { + pack_result_t p; + pack_test(m_start_lsn, m_next_lsn - m_start_lsn, p); + return p; + } + + void unpack_test(const pack_result_t& p) { + m_rls->apply_pack(m_next_lsn, *p.actual_data); + m_shadow_log.insert(std::end(m_shadow_log), p.exp_data.begin(), p.exp_data.end()); + m_next_lsn += p.exp_data.size(); + validate_all_logs(); + } + + size_t total_records() const { return m_shadow_log.size() - m_start_lsn + 1; } + + void validate_all_logs() { + // Do basic read validation + ASSERT_EQ(m_rls->next_slot(), m_next_lsn); + ASSERT_EQ(m_rls->start_index(), m_start_lsn); + + if (m_next_lsn > m_start_lsn) { validate_log(m_rls->last_entry(), m_next_lsn - 1); } + + // Do individual get validation + for (uint64_t lsn = m_start_lsn; lsn < uint64_cast(m_next_lsn); ++lsn) { + validate_log(m_rls->entry_at(lsn), lsn); + } + + // Do bulk get validation as well. + auto lsn = m_start_lsn; + auto const entries = m_rls->log_entries(m_start_lsn, m_next_lsn); + ASSERT_EQ(entries->size(), uint64_cast(m_next_lsn - m_start_lsn)); + for (const auto& le : *entries) { + validate_log(le, lsn++); + } + } + +private: + nuraft::ptr< nuraft::log_entry > make_log(uint64_t term, uint64_t lsn) { + auto val = gen_random_string(g_randlogsize_generator(g_re), term); + raft_buf_ptr_t buf = nuraft::buffer::alloc(val.size() + 1); + buf->put(val); + m_shadow_log[lsn - 1] = std::move(val); + return nuraft::cs_new< nuraft::log_entry >(term, buf); + } + + void validate_log(const nuraft::ptr< nuraft::log_entry >& le, int64_t lsn) { + uint64_t expected_term; + std::stringstream ss; + ss << std::hex << m_shadow_log[lsn - 1].substr(0, 8); + ss >> expected_term; + ASSERT_EQ(le->get_term(), expected_term) << "Term mismatch at lsn=" << lsn; + + nuraft::buffer& buf = le->get_buf(); + buf.pos(0); + auto bytes = buf.get_raw(buf.size()); + + ASSERT_EQ(buf.size() - 1, m_shadow_log[lsn - 1].size()) << "Size from log and shadow mismatch for lsn=" << lsn; + ASSERT_EQ(std::string(r_cast< const char* >(bytes), buf.size() - 1), m_shadow_log[lsn - 1]) + << "Log entry mismatch for lsn=" << lsn; + buf.pos(0); + } + +private: + homestore::logstore_id_t m_store_id{UINT32_MAX}; + std::unique_ptr< HomeRaftLogStore > m_rls; + sisl::sparse_vector< std::string > m_shadow_log; + uint64_t m_cur_term{1}; + int64_t m_next_lsn{1}; + int64_t m_start_lsn{1}; +}; + +class TestRaftLogStore : public ::testing::Test { +public: + void SetUp() { + test_common::HSTestHelper::start_homestore("test_home_raft_log_store", + {{HS_SERVICE::META, {.size_pct = 5.0}}, + {HS_SERVICE::LOG_REPLICATED, {.size_pct = 70.0}}, + {HS_SERVICE::LOG_LOCAL, {.size_pct = 2.0}}}); + m_leader_store.m_rls = std::make_unique< HomeRaftLogStore >(); + m_leader_store.m_store_id = 
m_leader_store.m_rls->logstore_id(); + + m_follower_store.m_rls = std::make_unique< HomeRaftLogStore >(); + m_follower_store.m_store_id = m_follower_store.m_rls->logstore_id(); + } + + void restart() { + m_leader_store.m_rls.reset(); + m_follower_store.m_rls.reset(); + + test_common::HSTestHelper::start_homestore( + "test_home_raft_log_store", + {{HS_SERVICE::META, {}}, {HS_SERVICE::LOG_REPLICATED, {}}, {HS_SERVICE::LOG_LOCAL, {}}}, + [this]() { + m_leader_store.m_rls = std::make_unique< HomeRaftLogStore >(m_leader_store.m_store_id); + m_follower_store.m_rls = std::make_unique< HomeRaftLogStore >(m_follower_store.m_store_id); + }, + true /* restart */); + } + + virtual void TearDown() override { + m_leader_store.m_rls.reset(); + m_follower_store.m_rls.reset(); + test_common::HSTestHelper::shutdown_homestore(); + } + +protected: + RaftLogStoreClient m_leader_store; + RaftLogStoreClient m_follower_store; +}; + +TEST_F(TestRaftLogStore, lifecycle_test) { + auto nrecords = SISL_OPTIONS["num_records"].as< uint32_t >(); + + LOGINFO("Step 1: Append and test {} records", nrecords); + this->m_leader_store.append_read_test(nrecords); // assuming nrecords = 1000, total_records = 1000 + + LOGINFO("Step 2: Rollback half of the records"); + this->m_leader_store.rollback_test(); // total_records = 500 + + LOGINFO("Step 3: Post rollback add {} records", nrecords); + this->m_leader_store.append_read_test(nrecords); // total_records = 1500 + + auto shrink_records = (this->m_leader_store.total_records() * 10) / 100; + LOGINFO("Step 4: Compact the first 10% of records = {}", shrink_records); + this->m_leader_store.compact_test(shrink_records); // total_records = 1350 + + LOGINFO("Step 5: Post compaction add {} records", nrecords); + this->m_leader_store.append_read_test(nrecords); // total_records = 2350 + + shrink_records = this->m_leader_store.total_records() + (this->m_leader_store.total_records() * 10) / 100; + LOGINFO("Step 6: Compact {} records, 10% beyond the max appended entries", shrink_records); + this->m_leader_store.compact_test(shrink_records); // total_records = 0 + + LOGINFO("Step 7: Post compaction add {} records", nrecords); + this->m_leader_store.append_read_test(nrecords); // total_records = 1000 + + LOGINFO("Step 8: Pack all records"); + auto pack_data = this->m_leader_store.pack_test(); // total_records = 1000 + + LOGINFO("Step 9: Unpack all records on an empty logstore"); + this->m_follower_store.unpack_test(pack_data); // total_records in follower = 1000 + + LOGINFO("Step 10: Append {} more records to the follower logstore", nrecords); + this->m_follower_store.append_read_test(nrecords); // total_records in follower = 2000 + + LOGINFO("Step 11: Unpack the same leader records again after the appended records"); + this->m_follower_store.unpack_test(pack_data); // total_records in follower = 3000 + + LOGINFO("Step 12: Restart homestore and validate recovery"); + this->restart(); + this->m_leader_store.validate_all_logs(); + this->m_follower_store.validate_all_logs(); + + LOGINFO("Step 13: Post recovery do append test"); + this->m_leader_store.append_read_test(nrecords); // total_records in leader = 2000 + this->m_follower_store.append_read_test(nrecords); // total_records in follower = 4000 +} + +SISL_OPTIONS_ENABLE(logging, test_home_raft_log_store, iomgr, test_common_setup) +SISL_OPTION_GROUP(test_home_raft_log_store, + (num_records, "", "num_records", "number of records to test", + ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + (iterations, "", "iterations", "Iterations", 
::cxxopts::value< uint32_t >()->default_value("1"), + "the number of iterations to run each test")); + +int main(int argc, char* argv[]) { + int parsed_argc = argc; + ::testing::InitGoogleTest(&parsed_argc, argv); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_home_raft_log_store, iomgr, test_common_setup); + sisl::logging::SetLogger("test_home_raft_log_store"); + spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); + + return RUN_ALL_TESTS(); +} diff --git a/src/tests/test_index_btree.cpp b/src/tests/test_index_btree.cpp index 1872b79f5..1e833b059 100644 --- a/src/tests/test_index_btree.cpp +++ b/src/tests/test_index_btree.cpp @@ -33,6 +33,7 @@ SISL_LOGGING_DECL(test_index_btree) std::vector< std::string > test_common::HSTestHelper::s_dev_names; + // TODO Add tests to do write,remove after recovery. // TODO Test with var len key with io mgr page size is 512. @@ -409,6 +410,7 @@ TYPED_TEST(BtreeTest, ThreadedCpFlush) { } template < typename TestType > + struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testing::Test { using T = TestType; @@ -417,6 +419,7 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin class TestIndexServiceCallbacks : public IndexServiceCallbacks { public: TestIndexServiceCallbacks(BtreeConcurrentTest* test) : m_test(test) {} + std::shared_ptr< IndexTableBase > on_index_table_found(superblk< index_table_sb >&& sb) override { LOGINFO("Index table recovered"); LOGINFO("Root bnode_id {} version {}", sb->root_node, sb->link_version); @@ -428,6 +431,7 @@ struct BtreeConcurrentTest : public BtreeTestHelper< TestType >, public ::testin BtreeConcurrentTest* m_test; }; + BtreeConcurrentTest() : testing::Test() { this->m_is_multi_threaded = true; } void SetUp() override { diff --git a/src/tests/test_journal_vdev.cpp b/src/tests/test_journal_vdev.cpp index 39abf9b43..693eb0925 100644 --- a/src/tests/test_journal_vdev.cpp +++ b/src/tests/test_journal_vdev.cpp @@ -342,8 +342,6 @@ SISL_OPTION_GROUP(test_vdev, ::cxxopts::value< uint32_t >()->default_value("8192"), "number"), (run_time, "", "run_time", "running time in seconds", ::cxxopts::value< uint64_t >()->default_value("30"), "number"), - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("3000"), - "number"), (per_read, "", "per_read", "read percentage of io that are reads", ::cxxopts::value< uint32_t >()->default_value("20"), "number"), (per_write, "", "per_write", "write percentage of io that are writes", diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index d7ad85012..656092105 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -68,8 +68,12 @@ struct test_log_data { uint32_t size; uint8_t* get_data() { return uintptr_cast(this) + sizeof(test_log_data); }; + uint8_t const* get_data_const() const { return r_cast< uint8_t const* >(this) + sizeof(test_log_data); } const uint8_t* get_data() const { return r_cast< const uint8_t* >(this) + sizeof(test_log_data); } uint32_t total_size() const { return sizeof(test_log_data) + size; } + std::string get_data_str() const { + return std::string(r_cast< const char* >(get_data_const()), static_cast< size_t >(size)); + } }; typedef std::function< void(logstore_family_id_t, logstore_seq_num_t, logdev_key) > test_log_store_comp_cb_t; @@ -187,7 +191,7 @@ class SampleLogStoreClient { if ((hole_entry != hole_end) && hole_entry->second) { // Hole entry exists, but filled EXPECT_EQ(b.size(), 0ul); } else { - auto* tl = r_cast< test_log_data* 
>(b.bytes()); + auto const* tl = r_cast< test_log_data const* >(b.bytes()); EXPECT_EQ(tl->total_size(), b.size()); validate_data(tl, seq_num); } @@ -245,15 +249,15 @@ class SampleLogStoreClient { ASSERT_EQ(b.size(), 0ul) << "Expected null entry for lsn=" << m_log_store->get_store_id() << ":" << i; } else { - auto* tl = r_cast< test_log_data* >(b.bytes()); + auto* tl = r_cast< test_log_data const* >(b.bytes()); ASSERT_EQ(tl->total_size(), b.size()) << "Size Mismatch for lsn=" << m_log_store->get_store_id() << ":" << i; validate_data(tl, i); } } catch (const std::exception& e) { if (!expect_all_completed) { - // In case we run truncation in parallel to read, it is possible truncate moved, so adjust the - // truncated_upto accordingly. + // In case we run truncation in parallel to reads, it is possible that truncation moved, so + // adjust the truncated_upto accordingly. const auto trunc_upto = m_log_store->truncated_upto(); if (i <= trunc_upto) { i = trunc_upto; @@ -328,7 +332,7 @@ class SampleLogStoreClient { LOGDEBUG("Recovered lsn {}:{} with log data of size {}", m_log_store->get_store_id(), lsn, buf.size()) EXPECT_LE(lsn, m_cur_lsn.load()) << "Recovered incorrect lsn " << m_log_store->get_store_id() << ":" << lsn << "Expected less than cur_lsn " << m_cur_lsn.load(); - auto* tl = r_cast< test_log_data* >(buf.bytes()); + auto* tl = r_cast< test_log_data const* >(buf.bytes()); validate_data(tl, lsn); // Count only the ones which are after truncated, because recovery could receive even truncated lsns @@ -378,7 +382,7 @@ class SampleLogStoreClient { private: void validate_data(const test_log_data* d, const logstore_seq_num_t lsn) { const char c = static_cast< char >((lsn % 94) + 33); - const std::string actual{r_cast< const char* >(d->get_data()), static_cast< size_t >(d->size)}; + const std::string actual = d->get_data_str(); const std::string expected(static_cast< size_t >(d->size), c); // needs to be () because of same reason as vector ASSERT_EQ(actual, expected) << "Data mismatch for LSN=" << m_log_store->get_store_id() << ":" << lsn @@ -688,10 +692,11 @@ class LogStoreTest : public ::testing::Test { if (lsc->has_all_lsns_truncated()) ++n_fully_truncated; } - // While inserts are going on, truncation can guaranteed to be forward progressed if none of the log - // stores are fully truncated. If all stores are fully truncated, its obvious no progress, but even - // if one of the store is fully truncated, then it might be possible that logstore is holding lowest - // logdev location and waiting for next flush to finish to move the safe logdev location. + // While inserts are going on, truncation is guaranteed to make forward progress as long as none + // of the log stores is fully truncated. If all stores are fully truncated, there is obviously no + // progress; but even if just one store is fully truncated, that store might be holding the lowest + // logdev location, waiting for the next flush to finish before the safe logdev location can move + // (see the illustrative note below). 
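+ // Illustrative note (editor's sketch, not part of the logdev API): the safe device-level
+ // truncation point behaves like a min-fold over every store's pinned logdev offset, e.g.
+ //   safe_offset = min(store_0.trunc_dev_offset, ..., store_n.trunc_dev_offset);
+ // a fully-truncated store keeps a stale offset pinned until its next flush completes, so the
+ // min, and with it device-level truncation, cannot advance even when all other stores moved on.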
expect_forward_progress = (n_fully_truncated == 0); } @@ -954,10 +959,10 @@ TEST_F(LogStoreTest, VarRateInsertThenTruncate) { for (uint32_t iteration{0}; iteration < iterations; ++iteration) { LOGINFO("Iteration {}", iteration); - LOGINFO( - "Step 1: Reinit the num records={} and insert them as batch of 10 with qdepth=500 and wait for all records " - "to be inserted and then validate them", - nrecords); + LOGINFO("Step 1: Reinit the num records={} and insert them as batch of 10 with qdepth=500 and wait for all " + "records " + "to be inserted and then validate them", + nrecords); this->init(nrecords); this->kickstart_inserts(10, 500); this->wait_for_inserts(); @@ -980,10 +985,10 @@ TEST_F(LogStoreTest, VarRateInsertThenTruncate) { this->truncate_validate(); } - LOGINFO( - "Step 3: Change data rate on stores 0,1 but still slower than other stores, write num_records={} wait for " - "their completion, validate it is readable, then truncate - all in a loop for 3 times", - nrecords); + LOGINFO("Step 3: Change data rate on stores 0,1 but still slower than other stores, write num_records={} " + "wait for " + "their completion, validate it is readable, then truncate - all in a loop for 3 times", + nrecords); for (auto i{0u}; i < 3u; ++i) { LOGINFO("Step 3.{}.1: Write and wait for {}", i + 1, nrecords); this->init(nrecords, {{0, 5}, {1, 20}}); @@ -1248,10 +1253,10 @@ TEST_F(LogStoreTest, WriteSyncThenRead) { } auto b = tmp_log_store->read_sync(i); - auto* tl = r_cast< test_log_data* >(b.bytes()); + auto* tl = r_cast< test_log_data const* >(b.bytes()); ASSERT_EQ(tl->total_size(), b.size()) << "Size Mismatch for lsn=" << store_id << ":" << i; const char c = static_cast< char >((i % 94) + 33); - const std::string actual{r_cast< const char* >(tl->get_data()), static_cast< size_t >(tl->size)}; + const std::string actual = tl->get_data_str(); const std::string expected(static_cast< size_t >(tl->size), c); // needs to be () because of same reason as vector ASSERT_EQ(actual, expected) << "Data mismatch for LSN=" << store_id << ":" << i << " size=" << tl->size; diff --git a/src/tests/test_mem_btree.cpp b/src/tests/test_mem_btree.cpp index 44995e360..f6df10d0e 100644 --- a/src/tests/test_mem_btree.cpp +++ b/src/tests/test_mem_btree.cpp @@ -30,7 +30,8 @@ #include "btree_helpers/btree_test_helper.hpp" using namespace homestore; -SISL_LOGGING_INIT(btree, iomgr, io_wd, flip) +SISL_LOGGING_DEF(btree) +SISL_LOGGING_INIT(btree) SISL_OPTIONS_ENABLE(logging, test_mem_btree) SISL_OPTION_GROUP( @@ -105,7 +106,8 @@ struct BtreeTest : public BtreeTestHelper< TestType >, public ::testing::Test { } }; -using BtreeTypes = testing::Types< PrefixIntervalBtreeTest, FixedLenBtreeTest, VarKeySizeBtreeTest, +// TODO Enable PrefixIntervalBtreeTest later +using BtreeTypes = testing::Types; TYPED_TEST_SUITE(BtreeTest, BtreeTypes); diff --git a/src/tests/test_meta_blk_mgr.cpp b/src/tests/test_meta_blk_mgr.cpp index 1c3e6ab63..5e15900ed 100644 --- a/src/tests/test_meta_blk_mgr.cpp +++ b/src/tests/test_meta_blk_mgr.cpp @@ -372,7 +372,7 @@ class VMetaBlkMgrTest : public ::testing::Test { iomanager.iobuf_free(buf); } else { if (unaligned_addr) { - delete[](buf - unaligned_shift); + delete[] (buf - unaligned_shift); } else { delete[] buf; } @@ -915,7 +915,6 @@ SISL_OPTION_GROUP( "number"), (max_write_size, "", "max_write_size", "maximum write size", ::cxxopts::value< uint32_t >()->default_value("524288"), "number"), - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"), "number"), (overflow, "", 
"overflow", "always do overflow", ::cxxopts::value< uint32_t >()->default_value("0"), "number"), (per_update, "", "per_update", "update percentage", ::cxxopts::value< uint32_t >()->default_value("20"), "number"), (per_write, "", "per_write", "write percentage", ::cxxopts::value< uint32_t >()->default_value("60"), "number"), diff --git a/src/tests/test_raft_repl_dev.cpp b/src/tests/test_raft_repl_dev.cpp new file mode 100644 index 000000000..b42f04d94 --- /dev/null +++ b/src/tests/test_raft_repl_dev.cpp @@ -0,0 +1,265 @@ +/********************************************************************************* + * Modifications Copyright 2017-2019 eBay Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed + * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + * CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + * + *********************************************************************************/ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "common/homestore_config.hpp" +#include "common/homestore_assert.hpp" +#include "common/homestore_utils.hpp" +#include "test_common/hs_repl_test_common.hpp" +#include "replication/service/raft_repl_service.h" +#include "replication/repl_dev/raft_repl_dev.h" + +using namespace homestore; + +SISL_LOGGING_DEF(test_raft_repl_dev) +SISL_LOGGING_INIT(HOMESTORE_LOG_MODS) + +SISL_OPTION_GROUP(test_raft_repl_dev, + (block_size, "", "block_size", "block size to io", + ::cxxopts::value< uint32_t >()->default_value("4096"), "number")); +SISL_OPTIONS_ENABLE(logging, test_raft_repl_dev, iomgr, test_common_setup, test_repl_common_setup) + +static std::unique_ptr< test_common::HSReplTestHelper > g_helper; + +class TestReplicatedDB : public homestore::ReplDevListener { +public: + struct Key { + uint64_t id_; + bool operator<(Key const& other) const { return id_ < other.id_; } + }; + + struct Value { + int64_t lsn_; + uint64_t data_size_; + uint64_t data_pattern_; + MultiBlkId blkid_; + }; + + struct test_req : public repl_req_ctx { + struct journal_header { + uint64_t data_size; + uint64_t data_pattern; + }; + + journal_header jheader; + uint64_t key_id; + sisl::sg_list write_sgs; + sisl::sg_list read_sgs; + + sisl::blob header_blob() { return sisl::blob(uintptr_cast(&jheader), sizeof(journal_header)); } + sisl::blob key_blob() { return sisl::blob{uintptr_cast(&key_id), sizeof(uint64_t)}; } + + test_req() { + write_sgs.size = 0; + read_sgs.size = 0; + key_id = (uint64_t)rand() << 32 | rand(); + } + + ~test_req() { + for (auto const& iov : write_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + + for (auto const& iov : read_sgs.iovs) { + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + } + }; + + TestReplicatedDB() = default; + virtual ~TestReplicatedDB() = default; + + void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("[Replica={}] Received commit on lsn={}", g_helper->replica_num(), lsn); + 
// The header and key blobs round-trip verbatim through the raft journal: header carries the + // journal_header serialized by test_req::header_blob() and key carries the raw 8-byte id from + // key_blob(), so commit is a straight fixed-layout decode of both blobs into the in-memory map. + ASSERT_EQ(header.size(), sizeof(test_req::journal_header)); + + auto jheader = r_cast< test_req::journal_header const* >(header.cbytes()); + Key k{.id_ = *(r_cast< uint64_t const* >(key.cbytes()))}; + Value v{ + .lsn_ = lsn, .data_size_ = jheader->data_size, .data_pattern_ = jheader->data_pattern, .blkid_ = blkids}; + + { + std::unique_lock lk(db_mtx_); + inmem_db_.insert_or_assign(k, v); + } + + if (ctx->is_proposer) { g_helper->runner().next_task(); } + } + + bool on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("[Replica={}] Received pre-commit on lsn={}", g_helper->replica_num(), lsn); + return true; + } + + void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, + cintrusive< repl_req_ctx >& ctx) override { + LOGINFO("[Replica={}] Received rollback on lsn={}", g_helper->replica_num(), lsn); + } + + blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { + return blk_alloc_hints{}; + } + + void on_replica_stop() override {} + + void db_write(uint64_t data_size, uint32_t max_size_per_iov) { + auto req = intrusive< test_req >(new test_req()); + req->jheader.data_size = data_size; + req->jheader.data_pattern = ((long long)rand() << 32) | rand(); + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + + if (data_size != 0) { + req->write_sgs = + test_common::HSTestHelper::create_sgs(data_size, max_size_per_iov, req->jheader.data_pattern); + } + + repl_dev()->async_alloc_write(req->header_blob(), req->key_blob(), req->write_sgs, req); + } + + void validate_db_data() { + g_helper->runner().set_num_tasks(inmem_db_.size()); + + LOGINFO("[{}]: Total {} keys committed, validating them", boost::uuids::to_string(repl_dev()->group_id()), + inmem_db_.size()); + auto it = inmem_db_.begin(); + g_helper->runner().set_task([this, &it]() { + Key k; + Value v; + { + std::unique_lock lk(db_mtx_); + std::tie(k, v) = *it; + ++it; + } + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + auto read_sgs = test_common::HSTestHelper::create_sgs(v.data_size_, block_size); + + repl_dev()->async_read(v.blkid_, read_sgs, v.data_size_).thenValue([read_sgs, k, v](auto const ec) { + RELEASE_ASSERT(!ec, "Read of blkid={} for key={} error={}", v.blkid_.to_string(), k.id_, ec.message()); + for (auto const& iov : read_sgs.iovs) { + test_common::HSTestHelper::validate_data_buf(uintptr_cast(iov.iov_base), iov.iov_len, + v.data_pattern_); + iomanager.iobuf_free(uintptr_cast(iov.iov_base)); + } + g_helper->runner().next_task(); + }); + }); + g_helper->runner().execute().get(); + } + + uint64_t db_size() const { + std::shared_lock lk(db_mtx_); + return inmem_db_.size(); + } + +private: + std::map< Key, Value > inmem_db_; + mutable std::shared_mutex db_mtx_; // mutable so that db_size() const can take a shared_lock +}; + +class RaftReplDevTest : public testing::Test { +public: + void SetUp() override { + // By default, create one db; tests can register more listeners as needed + auto db = std::make_shared< TestReplicatedDB >(); + g_helper->register_listener(db); + dbs_.emplace_back(std::move(db)); + } + + void generate_writes(uint64_t data_size, uint32_t max_size_per_iov) { + pick_one_db().db_write(data_size, max_size_per_iov); + } + + void wait_for_all_writes(uint64_t exp_writes) { + while (true) { + uint64_t total_writes{0}; + for (auto const& db : dbs_) { + total_writes += db->db_size(); + } + + if (total_writes >= exp_writes) { break; } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + void validate_all_data() { + for (auto const& db : 
dbs_) { + db->validate_db_data(); + } + } + + TestReplicatedDB& pick_one_db() { return *dbs_[0]; } + +private: + std::vector< std::shared_ptr< TestReplicatedDB > > dbs_; +}; + +TEST_F(RaftReplDevTest, All_Append) { + LOGINFO("Homestore replica={} setup completed", g_helper->replica_num()); + g_helper->sync_for_test_start(); + + if (g_helper->replica_num() == 0) { + g_helper->sync_dataset_size(SISL_OPTIONS["num_io"].as< uint64_t >()); + auto block_size = SISL_OPTIONS["block_size"].as< uint32_t >(); + LOGINFO("Run on worker threads to schedule appends on the repl dev, {} bytes per write.", block_size); + g_helper->runner().set_task([this, block_size]() { this->generate_writes(block_size, block_size); }); + g_helper->runner().execute().get(); + } + + this->wait_for_all_writes(g_helper->dataset_size()); + + g_helper->sync_for_verify_start(); + LOGINFO("Validate all data written so far by reading it back"); + this->validate_all_data(); + + g_helper->sync_for_cleanup_start(); +} + +int main(int argc, char* argv[]) { + int parsed_argc{argc}; + char** orig_argv = argv; + + ::testing::InitGoogleTest(&parsed_argc, argv); + SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_raft_repl_dev, iomgr, test_common_setup, test_repl_common_setup); + + FLAGS_folly_global_cpu_executor_threads = 4; + g_helper = std::make_unique< test_common::HSReplTestHelper >("test_raft_repl_dev", orig_argv); + g_helper->setup(); + + (g_helper->replica_num() == 0) ? ::testing::GTEST_FLAG(filter) = "*Primary_*:*All_*" + : ::testing::GTEST_FLAG(filter) = "*Secondary_*:*All_*"; + + auto ret = RUN_ALL_TESTS(); + g_helper->teardown(); + return ret; +} diff --git a/src/tests/test_solo_repl_dev.cpp b/src/tests/test_solo_repl_dev.cpp index b0527648a..492a8006a 100644 --- a/src/tests/test_solo_repl_dev.cpp +++ b/src/tests/test_solo_repl_dev.cpp @@ -35,7 +35,7 @@ #include "common/homestore_assert.hpp" #include "common/homestore_utils.hpp" #include "test_common/homestore_test_common.hpp" -#include "replication/service/repl_service_impl.h" +#include "replication/service/generic_repl_svc.h" #include "replication/repl_dev/solo_repl_dev.h" //////////////////////////////////////////////////////////////////////////// @@ -60,63 +60,6 @@ static constexpr uint64_t Ki{1024}; static constexpr uint64_t Mi{Ki * Ki}; static constexpr uint64_t Gi{Ki * Mi}; -struct Runner { - uint64_t total_tasks{0}; - uint32_t qdepth{8}; - std::atomic< uint64_t > issued_tasks{0}; - std::atomic< uint64_t > pending_tasks{0}; - std::function< void(void) > task; - folly::Promise< folly::Unit > comp_promise; - - Runner(uint64_t num_tasks, uint32_t qd = 8) : total_tasks{num_tasks}, qdepth{qd} { - if (total_tasks < (uint64_t)qdepth) { total_tasks = qdepth; } - } - - Runner() : Runner{SISL_OPTIONS["num_io"].as< uint64_t >()} {} - - void set_task(std::function< void(void) > f) { task = std::move(f); } - - folly::Future< folly::Unit > execute() { - for (uint32_t i{0}; i < qdepth; ++i) { - run_task(); - } - return comp_promise.getFuture(); - } - - void next_task() { - auto ptasks = pending_tasks.fetch_sub(1) - 1; - if ((issued_tasks.load() < total_tasks)) { - run_task(); - } else if (ptasks == 0) { - comp_promise.setValue(); - } - } - - void run_task() { - ++issued_tasks; - ++pending_tasks; - iomanager.run_on_forget(iomgr::reactor_regex::random_worker, task); - } -}; - -struct Waiter { - std::atomic< uint64_t > expected_comp{0}; - std::atomic< uint64_t > actual_comp{0}; - folly::Promise< folly::Unit > comp_promise; - - Waiter(uint64_t num_op) : expected_comp{num_op} {} - Waiter() : 
Waiter{SISL_OPTIONS["num_io"].as< uint64_t >()} {} - - folly::Future< folly::Unit > start(std::function< void(void) > f) { - f(); - return comp_promise.getFuture(); - } - - void one_complete() { - if ((actual_comp.fetch_add(1) + 1) >= expected_comp.load()) { comp_promise.setValue(); } - } -}; - struct test_repl_req : public repl_req_ctx { sisl::byte_array header; sisl::byte_array key; @@ -174,16 +117,33 @@ class SoloReplDevTest : public testing::Test { void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, cintrusive< repl_req_ctx >& ctx) override {} - blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, cintrusive< repl_req_ctx >& ctx) override { + blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header, uint32_t data_size) override { return blk_alloc_hints{}; } void on_replica_stop() override {} }; + class Application : public ReplApplication { + private: + SoloReplDevTest& m_test; + + public: + Application(SoloReplDevTest& test) : m_test{test} {} + virtual ~Application() = default; + + repl_impl_type get_impl_type() const override { return repl_impl_type::solo; } + bool need_timeline_consistency() const { return true; } + shared< ReplDevListener > create_repl_dev_listener(uuid_t) override { + return std::make_shared< Listener >(m_test); + } + std::pair< std::string, uint16_t > lookup_peer(uuid_t uuid) const override { return std::make_pair("", 0u); } + replica_id_t get_my_repl_id() const override { return hs_utils::gen_random_uuid(); } + }; + protected: - Runner m_io_runner; - Waiter m_task_waiter; + test_common::Runner m_io_runner; + test_common::Waiter m_task_waiter; shared< ReplDev > m_repl_dev1; shared< ReplDev > m_repl_dev2; uuid_t m_uuid1; @@ -194,15 +154,13 @@ class SoloReplDevTest : public testing::Test { test_common::HSTestHelper::start_homestore( "test_solo_repl_dev", {{HS_SERVICE::META, {.size_pct = 5.0}}, - {HS_SERVICE::REPLICATION, {.size_pct = 60.0, .repl_impl = repl_impl_type::solo}}, + {HS_SERVICE::REPLICATION, {.size_pct = 60.0, .repl_app = std::make_unique< Application >(*this)}}, {HS_SERVICE::LOG_REPLICATED, {.size_pct = 20.0}}, {HS_SERVICE::LOG_LOCAL, {.size_pct = 2.0}}}); m_uuid1 = hs_utils::gen_random_uuid(); m_uuid2 = hs_utils::gen_random_uuid(); - m_repl_dev1 = - hs()->repl_service().create_repl_dev(m_uuid1, {}, std::make_unique< Listener >(*this)).get().value(); - m_repl_dev2 = - hs()->repl_service().create_repl_dev(m_uuid2, {}, std::make_unique< Listener >(*this)).get().value(); + m_repl_dev1 = hs()->repl_service().create_repl_dev(m_uuid1, {}).get().value(); + m_repl_dev2 = hs()->repl_service().create_repl_dev(m_uuid2, {}).get().value(); } virtual void TearDown() override { @@ -217,14 +175,10 @@ class SoloReplDevTest : public testing::Test { test_common::HSTestHelper::start_homestore( "test_solo_repl_dev", - {{HS_SERVICE::REPLICATION, {.repl_impl = repl_impl_type::solo}}, + {{HS_SERVICE::REPLICATION, {.repl_app = std::make_unique< Application >(*this)}}, {HS_SERVICE::LOG_REPLICATED, {}}, {HS_SERVICE::LOG_LOCAL, {}}}, - [this]() { - hs()->repl_service().open_repl_dev(m_uuid1, std::make_unique< Listener >(*this)); - hs()->repl_service().open_repl_dev(m_uuid2, std::make_unique< Listener >(*this)); - }, - true /* restart */); + nullptr, true /* restart */); m_repl_dev1 = hs()->repl_service().get_repl_dev(m_uuid1).value(); m_repl_dev2 = hs()->repl_service().get_repl_dev(m_uuid2).value(); @@ -233,7 +187,7 @@ class SoloReplDevTest : public testing::Test { void write_io(uint32_t key_size, uint64_t data_size, uint32_t 
max_size_per_iov) { auto req = intrusive< test_repl_req >(new test_repl_req()); req->header = sisl::make_byte_array(sizeof(test_repl_req::journal_header)); - auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes); + auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); hdr->key_size = key_size; hdr->key_pattern = ((long long)rand() << 32) | rand(); hdr->data_size = data_size; @@ -241,11 +195,11 @@ class SoloReplDevTest : public testing::Test { if (key_size != 0) { req->key = sisl::make_byte_array(key_size); - HSTestHelper::fill_data_buf(req->key->bytes, key_size, hdr->key_pattern); + HSTestHelper::fill_data_buf(req->key->bytes(), key_size, hdr->key_pattern); } if (data_size != 0) { - req->write_sgs = HSTestHelper::create_sgs(data_size, g_block_size, max_size_per_iov, hdr->data_pattern); + req->write_sgs = HSTestHelper::create_sgs(data_size, max_size_per_iov, hdr->data_pattern); } auto& rdev = (rand() % 2) ? m_repl_dev1 : m_repl_dev2; @@ -258,12 +212,12 @@ class SoloReplDevTest : public testing::Test { void validate_replay(ReplDev& rdev, int64_t lsn, sisl::blob const& header, sisl::blob const& key, MultiBlkId const& blkids) { - auto jhdr = r_cast< test_repl_req::journal_header* >(header.bytes); - HSTestHelper::validate_data_buf(key.bytes, key.size, jhdr->key_pattern); + auto const jhdr = r_cast< test_repl_req::journal_header const* >(header.cbytes()); + HSTestHelper::validate_data_buf(key.cbytes(), key.size(), jhdr->key_pattern); uint32_t size = blkids.blk_count() * g_block_size; if (size) { - auto read_sgs = HSTestHelper::create_sgs(size, g_block_size, size); + auto read_sgs = HSTestHelper::create_sgs(size, size); LOGDEBUG("[{}] Validating replay of lsn={} blkid = {}", boost::uuids::to_string(rdev.group_id()), lsn, blkids.to_string()); rdev.async_read(blkids, read_sgs, size) @@ -287,7 +241,7 @@ class SoloReplDevTest : public testing::Test { void on_write_complete(ReplDev& rdev, intrusive< test_repl_req > req) { // If we did send some data to the repl_dev, validate it by doing async_read if (req->write_sgs.size != 0) { - req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, g_block_size, req->write_sgs.size); + req->read_sgs = HSTestHelper::create_sgs(req->write_sgs.size, req->write_sgs.size); auto const cap = hs()->repl_service().get_cap_stats(); LOGDEBUG("Write complete with cap stats: used={} total={}", cap.used_capacity, cap.total_capacity); @@ -299,7 +253,7 @@ class SoloReplDevTest : public testing::Test { LOGDEBUG("[{}] Write complete with lsn={} for size={} blkids={}", boost::uuids::to_string(rdev.group_id()), req->get_lsn(), req->write_sgs.size, req->written_blkids.to_string()); - auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes); + auto hdr = r_cast< test_repl_req::journal_header* >(req->header->bytes()); HS_REL_ASSERT_EQ(hdr->data_size, req->read_sgs.size, "journal hdr data size mismatch with actual size"); @@ -342,8 +296,6 @@ TEST_F(SoloReplDevTest, TestHeaderOnly) { } SISL_OPTION_GROUP(test_solo_repl_dev, - (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"), - "number"), (block_size, "", "block_size", "block size to io", ::cxxopts::value< uint32_t >()->default_value("4096"), "number"));
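One pattern worth noting across the option-group hunks above: test_data_service, test_journal_vdev, test_meta_blk_mgr and test_solo_repl_dev all drop their private num_io definitions, yet the tests still read SISL_OPTIONS["num_io"] (for example, the Runner default constructor, now moved into test_common). That only resolves if num_io is defined once in the shared test_common_setup group that each main() loads via SISL_OPTIONS_ENABLE. A minimal sketch of what that consolidated definition could look like, assuming it lives in test_common/homestore_test_common.hpp (that header is not part of this diff, so the placement below is an assumption; the description and default are copied from the removed per-test options):

// Sketch only: one shared definition replacing the per-test num_io options removed above.
SISL_OPTION_GROUP(test_common_setup,
                  (num_io, "", "num_io", "number of io", ::cxxopts::value< uint64_t >()->default_value("300"),
                   "number"))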