From 6db84f2a7c3fe7b6caabd5b7a697c6edc3e5d2dd Mon Sep 17 00:00:00 2001 From: Harihara Kadayam Date: Mon, 18 Sep 2023 14:08:30 -0700 Subject: [PATCH] MultiBlkId for all vdev and services to avoid caller to use std::vector (#166) * All services exposes default std::vector< BlkId > for allocating and writing blocks. This open ended structure has difficulty for upper layers to serialize. Hence introducing MultiBlkId which has tigther structure so upper layers can serialize and use in-place of BlkId. Existing BlkId can still be used and some services also provide backward compatibile std::vector< BlkId > --- conanfile.py | 4 +- src/include/homestore/blk.h | 272 +++++--- src/include/homestore/blkdata_service.hpp | 32 +- src/include/homestore/homestore_decl.hpp | 12 +- src/include/homestore/logstore_service.hpp | 2 +- src/include/homestore/meta_service.hpp | 2 +- .../homestore/replication/repl_decls.h | 44 ++ src/include/homestore/replication/repl_dev.h | 132 ++++ src/include/homestore/replication_service.hpp | 52 ++ src/include/homestore/vchunk.h | 2 +- src/lib/blkalloc/append_blk_allocator.cpp | 42 +- src/lib/blkalloc/append_blk_allocator.h | 24 +- src/lib/blkalloc/blk.cpp | 176 +++-- src/lib/blkalloc/blk_allocator.cpp | 200 ++++-- src/lib/blkalloc/blk_allocator.h | 92 ++- src/lib/blkalloc/blk_cache.h | 37 +- src/lib/blkalloc/blk_cache_queue.cpp | 50 +- src/lib/blkalloc/blk_cache_queue.h | 58 +- src/lib/blkalloc/fixed_blk_allocator.cpp | 36 +- src/lib/blkalloc/varsize_blk_allocator.cpp | 620 ++++++++++-------- src/lib/blkalloc/varsize_blk_allocator.h | 88 +-- src/lib/blkdata_svc/blk_read_tracker.cpp | 24 +- src/lib/blkdata_svc/blk_read_tracker.hpp | 4 +- src/lib/blkdata_svc/blkdata_service.cpp | 174 +++-- src/lib/device/journal_vdev.cpp | 18 +- src/lib/device/journal_vdev.hpp | 16 +- src/lib/device/physical_dev.cpp | 109 ++- src/lib/device/physical_dev.hpp | 31 +- src/lib/device/vchunk.cpp | 2 +- src/lib/device/virtual_dev.cpp | 226 ++++--- src/lib/device/virtual_dev.hpp | 74 ++- src/lib/homestore.cpp | 13 +- src/lib/index/wb_cache.cpp | 6 +- src/lib/logstore/log_store_service.cpp | 2 +- src/lib/meta/meta_blk_service.cpp | 33 +- src/lib/meta/meta_sb.hpp | 26 +- src/lib/replication/repl_service.cpp | 122 ++++ src/tests/CMakeLists.txt | 17 +- src/tests/test_append_blkalloc.cpp | 73 +-- src/tests/test_blk_cache_queue.cpp | 2 +- src/tests/test_blkalloc.cpp | 28 +- src/tests/test_blkid.cpp | 178 +++++ src/tests/test_data_service.cpp | 155 ++--- 43 files changed, 2078 insertions(+), 1232 deletions(-) create mode 100644 src/include/homestore/replication/repl_decls.h create mode 100644 src/include/homestore/replication/repl_dev.h create mode 100644 src/include/homestore/replication_service.hpp create mode 100644 src/lib/replication/repl_service.cpp create mode 100644 src/tests/test_blkid.cpp diff --git a/conanfile.py b/conanfile.py index 753ce2458..d4bb59224 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.2.2" + version = "4.3.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -55,7 +55,7 @@ def build_requirements(self): self.build_requires("gtest/1.14.0") def requirements(self): - self.requires("iomgr/[~=9, include_prerelease=True]@oss/master") + self.requires("iomgr/[~=10, include_prerelease=True]@oss/master") self.requires("sisl/[~=10, include_prerelease=True]@oss/master") self.requires("farmhash/cci.20190513@") diff --git a/src/include/homestore/blk.h 
b/src/include/homestore/blk.h index 14283cd3e..c0c8b25c7 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -24,122 +24,162 @@ #include #include +#include #include +#include #include namespace homestore { -typedef uint32_t blk_num_t; -typedef blk_num_t blk_cap_t; -static_assert(sizeof(blk_num_t) == (BLK_NUM_BITS - 1) / 8 + 1, "Expected blk_num_t to matching BLK_NUM_BITS"); +using chunk_num_t = uint16_t; +using blk_count_t = uint16_t; +using blk_num_t = uint32_t; +using blk_temp_t = uint16_t; -typedef uint8_t blk_count_serialized_t; -typedef uint16_t blk_count_t; -static_assert(sizeof(blk_count_serialized_t) == (NBLKS_BITS - 1) / 8 + 1, - "Expected blk_count_t to matching NBLKS_BITS"); +static constexpr size_t max_addressable_chunks() { return 1UL << (8 * sizeof(chunk_num_t)); } +static constexpr size_t max_blks_per_chunk() { return 1UL << (8 * sizeof(blk_num_t)); } +static constexpr size_t max_blks_per_blkid() { return (1UL << (8 * sizeof(blk_count_t))) - 1; } -typedef uint8_t chunk_num_t; -static_assert(sizeof(chunk_num_t) == (CHUNK_NUM_BITS - 1) / 8 + 1, "Expected blk_count_t to matching CHUNK_NUM_BITS"); - -typedef uint8_t blk_temp_t; - -/* This structure represents the application wide unique block number. It also encomposses the number of blks. */ +#pragma pack(1) struct BlkId { -private: - static constexpr uint64_t s_blk_num_mask{(static_cast< uint64_t >(1) << BLK_NUM_BITS) - 1}; - static constexpr uint64_t s_nblks_mask{(static_cast< uint64_t >(1) << NBLKS_BITS) - 1}; - static constexpr uint64_t s_chunk_num_mask{(static_cast< uint64_t >(1) << CHUNK_NUM_BITS) - 1}; +protected: + struct serialized { + blk_num_t m_is_multi : 1; // Is it a part of multi blkid or not + blk_num_t m_blk_num : 31; // Block number which is unique within the chunk + blk_count_t m_nblks; // Number of blocks+1 for this blkid, don't directly acccess this - use blk_count() + chunk_num_t m_chunk_num; // Chunk number - which is unique for the entire application -public: - static constexpr blk_count_t max_blks_in_op() { return (1 << NBLKS_BITS); } - static constexpr uint64_t max_id_int() { return (1ull << (BLK_NUM_BITS + NBLKS_BITS + CHUNK_NUM_BITS)) - 1; } + serialized() : m_is_multi{0}, m_blk_num{0}, m_nblks{0}, m_chunk_num{0} {} + serialized(bool is_multi, blk_num_t blk_num, blk_count_t nblks, chunk_num_t cnum) : + m_is_multi{is_multi ? 
0x1u : 0x0u}, m_blk_num{blk_num}, m_nblks{nblks}, m_chunk_num{cnum} {} + }; + static_assert(sizeof(serialized) == sizeof(uint64_t), "Expected serialized size to 64 bits"); - static int compare(const BlkId& one, const BlkId& two); - uint64_t to_integer() const; + serialized s; +public: + BlkId() = default; explicit BlkId(uint64_t id_int); - BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num = 0); - BlkId() { invalidate(); } - BlkId(const BlkId&) = default; - BlkId& operator=(const BlkId&) = default; + BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num); + BlkId(BlkId const&) = default; + BlkId& operator=(BlkId const&) = default; BlkId(BlkId&&) noexcept = default; BlkId& operator=(BlkId&&) noexcept = default; - bool operator==(const BlkId& other) const { return (compare(*this, other) == 0); } - bool operator>(const BlkId& other) const { return (compare(*this, other) > 0); } - bool operator<(const BlkId& other) const { return (compare(*this, other) < 0); } + + bool operator==(BlkId const& other) const { return (compare(*this, other) == 0); } + bool operator>(BlkId const& other) const { return (compare(*this, other) > 0); } + bool operator<(BlkId const& other) const { return (compare(*this, other) < 0); } + + blk_num_t blk_num() const { return s.m_blk_num; } + blk_count_t blk_count() const { return s.m_nblks; } + chunk_num_t chunk_num() const { return s.m_chunk_num; } + bool is_multi() const { return s.m_is_multi; } void invalidate(); + uint64_t to_integer() const; + sisl::blob serialize(); // TODO: Consider making this const, perhaps returns const uint8_t version of blob + void deserialize(sisl::blob const& b, bool copy); + uint32_t serialized_size() const; + std::string to_string() const; bool is_valid() const; - BlkId get_blkid_at(uint32_t offset, uint32_t pagesz) const; - BlkId get_blkid_at(uint32_t offset, uint32_t size, uint32_t pagesz) const; - - void set(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num = 0); - void set(const BlkId& bid); - void set(uint64_t id_int); + static int compare(BlkId const& one, BlkId const& two); +}; +#pragma pack() - void set_blk_num(blk_num_t blk_num); - blk_num_t get_blk_num() const { return m_blk_num; } - // last blk num is the last blk num that belongs this blkid; - blk_num_t get_last_blk_num() const { return get_blk_num() + get_nblks() - 1; } +#pragma pack(1) +struct MultiBlkId : public BlkId { + static constexpr uint32_t max_addln_pieces{5}; + static constexpr uint32_t max_pieces{max_addln_pieces + 1}; - void set_nblks(blk_count_t nblks); - blk_count_t get_nblks() const { return static_cast< blk_count_t >(m_nblks) + 1; } +private: + struct chain_blkid { + blk_num_t m_blk_num; + blk_count_t m_nblks{0}; - void set_chunk_num(const chunk_num_t chunk_num); - chunk_num_t get_chunk_num() const { return m_chunk_num; } + bool is_valid() const { return (m_nblks != 0); } + }; - /* A blkID represent a page size which is assigned to a blk allocator */ - uint32_t data_size(const uint32_t page_size) const { return (get_nblks() * page_size); } + uint16_t n_addln_piece{0}; + std::array< chain_blkid, max_addln_pieces > addln_pieces; +public: + MultiBlkId(); + MultiBlkId(BlkId const& b); + MultiBlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num); + MultiBlkId(MultiBlkId const&) = default; + MultiBlkId& operator=(MultiBlkId const&) = default; + MultiBlkId(MultiBlkId&&) noexcept = default; + MultiBlkId& operator=(MultiBlkId&&) noexcept = default; + + void add(blk_num_t blk_num, blk_count_t nblks, chunk_num_t 
chunk_num); + void add(BlkId const&); + + uint16_t num_pieces() const; + blk_count_t blk_count() const; std::string to_string() const; - blk_num_t m_blk_num; // Block number which is unique within the chunk - blk_count_serialized_t m_nblks; // Number of blocks+1 for this blkid, don't directly acccess this - use get_nblks() - chunk_num_t m_chunk_num; // Chunk number - which is unique for the entire application -} __attribute__((__packed__)); - -VENUM(BlkAllocStatus, uint32_t, - BLK_ALLOC_NONE = 0, // No Action taken - SUCCESS = 1ul << 0, // Success - FAILED = 1ul << 1, // Failed to alloc/free - REQ_MORE = 1ul << 2, // Indicate that we need more - SPACE_FULL = 1ul << 3, // Space is full - INVALID_DEV = 1ul << 4, // Invalid Device provided for alloc - PARTIAL = 1ul << 5, // In case of multiple blks, only partial is alloced/freed - INVALID_THREAD = 1ul << 6 // Not possible to alloc in this thread -); - -static_assert(sizeof(BlkId) < 8); -#pragma pack(1) -struct BlkId8_t : public BlkId { - uint8_t pad[8 - sizeof(BlkId)]{}; - - BlkId8_t& operator=(const BlkId& rhs) { - BlkId::operator=(rhs); - return *this; - } + bool operator==(MultiBlkId const& other) const { return (compare(*this, other) == 0); } + bool operator>(MultiBlkId const& other) const { return (compare(*this, other) > 0); } + bool operator<(MultiBlkId const& other) const { return (compare(*this, other) < 0); } + + sisl::blob serialize(); + uint32_t serialized_size() const; + void deserialize(sisl::blob const& b, bool copy); + + bool has_room() const; + BlkId to_single_blkid() const; + + static int compare(MultiBlkId const& one, MultiBlkId const& two); + + struct iterator { + MultiBlkId const& mbid_; + uint16_t next_blk_{0}; + + iterator(MultiBlkId const& mb) : mbid_{mb} {} + std::optional< BlkId > next() { + if (next_blk_ == 0) { + auto bid = r_cast< BlkId const& >(mbid_); + ++next_blk_; + return (bid.is_valid()) ? 
std::make_optional(bid) : std::nullopt; + } else if (next_blk_ < mbid_.num_pieces()) { + auto cbid = mbid_.addln_pieces[next_blk_ - 1]; + ++next_blk_; + return std::make_optional(BlkId{cbid.m_blk_num, cbid.m_nblks, mbid_.chunk_num()}); + } else { + return std::nullopt; + } + } + }; + + iterator iterate() const; }; #pragma pack() -static_assert(sizeof(BlkId8_t) == 8); -inline blk_num_t begin_of(const BlkId& blkid) { return blkid.get_blk_num(); } -inline blk_num_t end_of(const BlkId& blkid) { return blkid.get_blk_num() + blkid.get_nblks(); } -inline size_t hash_value(const BlkId& blkid) { return std::hash< uint64_t >()(blkid.to_integer()); } } // namespace homestore -// hash function definitions +///////////////////// hash function definitions ///////////////////// namespace std { template <> struct hash< homestore::BlkId > { - typedef homestore::BlkId argument_type; - typedef size_t result_type; - result_type operator()(const argument_type& bid) const noexcept { - return std::hash< uint64_t >()(bid.to_integer()); + size_t operator()(const homestore::BlkId& bid) const noexcept { return std::hash< uint64_t >()(bid.to_integer()); } +}; + +template <> +struct hash< homestore::MultiBlkId > { + size_t operator()(const homestore::MultiBlkId& mbid) const noexcept { + static constexpr size_t s_start_seed = 0xB504F333; + size_t seed = s_start_seed; + auto it = mbid.iterate(); + while (auto b = it.next()) { + boost::hash_combine(seed, b->to_integer()); + } + return seed; } }; } // namespace std +///////////////////// formatting definitions ///////////////////// template < typename T > struct fmt::formatter< T, std::enable_if_t< std::is_base_of< homestore::BlkId, T >::value, char > > : fmt::formatter< std::string > { @@ -148,10 +188,25 @@ struct fmt::formatter< T, std::enable_if_t< std::is_base_of< homestore::BlkId, T } }; -namespace homestore { +template < typename T > +struct fmt::formatter< T, std::enable_if_t< std::is_base_of< homestore::MultiBlkId, T >::value, char > > + : fmt::formatter< std::string > { + auto format(const homestore::MultiBlkId& a, format_context& ctx) const { + return fmt::formatter< std::string >::format(a.to_string(), ctx); + } +}; -template < typename charT, typename traits > -std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, traits >& outStream, const BlkId& blk) { +namespace boost { +template <> +struct hash< homestore::BlkId > { + size_t operator()(const homestore::BlkId& bid) const noexcept { return std::hash< homestore::BlkId >()(bid); } +}; +} // namespace boost + +namespace homestore { +///////////////////// stream operation definitions ///////////////////// +template < typename charT, typename traits, typename blkidT > +std::basic_ostream< charT, traits >& stream_op(std::basic_ostream< charT, traits >& outStream, blkidT const& blk) { // copy the stream formatting std::basic_ostringstream< charT, traits > outStringStream; outStringStream.copyfmt(outStream); @@ -163,27 +218,40 @@ std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, trait return outStream; } -/* Hints for various allocators */ +template < typename charT, typename traits > +std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, traits >& outStream, BlkId const& blk) { + return stream_op< charT, traits, BlkId >(outStream, blk); +} + +template < typename charT, typename traits > +std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, traits >& outStream, MultiBlkId const& blk) { + return stream_op< charT, traits, 
MultiBlkId >(outStream, blk); +} + +///////////////////// Other common Blkd definitions ///////////////////// +VENUM(BlkAllocStatus, uint32_t, + BLK_ALLOC_NONE = 0, // No Action taken + SUCCESS = 1ul << 0, // Success + FAILED = 1ul << 1, // Failed to alloc/free + REQ_MORE = 1ul << 2, // Indicate that we need more + SPACE_FULL = 1ul << 3, // Space is full + INVALID_DEV = 1ul << 4, // Invalid Device provided for alloc + PARTIAL = 1ul << 5, // In case of multiple blks, only partial is alloced/freed + INVALID_THREAD = 1ul << 6, // Not possible to alloc in this thread + INVALID_INPUT = 1ul << 7, // Invalid input + TOO_MANY_PIECES = 1ul << 8 // Allocation results in more pieces than passed on +); + struct blk_alloc_hints { - blk_alloc_hints() : - desired_temp{0}, - dev_id_hint{INVALID_DEV_ID}, - can_look_for_other_chunk{true}, - is_contiguous{false}, - multiplier{1}, - max_blks_per_entry{BlkId::max_blks_in_op()}, - stream_info{(uintptr_t) nullptr} {} - - blk_temp_t desired_temp; // Temperature hint for the device - uint32_t dev_id_hint; // which physical device to pick (hint if any) -1 for don't care - bool can_look_for_other_chunk; // If alloc on device not available can I pick other device - bool is_contiguous; - uint32_t multiplier; // blks allocated in a blkid should be a multiple of multiplier - uint32_t max_blks_per_entry; // Number of blks on every entry - uintptr_t stream_info; -#ifdef _PRERELEASE - bool error_simulate = false; // can error simulate happen -#endif + blk_temp_t desired_temp{0}; // Temperature hint for the device + std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care + std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation + std::optional< stream_id_t > stream_id_hint; // any specific stream to pick + bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device + bool is_contiguous{true}; // Should the entire allocation be one contiguous block + bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? Mutually exclusive with is_contiguous + uint32_t min_blks_per_piece{1}; // blks allocated in a blkid should be atleast this size per entry + uint32_t max_blks_per_piece{max_blks_per_blkid()}; // Number of blks on every entry }; } // namespace homestore diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index d82eaf7dd..3cb2987ca 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -56,7 +56,7 @@ class BlkDataService { * * @param vb : vdev info blk containing the details of this blkstore */ - shared< VirtualDev > open_vdev(const vdev_info& vinfo, bool load_existing); + shared< VirtualDev > open_vdev(vdev_info const& vinfo, bool load_existing); /** * @brief : asynchronous write without input block ids. 
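A minimal sketch of how a caller can use the new MultiBlkId and blk_alloc_hints declared above: it builds a MultiBlkId, adds a second piece from the same chunk, walks the pieces, and round-trips the fixed-size serialization. The include path, function name, block numbers and chunk id are illustrative assumptions, not part of the patch.

    #include <cassert>
    #include <string>
    #include <homestore/blk.h> // assumed public header path

    void multi_blkid_sketch() {
        using namespace homestore;

        MultiBlkId mbid{100 /* blk_num */, 4 /* nblks */, 1 /* chunk_num */};
        if (mbid.has_room()) {
            mbid.add(200, 2, 1); // every piece must come from the same chunk
        }

        // Walk each piece as a plain BlkId
        auto it = mbid.iterate();
        while (auto const b = it.next()) {
            std::string piece = b->to_string(); // "BlkNum=100 nblks=4 chunk=1", then "BlkNum=200 nblks=2 chunk=1"
        }

        // Fixed-size, in-place serialization in place of std::vector< BlkId >
        sisl::blob raw = mbid.serialize();
        MultiBlkId copy;
        copy.deserialize(raw, true /* copy */);
        assert(copy.blk_count() == mbid.blk_count()); // 6 blks across 2 pieces

        // Allocation hints now carry optional pdev/chunk/stream ids and per-piece limits
        blk_alloc_hints hints;
        hints.is_contiguous = false;
        hints.max_blks_per_piece = 256;
    }
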
Block ids will be allocated by this api and returned; @@ -67,9 +67,11 @@ class BlkDataService { * @param cb : callback that will be triggered after write completes; * @param part_of_batch : is this write part of a batch; */ - folly::Future< bool > async_alloc_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids, bool part_of_batch = false); + folly::Future< std::error_code > async_alloc_write(sisl::sg_list const& sgs, blk_alloc_hints const& hints, + MultiBlkId& out_blkids, bool part_of_batch = false); + folly::Future< std::error_code > async_write(const char* buf, uint32_t size, MultiBlkId const& bid, + bool part_of_batch); /** * @brief : asynchronous write with input block ids; * @@ -79,8 +81,11 @@ class BlkDataService { * @param cb : callback that will be triggered after write completes * @param part_of_batch : is this write part of a batch; */ - folly::Future< bool > async_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - const std::vector< BlkId >& in_blkids, bool part_of_batch = false); + folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, MultiBlkId const& in_blkids, + bool part_of_batch = false); + + folly::Future< std::error_code > async_read(MultiBlkId const& bid, uint8_t* buf, uint32_t size, + bool part_of_batch = false); /** * @brief : asynchronous read @@ -91,14 +96,15 @@ class BlkDataService { * @param cb : callback that will be triggered after read completes * @param part_of_batch : is this read part of batch; */ - folly::Future< bool > async_read(const BlkId& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false); + folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch = false); /** * @brief : commit a block, usually called during recovery * * @param bid : block id to commit; */ - void commit_blk(const BlkId& bid); + void commit_blk(MultiBlkId const& bid); /** * @brief : alloc blocks based on input size; @@ -116,14 +122,14 @@ class BlkDataService { * @param bid : the block id to free * @param cb : the callback that will be triggered after free block completes; */ - folly::Future< bool > async_free_blk(const BlkId bid); + folly::Future< std::error_code > async_free_blk(MultiBlkId const& bid); /** - * @brief : get the page size of this data service; + * @brief : get the blk size of this data service; * - * @return : page size + * @return : blk size */ - uint32_t get_page_size() const { return m_page_size; } + uint32_t get_blk_size() const { return m_blk_size; } /** * @brief : get the read block tracker handle; @@ -138,7 +144,7 @@ class BlkDataService { void start(); private: - BlkAllocStatus alloc_blks(uint32_t size, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkids); + BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, MultiBlkId& out_blkids); void init(); @@ -147,7 +153,7 @@ class BlkDataService { private: std::shared_ptr< VirtualDev > m_vdev; std::unique_ptr< BlkReadTracker > m_blk_read_tracker; - uint32_t m_page_size; + uint32_t m_blk_size; }; extern BlkDataService& data_service(); diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 8dd859556..2b227d08b 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -49,7 +50,10 @@ template < typename T > using cshared = const std::shared_ptr< T >; template < typename T > -using 
unique = const std::unique_ptr< T >; +using unique = std::unique_ptr< T >; + +template < typename T > +using intrusive = boost::intrusive_ptr< T >; ////////////// All Size Limits /////////////////// constexpr uint32_t BLK_NUM_BITS{32}; @@ -145,12 +149,17 @@ struct HS_SERVICE { static constexpr uint32_t LOG_LOCAL = 1 << 2; static constexpr uint32_t DATA = 1 << 3; static constexpr uint32_t INDEX = 1 << 4; + static constexpr uint32_t REPLICATION = 1 << 5; uint32_t svcs; HS_SERVICE() : svcs{META} {} HS_SERVICE(uint32_t val) : svcs{val} { svcs |= META; // Force meta to be present always + if (svcs & REPLICATION) { + svcs |= LOG_REPLICATED | LOG_LOCAL; + svcs &= ~DATA; // ReplicationDataSvc or DataSvc only one of them + } } std::string list() const { @@ -160,6 +169,7 @@ struct HS_SERVICE { if (svcs & INDEX) { str += "index,"; } if (svcs & LOG_REPLICATED) { str += "log_replicated,"; } if (svcs & LOG_LOCAL) { str += "log_local,"; } + if (svcs & REPLICATION) { str += "replication,"; } return str; } }; diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index f7eda0035..3f1d62958 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -135,7 +135,7 @@ class LogStoreService { void device_truncate(const device_truncate_cb_t& cb = nullptr, const bool wait_till_done = false, const bool dry_run = false); - folly::Future< bool > create_vdev(uint64_t size, logstore_family_id_t family); + folly::Future< std::error_code > create_vdev(uint64_t size, logstore_family_id_t family); shared< VirtualDev > open_vdev(const vdev_info& vinfo, logstore_family_id_t family, bool load_existing); shared< JournalVirtualDev > get_vdev(logstore_family_id_t family) const { return (family == DATA_LOG_FAMILY_IDX) ? 
m_data_logdev_vdev : m_ctrl_logdev_vdev; diff --git a/src/include/homestore/meta_service.hpp b/src/include/homestore/meta_service.hpp index 3ab8e1cf6..b6b5c9b4c 100644 --- a/src/include/homestore/meta_service.hpp +++ b/src/include/homestore/meta_service.hpp @@ -265,7 +265,7 @@ class MetaBlkService { * */ void alloc_meta_blk(BlkId& bid); - void alloc_meta_blk(uint64_t size, std::vector< BlkId >& bid); + void alloc_meta_blks(uint64_t size, std::vector< BlkId >& bid); void free_meta_blk(meta_blk* mblk); diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h new file mode 100644 index 000000000..1c659d6ad --- /dev/null +++ b/src/include/homestore/replication/repl_decls.h @@ -0,0 +1,44 @@ +#pragma once +#include +#include + +#include +#include +#include +#include +#include +#include + +SISL_LOGGING_DECL(replication) + +#define REPL_LOG_MODS grpc_server, HOMESTORE_LOG_MODS, nuraft_mesg, nuraft, replication + +namespace homestore { +using blkid_list_t = folly::small_vector< BlkId, 4 >; + +// Fully qualified domain pba, unique pba id across replica set +struct RemoteBlkId { + RemoteBlkId(uint32_t s, const BlkId& b) : server_id{s}, blkid{b} {} + uint32_t server_id; + BlkId blkid; + + bool operator==(RemoteBlkId const& o) const { return (server_id == o.server_id) && (blkid == o.blkid); } +}; + +using remote_blkid_list_t = folly::small_vector< RemoteBlkId, 4 >; + +// data service api names +static std::string const SEND_DATA{"send_data"}; +static std::string const FETCH_DATA{"fetch_data"}; + +} // namespace homestore + +// hash function definitions +namespace std { +template <> +struct hash< homestore::RemoteBlkId > { + size_t operator()(homestore::RemoteBlkId const& fqbid) const noexcept { + return std::hash< uint64_t >()(fqbid.server_id) + std::hash< uint64_t >()(fqbid.blkid.to_integer()); + } +}; +} // namespace std diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h new file mode 100644 index 000000000..6b459d10a --- /dev/null +++ b/src/include/homestore/replication/repl_dev.h @@ -0,0 +1,132 @@ +#pragma once + +#include +#include + +#include + +namespace home_replication { + +// +// Callbacks to be implemented by ReplDev users. +// +class ReplDevListener { +public: + virtual ~ReplDevListener() = default; + + /// @brief Called when the log entry has been committed in the replica set. + /// + /// This function is called from a dedicated commit thread which is different from the original thread calling + /// replica_set::write(). There is only one commit thread, and lsn is guaranteed to be monotonically increasing. + /// + /// @param lsn - The log sequence number + /// @param header - Header originally passed with replica_set::write() api + /// @param key - Key originally passed with replica_set::write() api + /// @param blkids - List of blkids where data is written to the storage engine. + /// @param ctx - User contenxt passed as part of the replica_set::write() api + /// + virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, blkid_list_t const& blkids, + void* ctx) = 0; + + /// @brief Called when the log entry has been received by the replica dev. + /// + /// On recovery, this is called from a random worker thread before the raft server is started. It is + /// guaranteed to be serialized in log index order. + /// + /// On the leader, this is called from the same thread that replica_set::write() was called. 
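The std::hash specialization for RemoteBlkId above is what allows it to key the standard unordered containers. A minimal sketch, assuming the header path homestore/replication/repl_decls.h; the map and function names are illustrative:

    #include <unordered_map>
    #include <homestore/replication/repl_decls.h> // assumed public header path

    // Map a fully-qualified remote (server_id, blkid) pair to the locally written BlkId.
    std::unordered_map< homestore::RemoteBlkId, homestore::BlkId > remote_to_local;

    void remember_remote_blk(uint32_t server_id, homestore::BlkId const& remote_bid,
                             homestore::BlkId const& local_bid) {
        remote_to_local.emplace(homestore::RemoteBlkId{server_id, remote_bid}, local_bid);
    }
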
+ /// + /// On the follower, this is called when the follower has received the log entry. It is guaranteed to be serialized + /// in log sequence order. + /// + /// NOTE: Listener can choose to ignore this pre commit, however, typical use case of maintaining this is in-case + /// replica set needs to support strong consistent reads and follower needs to ignore any keys which are not being + /// currently in pre-commit, but yet to be committed. + /// + /// @param lsn - The log sequence number + /// @param header - Header originally passed with repl_dev::write() api + /// @param key - Key originally passed with repl_dev::write() api + /// @param ctx - User contenxt passed as part of the repl_dev::write() api + virtual void on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, void* ctx) = 0; + + /// @brief Called when the log entry has been rolled back by the replica set. + /// + /// This function is called on followers only when the log entry is going to be overwritten. This function is called + /// from a random worker thread, but is guaranteed to be serialized. + /// + /// For each log index, it is guaranteed that either on_commit() or on_rollback() is called but not both. + /// + /// NOTE: Listener should do the free any resources created as part of pre-commit. + /// + /// @param lsn - The log sequence number getting rolled back + /// @param header - Header originally passed with repl_dev::write() api + /// @param key - Key originally passed with repl_dev::write() api + /// @param ctx - User contenxt passed as part of the repl_dev::write() api + virtual void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, void* ctx) = 0; + + /// @brief Called when replication module is trying to allocate a block to write the value + /// + /// This function can be called both on leader and follower when it is trying to allocate a block to write the + /// value. Caller is expected to provide hints for allocation based on the header supplied as part of original + /// write. In cases where caller don't care about the hints can return default blk_alloc_hints. + /// + /// @param header Header originally passed with repl_dev::write() api on the leader + /// @return Expected to return blk_alloc_hints for this write + virtual blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header) = 0; + + /// @brief Called when the replica set is being stopped + virtual void on_replica_stop() = 0; +}; + +class ReplDev { +public: + virtual ~ReplDev() = default; + + /// @brief Replicate the data to the replica set. This method goes through the + /// following steps: + /// Step 1: Allocates blkid from the storage engine to write the value into. Storage + /// engine returns a blkid_list in cases where single contiguous blocks are not + /// available. For convenience, the comment will continue to refer blkid_list as blkids. + /// Step 2: Uses data channel to send the to all replicas + /// Step 3: Creates a log/journal entry with and calls nuraft to + /// append the entry and replicate using nuraft channel (also called header_channel). + /// Step 4: Writes the data into the allocated blk_id + /// + /// @param header - Blob representing the header (it is opaque and will be copied + /// as-is to the journal entry) + /// @param key - Blob representing the key (it is opaque and will be copied as-is to + /// the journal entry). We are tracking this seperately to support consistent read use + /// cases + /// @param value - vector of io buffers that contain value for the key. 
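Putting the listener callbacks above together, a skeletal implementation might look like the sketch below. It assumes blkid_list_t and blk_alloc_hints refer to the homestore types from the earlier headers; the class name and comments are illustrative:

    #include <homestore/replication/repl_dev.h> // assumed public header path

    class SampleListener : public home_replication::ReplDevListener {
    public:
        void on_pre_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, void* ctx) override {
            // Optionally stage the key so strongly consistent reads can skip entries not yet committed.
        }

        void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key,
                       homestore::blkid_list_t const& blkids, void* ctx) override {
            // Record lsn -> blkids in the application's index; invoked from the single commit thread.
        }

        void on_rollback(int64_t lsn, sisl::blob const& header, sisl::blob const& key, void* ctx) override {
            // Undo whatever on_pre_commit staged for this lsn; exactly one of commit/rollback fires per entry.
        }

        homestore::blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header) override {
            return homestore::blk_alloc_hints{}; // default hints: no particular placement preference
        }

        void on_replica_stop() override {}
    };
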
It is an optional field and if the value + /// list size is 0, then only key is written to replicadev without data. + /// @param user_ctx - User supplied opaque context which will be passed to listener + /// callbacks + virtual void async_alloc_write(const sisl::blob& header, const sisl::blob& key, const sisl::sg_list& value, + void* user_ctx) = 0; + + /// @brief Reads the data and returns a future to continue on + /// @param bid Block id to read + /// @param sgs Scatter gather buffer list to which blkids are to be read into + /// @param size Total size of the data read + /// @param part_of_batch Is read is part of a batch. If part of the batch, then submit_batch needs to be called at + /// the end + /// @return A Future with bool to notify if it has successfully read the data, raises the exception in case of + /// failure + virtual folly::Future< bool > async_read(const BlkId& bid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch = false); + + /// @brief After data is replicated and on_commit to the listener is called. the blkids can be freed. + /// + /// @param lsn - LSN of the old blkids that is being freed + /// @param blkids - blkids to be freed. + virtual void async_free_blks(int64_t lsn, const blkid_list_t& blkids) = 0; + + /// @brief Checks if this replica is the leader in this ReplDev + /// @return true or false + virtual bool is_leader() const = 0; + + /// @brief Gets the group_id this repldev is working for + /// @return group_id + virtual std::string group_id() const = 0; +}; + +} // namespace home_replication diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp new file mode 100644 index 000000000..cb27e9306 --- /dev/null +++ b/src/include/homestore/replication_service.hpp @@ -0,0 +1,52 @@ +#pragma once +#include +#include +#include +#include + +#include + +#include "repl_decls.h" +#include "repl_set.h" + +namespace nuraft { +class state_machine; +} + +namespace homestore { + +class ReplDev; +using ReplServiceError = nuraft::cmd_result_code; +using on_replica_dev_init_t = std::function< std::unique_ptr< ReplicaDevListener >(cshared< ReplDev >& rd) >; + +template < typename V, typename E > +using Result = folly::Expected< V, E >; + +template < class V, class E > +using AsyncResult = folly::SemiFuture< Result< V, E > >; + +template < class V > +using ReplResult = Result< V, ReplServiceError >; + +template < class V > +using ReplAsyncResult = AsyncResult< V, ReplServiceError >; + +class ReplicationService { +public: + ReplicationService() = default; + virtual ~ReplicationService() = default; + + // using set_var = std::variant< shared< ReplDev >, ReplServiceError >; + + /// Sync APIs + virtual shared< ReplDev > get_replica_dev(std::string const& group_id) const = 0; + virtual void iterate_replica_devs(std::function< void(cshared< ReplDev >&) > cb) const = 0; + + /// Async APIs + virtual ReplAsyncResult< shared< ReplDev > > create_replica_dev(std::string const& group_id, + std::set< std::string, std::less<> >&& members) = 0; + + virtual folly::SemiFuture< ReplServiceError > + replace_member(std::string const& group_id, std::string const& member_out, std::string const& member_in) const = 0; +}; +} // namespace homestore diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index 6f09786f1..11b313de7 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -29,7 +29,7 @@ class VChunk { void set_user_private(const sisl::blob& data); const uint8_t* get_user_private() const; - 
blk_cap_t available_blks() const; + blk_num_t available_blks() const; uint32_t get_pdev_id() const; cshared< Chunk > get_internal_chunk() const; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index f74ed8956..2017e506d 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -67,7 +67,7 @@ void AppendBlkAllocator::on_meta_blk_found(const sisl::byte_view& buf, void* met // // alloc a single block; // -BlkAllocStatus AppendBlkAllocator::alloc(BlkId& bid) { +BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { std::unique_lock lk(m_mtx); if (available_blks() < 1) { COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); @@ -75,9 +75,9 @@ BlkAllocStatus AppendBlkAllocator::alloc(BlkId& bid) { return BlkAllocStatus::SPACE_FULL; } - bid.set(m_last_append_offset, 1, m_chunk_id); + bid = BlkId{m_last_append_offset, 1, m_chunk_id}; - [[maybe_unused]] auto cur_cp = hs()->cp_mgr().cp_guard(); + auto cur_cp = hs()->cp_mgr().cp_guard(); ++m_last_append_offset; --m_freeable_nblks; set_dirty_offset(cur_cp->id() % MAX_CP_COUNT); @@ -90,28 +90,27 @@ BlkAllocStatus AppendBlkAllocator::alloc(BlkId& bid) { // For append blk allocator, the assumption is only one writer will append data on one chunk. // If we want to change above design, we can open this api for vector allocation; // -BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, - std::vector< BlkId >& out_bids) { +BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) { std::unique_lock lk(m_mtx); if (available_blks() < nblks) { COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks()); return BlkAllocStatus::SPACE_FULL; - } else if (nblks > BlkId::max_blks_in_op()) { + } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, BlkId::max_blks_in_op()); + LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); return BlkAllocStatus::FAILED; } // Push 1 blk to the vector which has all the requested nblks; - out_bids.emplace_back(m_last_append_offset, nblks, m_chunk_id); + out_bid = BlkId{m_last_append_offset, nblks, m_chunk_id}; - [[maybe_unused]] auto cur_cp = hs()->cp_mgr().cp_guard(); + auto cur_cp = hs()->cp_mgr().cp_guard(); m_last_append_offset += nblks; m_freeable_nblks -= nblks; - // it is garunteened dirty buffer always contains updates of current_cp or next_cp, it will + // it is guaranteed that dirty buffer always contains updates of current_cp or next_cp, it will // never get dirty buffer from across updates; set_dirty_offset(cur_cp->id() % MAX_CP_COUNT); @@ -155,38 +154,31 @@ void AppendBlkAllocator::clear_dirty_offset(const uint8_t idx) { m_sb[idx]->is_d // void AppendBlkAllocator::free(const BlkId& bid) { std::unique_lock lk(m_mtx); - [[maybe_unused]] auto cur_cp = hs()->cp_mgr().cp_guard(); - const auto n = bid.get_nblks(); + auto cur_cp = hs()->cp_mgr().cp_guard(); + const auto n = bid.blk_count(); m_freeable_nblks += n; - if (bid.get_blk_num() + n == m_last_append_offset) { + if (bid.blk_num() + n == m_last_append_offset) { // we are freeing the the last blk id, let's rewind. 
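        // For instance (illustrative numbers): with m_last_append_offset == 110, freeing
        // BlkId{100, 10, chunk} gives n == 10 and 100 + 10 == 110, so the offset rewinds to 100
        // and those 10 blks can be handed out again by the next append allocation.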
m_last_append_offset -= n; } set_dirty_offset(cur_cp->id() % MAX_CP_COUNT); } -void AppendBlkAllocator::free(const std::vector< BlkId >& blk_ids) { - for (const auto b : blk_ids) { - this->free(b); - } -} - -blk_cap_t AppendBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } +blk_num_t AppendBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } -blk_cap_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset; } +blk_num_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset; } bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const { // blk_num starts from 0; - return in_bid.get_blk_num() < get_used_blks(); + return in_bid.blk_num() < get_used_blks(); } std::string AppendBlkAllocator::get_name() const { return "AppendBlkAlloc_chunk_" + std::to_string(m_chunk_id); } std::string AppendBlkAllocator::to_string() const { - auto str = fmt::format("{}, last_append_offset: {}", get_name(), m_last_append_offset); - return str; + return fmt::format("{}, last_append_offset: {}", get_name(), m_last_append_offset); } -blk_cap_t AppendBlkAllocator::get_freeable_nblks() const { return m_freeable_nblks; } +blk_num_t AppendBlkAllocator::get_freeable_nblks() const { return m_freeable_nblks; } } // namespace homestore diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h index ebfcdf61e..3c05aaabe 100644 --- a/src/lib/blkalloc/append_blk_allocator.h +++ b/src/lib/blkalloc/append_blk_allocator.h @@ -34,8 +34,8 @@ struct append_blkalloc_ctx { uint32_t version{append_blkalloc_sb_version}; bool is_dirty; // this field is needed for cp_flush, but not necessarily needed for persistence; uint64_t allocator_id; - uint64_t freeable_nblks; - uint64_t last_append_offset; + blk_num_t freeable_nblks; + blk_num_t last_append_offset; }; #pragma pack() @@ -75,15 +75,13 @@ class AppendBlkAllocator : public BlkAllocator { AppendBlkAllocator& operator=(AppendBlkAllocator&&) noexcept = delete; virtual ~AppendBlkAllocator() = default; - BlkAllocStatus alloc(BlkId& bid) override; - BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) override; + BlkAllocStatus alloc_contiguous(BlkId& bid) override; + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; + void free(BlkId const& b) override; - void free(const std::vector< BlkId >& blk_ids) override; - void free(const BlkId& b) override; - - blk_cap_t available_blks() const override; - blk_cap_t get_used_blks() const override; - blk_cap_t get_freeable_nblks() const; + blk_num_t available_blks() const override; + blk_num_t get_used_blks() const override; + blk_num_t get_freeable_nblks() const; bool is_blk_alloced(const BlkId& in_bid, bool use_lock = false) const override; std::string to_string() const override; @@ -102,9 +100,9 @@ class AppendBlkAllocator : public BlkAllocator { void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); private: - std::mutex m_mtx; // thread_safe, TODO: open option for consumer to choose to go lockless; - uint64_t m_last_append_offset{0}; // last appended offset in blocks; - uint64_t m_freeable_nblks{0}; + std::mutex m_mtx; // thread_safe, TODO: open option for consumer to choose to go lockless; + blk_num_t m_last_append_offset{0}; // last appended offset in blocks; + blk_num_t m_freeable_nblks{0}; AppendBlkAllocMetrics m_metrics; std::array< superblk< append_blkalloc_ctx >, MAX_CP_COUNT > 
m_sb; }; diff --git a/src/lib/blkalloc/blk.cpp b/src/lib/blkalloc/blk.cpp index affd94b83..29507dcf3 100644 --- a/src/lib/blkalloc/blk.cpp +++ b/src/lib/blkalloc/blk.cpp @@ -17,94 +17,162 @@ #include "common/homestore_assert.hpp" namespace homestore { +BlkId::BlkId(uint64_t id_int) { + *r_cast< uint64_t* >(&s) = id_int; + DEBUG_ASSERT_EQ(is_multi(), 0, "MultiBlkId is set on BlkId constructor"); +} + +BlkId::BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) : s{0x0, blk_num, nblks, chunk_num} {} + +uint64_t BlkId::to_integer() const { return *r_cast< const uint64_t* >(&s); } + +sisl::blob BlkId::serialize() { return sisl::blob{r_cast< uint8_t* >(&s), sizeof(serialized)}; } + +uint32_t BlkId::serialized_size() const { return sizeof(BlkId); } + +void BlkId::deserialize(sisl::blob const& b, bool copy) { + serialized* other = r_cast< serialized* >(b.bytes); + s = *other; +} + +void BlkId::invalidate() { s.m_nblks = 0; } + +bool BlkId::is_valid() const { return (blk_count() > 0); } + +std::string BlkId::to_string() const { + return is_valid() ? fmt::format("BlkNum={} nblks={} chunk={}", blk_num(), blk_count(), chunk_num()) + : "Invalid_Blkid"; +} + int BlkId::compare(const BlkId& one, const BlkId& two) { - if (one.m_chunk_num > two.m_chunk_num) { + if (one.chunk_num() < two.chunk_num()) { return -1; - } else if (one.m_chunk_num < two.m_chunk_num) { + } else if (one.chunk_num() > two.chunk_num()) { return 1; } - if (one.m_blk_num > two.m_blk_num) { + if (one.blk_num() < two.blk_num()) { return -1; - } else if (one.m_blk_num < two.m_blk_num) { + } else if (one.blk_num() > two.blk_num()) { return 1; } - if (one.m_nblks > two.m_nblks) { + if (one.blk_count() < two.blk_count()) { return -1; - } else if (one.m_nblks < two.m_nblks) { + } else if (one.blk_count() > two.blk_count()) { return 1; } return 0; } -uint64_t BlkId::to_integer() const { - const uint64_t val{m_blk_num | (static_cast< uint64_t >(m_nblks) << BLK_NUM_BITS) | - (static_cast< uint64_t >(m_chunk_num) << (BLK_NUM_BITS + NBLKS_BITS))}; - return val; -} - -BlkId::BlkId(uint64_t id_int) { set(id_int); } -BlkId::BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) { set(blk_num, nblks, chunk_num); } - -void BlkId::invalidate() { set(blk_num_t{0}, blk_count_t{0}, s_chunk_num_mask); } +//////////////////////////////////// MultiBlkId Section ////////////////////////////// +MultiBlkId::MultiBlkId() : BlkId::BlkId() { s.m_is_multi = 1; } -bool BlkId::is_valid() const { return (m_chunk_num != s_chunk_num_mask); } +MultiBlkId::MultiBlkId(BlkId const& b) : BlkId::BlkId(b) { s.m_is_multi = 1; } -BlkId BlkId::get_blkid_at(uint32_t offset, uint32_t pagesz) const { - assert(offset % pagesz == 0); - const uint32_t remaining_size{((get_nblks() - (offset / pagesz)) * pagesz)}; - return (get_blkid_at(offset, remaining_size, pagesz)); +MultiBlkId::MultiBlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) : + BlkId::BlkId{blk_num, nblks, chunk_num} { + s.m_is_multi = 1; } -BlkId BlkId::get_blkid_at(uint32_t offset, uint32_t size, uint32_t pagesz) const { - assert(size % pagesz == 0); - assert(offset % pagesz == 0); +void MultiBlkId::add(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) { + if (BlkId::is_valid()) { + RELEASE_ASSERT_EQ(s.m_chunk_num, chunk_num, "MultiBlkId has to be all from same chunk"); + RELEASE_ASSERT_LT(n_addln_piece, max_addln_pieces, "MultiBlkId cannot support more than {} pieces", + max_addln_pieces + 1); + addln_pieces[n_addln_piece] = chain_blkid{.m_blk_num = blk_num, .m_nblks = 
nblks}; + ++n_addln_piece; + } else { + s = BlkId::serialized{0x1, blk_num, nblks, chunk_num}; + } +} - BlkId other; +void MultiBlkId::add(BlkId const& b) { add(b.blk_num(), b.blk_count(), b.chunk_num()); } - other.set_blk_num(get_blk_num() + (offset / pagesz)); - other.set_nblks(size / pagesz); - other.set_chunk_num(get_chunk_num()); +sisl::blob MultiBlkId::serialize() { return sisl::blob{r_cast< uint8_t* >(this), serialized_size()}; } - assert(other.get_blk_num() < get_blk_num() + get_nblks()); - assert((other.get_blk_num() + other.get_nblks()) <= (get_blk_num() + get_nblks())); - return other; +uint32_t MultiBlkId::serialized_size() const { + uint32_t sz = BlkId::serialized_size(); + if (n_addln_piece != 0) { sz += sizeof(uint16_t) + (n_addln_piece * sizeof(chain_blkid)); } + return sz; } -void BlkId::set(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) { - set_blk_num(blk_num); - set_nblks(nblks); - set_chunk_num(chunk_num); +void MultiBlkId::deserialize(sisl::blob const& b, bool copy) { + MultiBlkId* other = r_cast< MultiBlkId* >(b.bytes); + s = other->s; + if (b.size == sizeof(BlkId)) { + n_addln_piece = 0; + } else { + n_addln_piece = other->n_addln_piece; + std::copy(other->addln_pieces.begin(), other->addln_pieces.begin() + other->n_addln_piece, + addln_pieces.begin()); + } } -void BlkId::set(const BlkId& bid) { set(bid.get_blk_num(), bid.get_nblks(), bid.get_chunk_num()); } +uint16_t MultiBlkId::num_pieces() const { return BlkId::is_valid() ? n_addln_piece + 1 : 0; } -void BlkId::set(uint64_t id_int) { - HS_DBG_ASSERT_LE(id_int, max_id_int()); - m_blk_num = (id_int & s_blk_num_mask); - m_nblks = static_cast< blk_count_t >((id_int >> BLK_NUM_BITS) & s_nblks_mask); - m_chunk_num = static_cast< chunk_num_t >((id_int >> (BLK_NUM_BITS + NBLKS_BITS)) & s_chunk_num_mask); -} +bool MultiBlkId::has_room() const { return (n_addln_piece < max_addln_pieces); } + +MultiBlkId::iterator MultiBlkId::iterate() const { return MultiBlkId::iterator{*this}; } -void BlkId::set_blk_num(blk_num_t blk_num) { - HS_DBG_ASSERT_LE(blk_num, s_blk_num_mask); - m_blk_num = blk_num; +std::string MultiBlkId::to_string() const { + std::string str = "MultiBlks: {"; + auto it = iterate(); + while (auto const b = it.next()) { + str += b->to_string(); + } + str += std::string("}"); + return str; } -void BlkId::set_nblks(blk_count_t nblks) { - HS_DBG_ASSERT_LE(nblks, max_blks_in_op()); - m_nblks = static_cast< blk_count_serialized_t >(nblks - 1); +blk_count_t MultiBlkId::blk_count() const { + blk_count_t nblks{0}; + auto it = iterate(); + while (auto b = it.next()) { + nblks += b->blk_count(); + } + return nblks; } -void BlkId::set_chunk_num(chunk_num_t chunk_num) { - HS_DBG_ASSERT_LE(chunk_num, s_chunk_num_mask); - m_chunk_num = chunk_num; +BlkId MultiBlkId::to_single_blkid() const { + HS_DBG_ASSERT_LE(num_pieces(), 1, "Can only MultiBlkId with one piece to BlkId"); + return BlkId{blk_num(), blk_count(), chunk_num()}; } -std::string BlkId::to_string() const { - return is_valid() ? 
fmt::format("BlkNum={} nblks={} chunk={}", get_blk_num(), get_nblks(), get_chunk_num()) - : "Invalid_Blkid"; +int MultiBlkId::compare(MultiBlkId const& left, MultiBlkId const& right) { + if (left.chunk_num() < right.chunk_num()) { + return -1; + } else if (left.chunk_num() > right.chunk_num()) { + return 1; + } + + // Shortcut path for simple BlkId search to avoid building icl set + if ((left.num_pieces() == 1) && (right.num_pieces() == 1)) { + return BlkId::compare(d_cast< BlkId const& >(left), d_cast< BlkId const& >(right)); + } + + using IntervalSet = boost::icl::interval_set< uint64_t >; + using Interval = IntervalSet::interval_type; + + IntervalSet lset; + auto lit = left.iterate(); + while (auto b = lit.next()) { + lset.insert(Interval::right_open(b->blk_num(), b->blk_num() + b->blk_count())); + } + + IntervalSet rset; + auto rit = right.iterate(); + while (auto b = rit.next()) { + rset.insert(Interval::right_open(b->blk_num(), b->blk_num() + b->blk_count())); + } + + if (lset < rset) { + return -1; + } else if (lset > rset) { + return 1; + } else { + return 0; + } } } // namespace homestore diff --git a/src/lib/blkalloc/blk_allocator.cpp b/src/lib/blkalloc/blk_allocator.cpp index 4aa3269d0..92c44df53 100644 --- a/src/lib/blkalloc/blk_allocator.cpp +++ b/src/lib/blkalloc/blk_allocator.cpp @@ -66,41 +66,54 @@ bool BlkAllocator::is_blk_alloced_on_disk(const BlkId& b, bool use_lock) const { if (!auto_recovery_on()) { return true; // nothing to compare. So always return true } - auto bits_set{[this, &b]() { - if (!get_disk_bm_const()->is_bits_set(b.get_blk_num(), b.get_nblks())) { return false; } + auto bits_set = [this](BlkId const& b) { + if (!get_disk_bm_const()->is_bits_set(b.blk_num(), b.blk_count())) { return false; } return true; - }}; + }; + if (use_lock) { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); + const BlkAllocPortion& portion = blknum_to_portion_const(b.blk_num()); auto lock{portion.portion_auto_lock()}; - return bits_set(); + return bits_set(b); } else { - return bits_set(); + return bits_set(b); } } -BlkAllocStatus BlkAllocator::alloc_on_disk(const BlkId& in_bid) { +BlkAllocStatus BlkAllocator::alloc_on_disk(BlkId const& bid) { if (!auto_recovery_on() && m_inited) { return BlkAllocStatus::FAILED; } rcu_read_lock(); auto list = get_alloc_blk_list(); if (list) { // cp has started, accumulating to the list - list->push_back(in_bid); + list->push_back(bid); } else { + auto set_on_disk_bm = [this](auto& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_reset(b.blk_num(), b.blk_count()), + "Expected disk blks to reset"); + } + get_disk_bm_mutable()->set_bits(b.blk_num(), b.blk_count()); + portion.decrease_available_blocks(b.blk_count()); + BLKALLOC_LOG(DEBUG, "blks allocated {} chunk number {}", b.to_string(), m_chunk_id); + } + }; + // cp is not started or already done, allocate on disk bm directly; /* enable this assert later when reboot is supported */ // assert(auto_recovery_on() || !m_inited); - BlkAllocPortion& portion = blknum_to_portion(in_bid.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_reset(in_bid.get_blk_num(), in_bid.get_nblks()), - "Expected disk blks to reset"); + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto b = it.next()) { + 
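                // Each piece of the MultiBlkId may fall in a different portion; set_on_disk_bm()
                // re-derives the portion from the piece's blk_num and takes that portion's lock
                // before setting its bits on the disk bitmap.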
set_on_disk_bm(*b); } - get_disk_bm_mutable()->set_bits(in_bid.get_blk_num(), in_bid.get_nblks()); - portion.decrease_available_blocks(in_bid.get_nblks()); - BLKALLOC_LOG(DEBUG, "blks allocated {} chunk number {}", in_bid.to_string(), m_chunk_id); + } else { + set_on_disk_bm(bid); } } rcu_read_unlock(); @@ -108,29 +121,42 @@ BlkAllocStatus BlkAllocator::alloc_on_disk(const BlkId& in_bid) { return BlkAllocStatus::SUCCESS; } -BlkAllocStatus BlkAllocator::alloc_on_realtime(const BlkId& b) { +BlkAllocStatus BlkAllocator::alloc_on_realtime(BlkId const& bid) { if (!realtime_bm_on()) { return BlkAllocStatus::SUCCESS; } if (!auto_recovery_on() && m_inited) { return BlkAllocStatus::FAILED; } - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - if (!get_realtime_bm()->is_bits_reset(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_LOG(ERROR, "bit not reset {} nblks {} chunk number {}", b.get_blk_num(), b.get_nblks(), - m_chunk_id); - for (blk_count_t i{0}; i < b.get_nblks(); ++i) { - if (!get_disk_bm_const()->is_bits_reset(b.get_blk_num() + i, 1)) { - BLKALLOC_LOG(ERROR, "bit not reset {}", b.get_blk_num() + i); + + auto set_on_realtime_bm = [this](BlkId const& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + if (!get_realtime_bm()->is_bits_reset(b.blk_num(), b.blk_count())) { + BLKALLOC_LOG(ERROR, "bit not reset {} nblks {} chunk number {}", b.blk_num(), b.blk_count(), + m_chunk_id); + for (blk_count_t i{0}; i < b.blk_count(); ++i) { + if (!get_disk_bm_const()->is_bits_reset(b.blk_num() + i, 1)) { + BLKALLOC_LOG(ERROR, "bit not reset {}", b.blk_num() + i); + } } + BLKALLOC_REL_ASSERT(get_realtime_bm()->is_bits_reset(b.blk_num(), b.blk_count()), + "Expected disk bits to reset blk num {} num blks {}", b.blk_num(), + b.blk_count()); } - BLKALLOC_REL_ASSERT(get_realtime_bm()->is_bits_reset(b.get_blk_num(), b.get_nblks()), - "Expected disk bits to reset blk num {} num blks {}", b.get_blk_num(), - b.get_nblks()); } + get_realtime_bm()->set_bits(b.blk_num(), b.blk_count()); + BLKALLOC_LOG(DEBUG, "realtime blks allocated {} chunk number {}", b.to_string(), m_chunk_id); + } + }; + + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + set_on_realtime_bm(*b); } - get_realtime_bm()->set_bits(b.get_blk_num(), b.get_nblks()); - BLKALLOC_LOG(DEBUG, "realtime blks allocated {} chunk number {}", b.to_string(), m_chunk_id); + } else { + set_on_realtime_bm(bid); } return BlkAllocStatus::SUCCESS; @@ -139,60 +165,90 @@ BlkAllocStatus BlkAllocator::alloc_on_realtime(const BlkId& b) { // // Caller should consume the return value and print context when return false; // -bool BlkAllocator::free_on_realtime(const BlkId& b) { +bool BlkAllocator::free_on_realtime(BlkId const& bid) { if (!realtime_bm_on()) { return true; } /* this api should be called only when auto recovery is enabled */ assert(auto_recovery_on()); - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - /* During recovery we might try to free the entry which is already freed while replaying the journal, - * This assert is valid only post recovery. 
- */ - if (!get_realtime_bm()->is_bits_set(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_LOG(ERROR, "{}, bit not set {} nblks{} chunk number {}", b.to_string(), b.get_blk_num(), - b.get_nblks(), m_chunk_id); - for (blk_count_t i{0}; i < b.get_nblks(); ++i) { - if (!get_realtime_bm()->is_bits_set(b.get_blk_num() + i, 1)) { - BLKALLOC_LOG(ERROR, "bit not set {}", b.get_blk_num() + i); + + auto unset_on_realtime_bm = [this](BlkId const& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + /* During recovery we might try to free the entry which is already freed while replaying the journal, + * This assert is valid only post recovery. + */ + if (!get_realtime_bm()->is_bits_set(b.blk_num(), b.blk_count())) { + BLKALLOC_LOG(ERROR, "{}, bit not set {} nblks{} chunk number {}", b.to_string(), b.blk_num(), + b.blk_count(), m_chunk_id); + for (blk_count_t i{0}; i < b.blk_count(); ++i) { + if (!get_realtime_bm()->is_bits_set(b.blk_num() + i, 1)) { + BLKALLOC_LOG(ERROR, "bit not set {}", b.blk_num() + i); + } } + return false; } - return false; } + + BLKALLOC_LOG(DEBUG, "realtime: free bid: {}", b.to_string()); + get_realtime_bm()->reset_bits(b.blk_num(), b.blk_count()); + return true; } + }; - BLKALLOC_LOG(DEBUG, "realtime: free bid: {}", b.to_string()); - get_realtime_bm()->reset_bits(b.get_blk_num(), b.get_nblks()); - return true; + bool ret{true}; + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + if (!unset_on_realtime_bm(*b)) { + ret = false; + break; + } + } + } else { + ret = unset_on_realtime_bm(bid); } + return ret; } -void BlkAllocator::free_on_disk(const BlkId& b) { +void BlkAllocator::free_on_disk(BlkId const& bid) { /* this api should be called only when auto recovery is enabled */ assert(auto_recovery_on()); - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - /* During recovery we might try to free the entry which is already freed while replaying the journal, - * This assert is valid only post recovery. - */ - if (!get_disk_bm_const()->is_bits_set(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_LOG(ERROR, "bit not set {} nblks {} chunk number {}", b.get_blk_num(), b.get_nblks(), - m_chunk_id); - for (blk_count_t i{0}; i < b.get_nblks(); ++i) { - if (!get_disk_bm_const()->is_bits_set(b.get_blk_num() + i, 1)) { - BLKALLOC_LOG(ERROR, "bit not set {}", b.get_blk_num() + i); + auto unset_on_disk_bm = [this](auto& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + /* During recovery we might try to free the entry which is already freed while replaying the journal, + * This assert is valid only post recovery. 
+ */ + if (!get_disk_bm_const()->is_bits_set(b.blk_num(), b.blk_count())) { + BLKALLOC_LOG(ERROR, "bit not set {} nblks {} chunk number {}", b.blk_num(), b.blk_count(), + m_chunk_id); + for (blk_count_t i{0}; i < b.blk_count(); ++i) { + if (!get_disk_bm_const()->is_bits_set(b.blk_num() + i, 1)) { + BLKALLOC_LOG(ERROR, "bit not set {}", b.blk_num() + i); + } } + BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(b.blk_num(), b.blk_count()), + "Expected disk bits to set blk num {} num blks {}", b.blk_num(), b.blk_count()); } - BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(b.get_blk_num(), b.get_nblks()), - "Expected disk bits to set blk num {} num blks {}", b.get_blk_num(), b.get_nblks()); } + get_disk_bm_mutable()->reset_bits(b.blk_num(), b.blk_count()); + portion.increase_available_blocks(b.blk_count()); + } + }; + + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + unset_on_disk_bm(*b); } - get_disk_bm_mutable()->reset_bits(b.get_blk_num(), b.get_nblks()); - portion.increase_available_blocks(b.get_nblks()); + } else { + unset_on_disk_bm(bid); } } @@ -229,9 +285,9 @@ void BlkAllocator::create_debug_bm() { } void BlkAllocator::update_debug_bm(const BlkId& bid) { - BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(bid.get_blk_num(), bid.get_nblks()), - "Expected disk bits to set blk num {} num blks {}", bid.get_blk_num(), bid.get_nblks()); - get_debug_bm()->set_bits(bid.get_blk_num(), bid.get_nblks()); + BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(bid.blk_num(), bid.blk_count()), + "Expected disk bits to set blk num {} num blks {}", bid.blk_num(), bid.blk_count()); + get_debug_bm()->set_bits(bid.blk_num(), bid.blk_count()); } bool BlkAllocator::verify_debug_bm(bool free_debug_bm) { diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index 43f49842e..fb75bc0f4 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -63,8 +63,8 @@ struct BlkAllocConfig { public: const uint32_t m_blk_size; const uint32_t m_align_size; - const blk_cap_t m_capacity; - const blk_cap_t m_blks_per_portion; + const blk_num_t m_capacity; + const blk_num_t m_blks_per_portion; const std::string m_unique_name; bool m_auto_recovery{false}; bool m_realtime_bm_on{false}; // only specifically turn off in BlkAlloc Test; @@ -74,7 +74,7 @@ struct BlkAllocConfig { bool realtime_bm_on = true) : m_blk_size{blk_size}, m_align_size{align_size}, - m_capacity{static_cast< blk_cap_t >(size / blk_size)}, + m_capacity{static_cast< blk_num_t >(size / blk_size)}, m_blks_per_portion{std::min(HS_DYNAMIC_CONFIG(blkallocator.num_blks_per_portion), m_capacity)}, m_unique_name{name} { #ifdef _PRERELEASE @@ -86,9 +86,9 @@ struct BlkAllocConfig { #endif } - BlkAllocConfig(const BlkAllocConfig&) = default; + BlkAllocConfig(BlkAllocConfig const&) = default; BlkAllocConfig(BlkAllocConfig&&) noexcept = delete; - BlkAllocConfig& operator=(const BlkAllocConfig&) = default; + BlkAllocConfig& operator=(BlkAllocConfig const&) = default; BlkAllocConfig& operator=(BlkAllocConfig&&) noexcept = delete; virtual ~BlkAllocConfig() = default; void set_auto_recovery(bool is_auto_recovery) { m_auto_recovery = is_auto_recovery; } @@ -118,9 +118,9 @@ class BlkAllocPortion { public: BlkAllocPortion(blk_temp_t temp = default_temperature()) : m_temperature(temp) {} ~BlkAllocPortion() = default; - BlkAllocPortion(const BlkAllocPortion&) = delete; + BlkAllocPortion(BlkAllocPortion const&) = 
delete; BlkAllocPortion(BlkAllocPortion&&) noexcept = delete; - BlkAllocPortion& operator=(const BlkAllocPortion&) = delete; + BlkAllocPortion& operator=(BlkAllocPortion const&) = delete; BlkAllocPortion& operator=(BlkAllocPortion&&) noexcept = delete; auto portion_auto_lock() const { return std::scoped_lock< std::mutex >(m_blk_lock); } @@ -175,20 +175,19 @@ class BlkAllocPortion { class CP; class BlkAllocator { public: - BlkAllocator(const BlkAllocConfig& cfg, chunk_num_t id = 0); - BlkAllocator(const BlkAllocator&) = delete; + BlkAllocator(BlkAllocConfig const& cfg, chunk_num_t id = 0); + BlkAllocator(BlkAllocator const&) = delete; BlkAllocator(BlkAllocator&&) noexcept = delete; - BlkAllocator& operator=(const BlkAllocator&) = delete; + BlkAllocator& operator=(BlkAllocator const&) = delete; BlkAllocator& operator=(BlkAllocator&&) noexcept = delete; virtual ~BlkAllocator() = default; - virtual BlkAllocStatus alloc(BlkId& bid) = 0; - virtual BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) = 0; - virtual void free(const std::vector< BlkId >& blk_ids) = 0; - virtual void free(const BlkId& id) = 0; - virtual blk_cap_t available_blks() const = 0; - virtual blk_cap_t get_used_blks() const = 0; - virtual bool is_blk_alloced(const BlkId& b, bool use_lock = false) const = 0; + virtual BlkAllocStatus alloc_contiguous(BlkId& bid) = 0; + virtual BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) = 0; + virtual void free(BlkId const& id) = 0; + virtual blk_num_t available_blks() const = 0; + virtual blk_num_t get_used_blks() const = 0; + virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0; virtual std::string to_string() const = 0; virtual void cp_flush(CP* cp); // TODO: it needs to be a pure virtual function after bitmap blkallocator is derived @@ -217,30 +216,23 @@ class BlkAllocator { void decr_alloced_blk_count(blk_count_t nblks) { m_alloced_blk_count.fetch_sub(nblks, std::memory_order_relaxed); } int64_t get_alloced_blk_count() const { return m_alloced_blk_count.load(std::memory_order_acquire); } - bool is_blk_alloced_on_disk(const BlkId& b, bool use_lock = false) const; /* It is used during recovery in both mode :- auto recovery and manual recovery * It is also used in normal IO during auto recovery mode. */ - BlkAllocStatus alloc_on_disk(const BlkId& in_bid); + BlkAllocStatus alloc_on_disk(BlkId const& in_bid); - BlkAllocStatus alloc_on_realtime(const BlkId& b); + BlkAllocStatus alloc_on_realtime(BlkId const& b); + + bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const; // // Caller should consume the return value and print context when return false; // + [[nodiscard]] bool free_on_realtime(BlkId const& b); - bool free_on_realtime(const BlkId& b); - - void free_on_disk(const BlkId& b); - - // Acquire the underlying bitmap buffer and while the caller has acquired, all the new allocations - // will be captured in a separate list and then pushes into buffer once released. - // NOTE: THIS IS NON-THREAD SAFE METHOD. Caller is expected to ensure synchronization between multiple - // acquires/releases - sisl::byte_array acquire_underlying_buffer(); - void release_underlying_buffer(); + void free_on_disk(BlkId const& b); /* CP start is called when all its consumers have purged their free lists and now want to persist the * disk bitmap. 
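// Sketch of the resulting call pattern against the interface above (illustration only, not
// part of this patch): `ba` stands for any concrete BlkAllocator, the block count is
// arbitrary, and the function name is made up. A MultiBlkId is passed wherever a BlkId& is
// expected, which is how multi-piece results are returned without a std::vector< BlkId >.
inline void example_alloc_and_free(BlkAllocator* ba) {
    MultiBlkId out;
    blk_alloc_hints hints; // defaults; set hints.is_contiguous etc. as needed
    if (ba->alloc(8, hints, out) == BlkAllocStatus::SUCCESS) {
        // ... consume the pieces via out.iterate() ...
        ba->free(out); // the same free() accepts single and multi blkids
    }
}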
@@ -250,21 +242,21 @@ class BlkAllocator { // void cp_done(); uint32_t get_align_size() const { return m_align_size; } - blk_cap_t get_total_blks() const { return m_num_blks; } - blk_cap_t get_blks_per_portion() const { return m_blks_per_portion; } - blk_cap_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } + blk_num_t get_total_blks() const { return m_num_blks; } + blk_num_t get_blks_per_portion() const { return m_blks_per_portion; } + blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } const std::string& get_name() const { return m_name; } bool auto_recovery_on() const { return m_auto_recovery; } uint32_t get_blk_size() const { return m_blk_size; } blk_num_t blknum_to_portion_num(const blk_num_t blknum) const { return blknum / m_blks_per_portion; } BlkAllocPortion& blknum_to_portion(blk_num_t blknum) { return m_blk_portions[blknum_to_portion_num(blknum)]; } - const BlkAllocPortion& blknum_to_portion_const(blk_num_t blknum) const { + BlkAllocPortion const& blknum_to_portion_const(blk_num_t blknum) const { return m_blk_portions[blknum_to_portion_num(blknum)]; } void create_debug_bm(); - void update_debug_bm(const BlkId& bid); + void update_debug_bm(BlkId const& bid); bool verify_debug_bm(bool free_debug_bm); /* Get status */ @@ -278,12 +270,19 @@ class BlkAllocator { sisl::ThreadVector< BlkId >* get_alloc_blk_list(); void set_disk_bm_dirty() { is_disk_bm_dirty = true; } + // Acquire the underlying bitmap buffer and while the caller has acquired, all the new allocations + // will be captured in a separate list and then pushes into buffer once released. + // NOTE: THIS IS NON-THREAD SAFE METHOD. Caller is expected to ensure synchronization between multiple + // acquires/releases + sisl::byte_array acquire_underlying_buffer(); + void release_underlying_buffer(); + protected: const std::string m_name; const uint32_t m_blk_size; const uint32_t m_align_size; - const blk_cap_t m_num_blks; - blk_cap_t m_blks_per_portion; + const blk_num_t m_num_blks; + blk_num_t m_blks_per_portion; const bool m_auto_recovery{false}; const bool m_realtime_bm_on{false}; // only specifically turn off in BlkAlloc Test; bool m_inited{false}; @@ -307,22 +306,21 @@ class BlkAllocator { */ class FixedBlkAllocator : public BlkAllocator { public: - FixedBlkAllocator(const BlkAllocConfig& cfg, bool init, chunk_num_t chunk_id); - FixedBlkAllocator(const FixedBlkAllocator&) = delete; + FixedBlkAllocator(BlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id); + FixedBlkAllocator(FixedBlkAllocator const&) = delete; FixedBlkAllocator(FixedBlkAllocator&&) noexcept = delete; - FixedBlkAllocator& operator=(const FixedBlkAllocator&) = delete; + FixedBlkAllocator& operator=(FixedBlkAllocator const&) = delete; FixedBlkAllocator& operator=(FixedBlkAllocator&&) noexcept = delete; ~FixedBlkAllocator() override = default; - BlkAllocStatus alloc(BlkId& bid) override; - BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) override; - void free(const std::vector< BlkId >& blk_ids) override; - void free(const BlkId& b) override; + BlkAllocStatus alloc_contiguous(BlkId& bid) override; + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; + void free(BlkId const& b) override; void inited() override; - blk_cap_t available_blks() const override; - blk_cap_t get_used_blks() const override; - bool is_blk_alloced(const BlkId& in_bid, bool use_lock = false) const override; + blk_num_t 
available_blks() const override; + blk_num_t get_used_blks() const override; + bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; private: diff --git a/src/lib/blkalloc/blk_cache.h b/src/lib/blkalloc/blk_cache.h index 7b0c5d0e2..132230465 100644 --- a/src/lib/blkalloc/blk_cache.h +++ b/src/lib/blkalloc/blk_cache.h @@ -30,7 +30,7 @@ #include "common/homestore_assert.hpp" namespace homestore { -typedef blk_count_t slab_idx_t; +using slab_idx_t = blk_count_t; static constexpr uint16_t slab_tbl_size{257}; @@ -77,30 +77,30 @@ struct blk_cache_entry { blk_cache_entry() : blk_cache_entry{0, 0, 0} {} blk_cache_entry(const blk_num_t blk_num, const blk_count_t nblks, const blk_temp_t temp) { set_blk_num(blk_num); - set_nblks(nblks); + set_blk_count(nblks); set_temperature(temp); } void set_blk_num(const blk_num_t blk_num) { m_blk_num = blk_num; } [[nodiscard]] blk_num_t get_blk_num() const { return m_blk_num; } - void set_nblks(const blk_count_t nblks) { - HS_DBG_ASSERT_LE(nblks, BlkId::max_blks_in_op()); - m_nblks = static_cast< blk_count_serialized_t >(nblks - 1); + void set_blk_count(const blk_count_t nblks) { + HS_DBG_ASSERT_LE(nblks, max_blks_per_blkid()); + m_nblks = nblks; } - [[nodiscard]] blk_count_t get_nblks() const { return static_cast< blk_count_t >(m_nblks) + 1; } + [[nodiscard]] blk_count_t blk_count() const { return m_nblks; } void set_temperature(const blk_temp_t temp) { m_temp = temp; } [[nodiscard]] blk_temp_t get_temperature() const { return m_temp; } [[nodiscard]] std::string to_string() const { - return fmt::format("BlkNum={} nblks={} temp={}", get_blk_num(), get_nblks(), get_temperature()); + return fmt::format("BlkNum={} nblks={} temp={}", get_blk_num(), blk_count(), get_temperature()); } private: - blk_num_t m_blk_num; // Blk number within the chunk - blk_count_serialized_t m_nblks; // Total number of blocks - blk_temp_t m_temp; // Temperature of each page + blk_num_t m_blk_num; // Blk number within the chunk + blk_count_t m_nblks; // Total number of blocks + blk_temp_t m_temp; // Temperature of each page }; #pragma pack() @@ -143,8 +143,8 @@ struct blk_cache_fill_req { }; struct blk_cache_refill_status { - blk_cap_t slab_required_count{0}; - blk_cap_t slab_refilled_count{0}; + blk_num_t slab_required_count{0}; + blk_num_t slab_refilled_count{0}; [[nodiscard]] bool need_refill() const { return (slab_required_count && (slab_refilled_count != slab_required_count)); @@ -160,9 +160,9 @@ struct blk_cache_refill_status { struct blk_cache_fill_session { uint64_t session_id; std::vector< blk_cache_refill_status > slab_requirements; // A slot for each slab about count of required/refilled - blk_cap_t overall_refilled_num_blks{0}; + blk_num_t overall_refilled_num_blks{0}; bool overall_refill_done{false}; - std::atomic< blk_cap_t > urgent_refill_blks_count{0}; // Send notification after approx this much blks refilled + std::atomic< blk_num_t > urgent_refill_blks_count{0}; // Send notification after approx this much blks refilled [[nodiscard]] static uint64_t gen_session_id() { static std::atomic< uint64_t > s_session_id{1}; @@ -179,7 +179,7 @@ struct blk_cache_fill_session { slab_requirements.reserve(num_slabs); } - void urgent_need_atleast(const blk_cap_t wait_count) { + void urgent_need_atleast(const blk_num_t wait_count) { urgent_refill_blks_count.store(overall_refilled_num_blks + wait_count, std::memory_order_release); } @@ -211,7 +211,7 @@ struct blk_cache_fill_session { struct SlabCacheConfig { struct 
_slab_config { blk_count_t slab_size; // Size of this slab (in terms of number of blks) - blk_cap_t max_entries; // Max entries allowed in this slab + blk_num_t max_entries; // Max entries allowed in this slab float refill_threshold_pct; // At what percentage empty should we start refilling this slab cache std::vector< float > m_level_distribution_pct; // How to distribute entries into multiple levels std::string m_name; // Name of the base blk allocator @@ -257,7 +257,7 @@ class FreeBlkCache { std::vector< blk_cache_entry >& excess_blks) = 0; [[maybe_unused]] virtual blk_count_t try_free_blks(const std::vector< blk_cache_entry >& blks, std::vector< blk_cache_entry >& excess_blks) = 0; - [[nodiscard]] virtual blk_cap_t try_fill_cache(const blk_cache_fill_req& fill_req, + [[nodiscard]] virtual blk_num_t try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) = 0; [[nodiscard]] virtual std::shared_ptr< blk_cache_fill_session > @@ -268,8 +268,7 @@ class FreeBlkCache { [[nodiscard]] static slab_idx_t find_slab(const blk_count_t nblks) { if (sisl_unlikely(nblks >= slab_tbl_size)) { - return static_cast< slab_idx_t >((nblks > 1) ? sisl::logBase2(static_cast< blk_count_t >(nblks - 1)) + 1 - : 0); + return s_cast< slab_idx_t >((nblks > 1) ? sisl::logBase2(s_cast< blk_count_t >(nblks - 1)) + 1 : 0); } return nblks_to_slab_tbl[nblks]; } diff --git a/src/lib/blkalloc/blk_cache_queue.cpp b/src/lib/blkalloc/blk_cache_queue.cpp index 5478ce487..87cda9f3e 100644 --- a/src/lib/blkalloc/blk_cache_queue.cpp +++ b/src/lib/blkalloc/blk_cache_queue.cpp @@ -26,16 +26,16 @@ FreeBlkCacheQueue::FreeBlkCacheQueue(const SlabCacheConfig& cfg, BlkAllocMetrics m_slab_queues.reserve(cfg.m_per_slab_cfg.size()); for (const auto& slab_cfg : cfg.m_per_slab_cfg) { - std::vector< blk_cap_t > level_limits; + std::vector< blk_num_t > level_limits; level_limits.reserve(slab_cfg.m_level_distribution_pct.size()); #ifndef NDEBUG HS_DBG_ASSERT_EQ(slab_cfg.slab_size, slab_size, "Slab config size is not contiguous power of 2"); slab_size *= 2; #endif - blk_cap_t sum{0}; + blk_num_t sum{0}; for (const auto& p : slab_cfg.m_level_distribution_pct) { - const blk_cap_t limit{static_cast< blk_cap_t >((static_cast< double >(slab_cfg.max_entries) * p) / 100.0)}; + const blk_num_t limit{static_cast< blk_num_t >((static_cast< double >(slab_cfg.max_entries) * p) / 100.0)}; sum += limit; level_limits.push_back(limit); } @@ -51,10 +51,10 @@ FreeBlkCacheQueue::FreeBlkCacheQueue(const SlabCacheConfig& cfg, BlkAllocMetrics } BlkAllocStatus FreeBlkCacheQueue::try_alloc_blks(const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp) { - const auto slab_idx{std::min(FreeBlkCache::find_slab(req.nblks), req.max_slab_idx)}; + const auto slab_idx = std::min(FreeBlkCache::find_slab(req.nblks), req.max_slab_idx); COUNTER_INCREMENT(slab_metrics(slab_idx), num_slab_alloc, 1); - BlkAllocStatus status{try_alloc_in_slab(slab_idx, req, resp)}; + BlkAllocStatus status = try_alloc_in_slab(slab_idx, req, resp); if (status == BlkAllocStatus::SUCCESS) { BLKALLOC_LOG(TRACE, "Alloced in slab {}", resp.out_blks.front().to_string()); return status; @@ -95,8 +95,8 @@ blk_count_t FreeBlkCacheQueue::try_free_blks(const blk_cache_entry& entry, blk_cache_entry e{entry}; blk_count_t num_zombied{0}; - while (e.get_nblks() > 0) { - const auto [slab_idx, excess]{FreeBlkCache::find_round_down_slab(e.get_nblks())}; + while (e.blk_count() > 0) { + const auto [slab_idx, excess]{FreeBlkCache::find_round_down_slab(e.blk_count())}; #ifndef NDEBUG if 
(slab_idx >= m_slab_queues.size()) { BLKALLOC_LOG(ERROR, "Entry=[{}] slab_idx={} exceeds max slab queues {}", entry.to_string(), slab_idx, @@ -104,15 +104,15 @@ blk_count_t FreeBlkCacheQueue::try_free_blks(const blk_cache_entry& entry, } #endif - e.set_nblks(m_slab_queues[slab_idx]->get_slab_size()); + e.set_blk_count(m_slab_queues[slab_idx]->get_slab_size()); if (!push_slab(slab_idx, e, false /* only_this_level */)) { excess_blks.push_back(e); - num_zombied += e.get_nblks(); + num_zombied += e.blk_count(); } if (excess == 0) { break; } e.set_blk_num(e.get_blk_num() + m_slab_queues[slab_idx]->get_slab_size()); - e.set_nblks(excess); + e.set_blk_count(excess); } return num_zombied; @@ -129,8 +129,8 @@ blk_count_t FreeBlkCacheQueue::try_free_blks(const std::vector< blk_cache_entry return num_zombied; } -blk_cap_t FreeBlkCacheQueue::try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) { - blk_cap_t nblks_added{0}; +blk_num_t FreeBlkCacheQueue::try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) { + blk_num_t nblks_added{0}; slab_idx_t slabs_pending_refill{static_cast< slab_idx_t >(m_slab_queues.size())}; auto slab_idx{FreeBlkCache::find_slab(fill_req.nblks)}; @@ -167,8 +167,8 @@ blk_cap_t FreeBlkCacheQueue::try_fill_cache(const blk_cache_fill_req& fill_req, return (fill_req.nblks - nblks_remain); } -blk_cap_t FreeBlkCacheQueue::total_free_blks() const { - blk_cap_t count{0}; +blk_num_t FreeBlkCacheQueue::total_free_blks() const { + blk_num_t count{0}; for (const auto& sq : m_slab_queues) { count += sq->entry_count() * sq->slab_size(); } @@ -194,13 +194,13 @@ BlkAllocStatus FreeBlkCacheQueue::try_alloc_in_slab(const slab_idx_t slab_idx, c "Residue block count are not expected to exceed last entry"); const blk_count_t needed_blocks{ static_cast< blk_count_t >(m_slab_queues[slab_idx]->slab_size() - residue_nblks)}; - resp.out_blks.back().set_nblks(needed_blocks); + resp.out_blks.back().set_blk_count(needed_blocks); resp.nblks_alloced -= residue_nblks; // Create the trail residue entry and use that to free them. auto residue_e{resp.out_blks.back()}; residue_e.set_blk_num(residue_e.get_blk_num() + needed_blocks); - residue_e.set_nblks(residue_nblks); + residue_e.set_blk_count(residue_nblks); BLKALLOC_LOG(TRACE, "Residue blocks {}", residue_e.to_string()); resp.nblks_zombied += try_free_blks(residue_e, resp.excess_blks); } @@ -292,7 +292,7 @@ std::optional< blk_temp_t > FreeBlkCacheQueue::pop_slab(const slab_idx_t slab_id return ret; } -SlabCacheQueue::SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_cap_t >& level_limits, +SlabCacheQueue::SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_num_t >& level_limits, const float refill_pct, BlkAllocMetrics* parent_metrics) : m_slab_size{slab_size}, m_metrics{m_slab_size, this, parent_metrics} { for (auto& limit : level_limits) { @@ -305,8 +305,8 @@ SlabCacheQueue::SlabCacheQueue(const blk_count_t slab_size, const std::vector< b } std::optional< blk_temp_t > SlabCacheQueue::push(const blk_cache_entry& entry, const bool only_this_level) { - const blk_temp_t start_level{ - static_cast< blk_temp_t >((entry.get_temperature() >= m_level_queues.size()) ? m_level_queues.size() - 1 : entry.get_temperature())}; + const blk_temp_t start_level{static_cast< blk_temp_t >( + (entry.get_temperature() >= m_level_queues.size()) ? 
m_level_queues.size() - 1 : entry.get_temperature())}; blk_temp_t level{start_level}; bool pushed{m_level_queues[start_level]->write(entry)}; @@ -337,20 +337,20 @@ std::optional< blk_temp_t > SlabCacheQueue::pop(const blk_temp_t input_level, co return popped ? std::optional< blk_temp_t >{start_level} : std::nullopt; } -blk_cap_t SlabCacheQueue::entry_count() const { - blk_cap_t sz{0}; +blk_num_t SlabCacheQueue::entry_count() const { + blk_num_t sz{0}; for (size_t l{0}; l < m_level_queues.size(); ++l) { sz += num_level_entries(l); } return sz; } -blk_cap_t SlabCacheQueue::entry_capacity() const { return m_total_capacity; } +blk_num_t SlabCacheQueue::entry_capacity() const { return m_total_capacity; } -blk_cap_t SlabCacheQueue::num_level_entries(const blk_temp_t level) const { return m_level_queues[level]->sizeGuess(); } +blk_num_t SlabCacheQueue::num_level_entries(const blk_temp_t level) const { return m_level_queues[level]->sizeGuess(); } -blk_cap_t SlabCacheQueue::open_session(const uint64_t session_id, const bool fill_entire_cache) { - blk_cap_t count{0}; +blk_num_t SlabCacheQueue::open_session(const uint64_t session_id, const bool fill_entire_cache) { + blk_num_t count{0}; uint64_t id{m_refill_session.load(std::memory_order_acquire)}; if (id == 0) { diff --git a/src/lib/blkalloc/blk_cache_queue.h b/src/lib/blkalloc/blk_cache_queue.h index f9f51315d..87ac901bf 100644 --- a/src/lib/blkalloc/blk_cache_queue.h +++ b/src/lib/blkalloc/blk_cache_queue.h @@ -47,7 +47,7 @@ class SlabMetrics : public sisl::MetricsGroup { class SlabCacheQueue { public: - SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_cap_t >& level_limits, const float refill_pct, + SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_num_t >& level_limits, const float refill_pct, BlkAllocMetrics* metrics); SlabCacheQueue(const SlabCacheQueue&) = delete; SlabCacheQueue(SlabCacheQueue&&) noexcept = delete; @@ -58,15 +58,15 @@ class SlabCacheQueue { [[nodiscard]] std::optional< blk_temp_t > push(const blk_cache_entry& entry, const bool only_this_level); [[nodiscard]] std::optional< blk_temp_t > pop(const blk_temp_t level, const bool only_this_level, blk_cache_entry& out_entry); - [[nodiscard]] blk_cap_t entry_count() const; - [[nodiscard]] blk_cap_t entry_capacity() const; - [[nodiscard]] blk_cap_t num_level_entries(const blk_temp_t level) const; + [[nodiscard]] blk_num_t entry_count() const; + [[nodiscard]] blk_num_t entry_capacity() const; + [[nodiscard]] blk_num_t num_level_entries(const blk_temp_t level) const; [[nodiscard]] blk_num_t entries_needed(const blk_num_t nblks) const { return (nblks - 1) / m_slab_size + 1; } [[nodiscard]] blk_count_t slab_size() const { return m_slab_size; } void refilled(); - [[nodiscard]] blk_cap_t open_session(const uint64_t session_id, const bool fill_entire_cache); + [[nodiscard]] blk_num_t open_session(const uint64_t session_id, const bool fill_entire_cache); void close_session(const uint64_t session_id); [[nodiscard]] SlabMetrics& metrics() { return m_metrics; } @@ -74,11 +74,11 @@ class SlabCacheQueue { blk_count_t get_slab_size() const { return m_slab_size; } private: - blk_count_t m_slab_size; // Slab size in-terms of number of pages + blk_count_t m_slab_size; // Slab size in-terms of number of pages std::vector< std::unique_ptr< folly::MPMCQueue< blk_cache_entry > > > m_level_queues; std::atomic< uint64_t > m_refill_session{0}; // Is a refill pending for this slab - blk_cap_t m_total_capacity{0}; - blk_cap_t m_refill_threshold_limits; // For every level 
whats their threshold limit size + blk_num_t m_total_capacity{0}; + blk_num_t m_refill_threshold_limits; // For every level whats their threshold limit size SlabMetrics m_metrics; }; @@ -90,35 +90,29 @@ class FreeBlkCacheQueue : public FreeBlkCache { FreeBlkCacheQueue& operator=(const FreeBlkCacheQueue&) = delete; FreeBlkCacheQueue& operator=(FreeBlkCacheQueue&&) noexcept = delete; - [[nodiscard]] BlkAllocStatus try_alloc_blks(const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp) override; - [[maybe_unused]] blk_count_t try_free_blks(const blk_cache_entry& entry, - std::vector< blk_cache_entry >& excess_blks) override; - [[maybe_unused]] blk_count_t try_free_blks(const std::vector< blk_cache_entry >& blks, - std::vector< blk_cache_entry >& excess_blks) override; - [[nodiscard]] blk_cap_t try_fill_cache(const blk_cache_fill_req& fill_req, - blk_cache_fill_session& fill_session) override; + BlkAllocStatus try_alloc_blks(const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp) override; + blk_count_t try_free_blks(const blk_cache_entry& entry, std::vector< blk_cache_entry >& excess_blks) override; + blk_count_t try_free_blks(const std::vector< blk_cache_entry >& blks, + std::vector< blk_cache_entry >& excess_blks) override; + blk_num_t try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) override; - [[nodiscard]] blk_cap_t total_free_blks() const override; + blk_num_t total_free_blks() const override; - [[nodiscard]] std::shared_ptr< blk_cache_fill_session > create_cache_fill_session(const bool fill_entire_cache); + std::shared_ptr< blk_cache_fill_session > create_cache_fill_session(const bool fill_entire_cache); void close_cache_fill_session(blk_cache_fill_session& fill_session); private: - [[nodiscard]] BlkAllocStatus break_up(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, - blk_cache_alloc_resp& resp); - [[nodiscard]] BlkAllocStatus merge_down(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, - blk_cache_alloc_resp& resp); - [[nodiscard]] BlkAllocStatus try_alloc_in_slab(const slab_idx_t slab_num, const blk_cache_alloc_req& req, - blk_cache_alloc_resp& resp); - - [[nodiscard]] std::optional< blk_temp_t > push_slab(const slab_idx_t slab_idx, const blk_cache_entry& entry, - const bool only_this_level); - [[nodiscard]] std::optional< blk_temp_t > pop_slab(const slab_idx_t slab_idx, const blk_temp_t level, - const bool only_this_level, blk_cache_entry& out_entry); - - [[nodiscard]] inline SlabMetrics& slab_metrics(const slab_idx_t slab_idx) const { - return m_slab_queues[slab_idx]->metrics(); - } + BlkAllocStatus break_up(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp); + BlkAllocStatus merge_down(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp); + BlkAllocStatus try_alloc_in_slab(const slab_idx_t slab_num, const blk_cache_alloc_req& req, + blk_cache_alloc_resp& resp); + + std::optional< blk_temp_t > push_slab(const slab_idx_t slab_idx, const blk_cache_entry& entry, + const bool only_this_level); + std::optional< blk_temp_t > pop_slab(const slab_idx_t slab_idx, const blk_temp_t level, const bool only_this_level, + blk_cache_entry& out_entry); + + inline SlabMetrics& slab_metrics(const slab_idx_t slab_idx) const { return m_slab_queues[slab_idx]->metrics(); } std::string get_name() { return m_cfg.get_name(); } diff --git a/src/lib/blkalloc/fixed_blk_allocator.cpp b/src/lib/blkalloc/fixed_blk_allocator.cpp index 134c2736f..d922edf03 100644 --- 
a/src/lib/blkalloc/fixed_blk_allocator.cpp +++ b/src/lib/blkalloc/fixed_blk_allocator.cpp @@ -20,7 +20,7 @@ #include "blk_allocator.h" namespace homestore { -FixedBlkAllocator::FixedBlkAllocator(const BlkAllocConfig& cfg, bool init, chunk_num_t chunk_id) : +FixedBlkAllocator::FixedBlkAllocator(BlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id) : BlkAllocator(cfg, chunk_id), m_blk_q{get_total_blks()} { LOGINFO("total blks: {}", get_total_blks()); if (init) { inited(); } @@ -53,45 +53,29 @@ blk_num_t FixedBlkAllocator::init_portion(BlkAllocPortion& portion, blk_num_t st return blk_num; } -bool FixedBlkAllocator::is_blk_alloced(const BlkId& b, bool use_lock) const { return true; } +bool FixedBlkAllocator::is_blk_alloced(BlkId const& b, bool use_lock) const { return true; } -BlkAllocStatus FixedBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkid) { - /* TODO:If it is more then 1 then we need to make sure that we never allocate across the portions. As of now - * we don't support the vector of blkids in fixed blk allocator */ +BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_alloc_hints const&, BlkId& out_blkid) { HS_DBG_ASSERT_EQ(nblks, 1, "FixedBlkAllocator does not support multiple blk allocation yet"); - - BlkId bid; - const auto status = alloc(bid); - if (status == BlkAllocStatus::SUCCESS) { - out_blkid.push_back(bid); - // no need to update real time bm as it is already updated in alloc of single blkid api; - } - return status; + return alloc_contiguous(r_cast< BlkId& >(out_blkid)); } -BlkAllocStatus FixedBlkAllocator::alloc(BlkId& out_blkid) { +BlkAllocStatus FixedBlkAllocator::alloc_contiguous(BlkId& out_blkid) { #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("fixed_blkalloc_no_blks")) { return BlkAllocStatus::SPACE_FULL; } #endif const auto ret = m_blk_q.read(out_blkid); if (ret) { // update real time bitmap; - alloc_on_realtime(out_blkid); + if (realtime_bm_on()) { alloc_on_realtime(out_blkid); } return BlkAllocStatus::SUCCESS; } else { return BlkAllocStatus::SPACE_FULL; } } -void FixedBlkAllocator::free(const std::vector< BlkId >& blk_ids) { - for (const auto& blk_id : blk_ids) { - free(blk_id); - } -} - -void FixedBlkAllocator::free(const BlkId& b) { - HS_DBG_ASSERT_EQ(b.get_nblks(), 1, "Multiple blk free for FixedBlkAllocator? allocated by different allocator?"); +void FixedBlkAllocator::free(BlkId const& b) { + HS_DBG_ASSERT_EQ(b.blk_count(), 1, "Multiple blk free for FixedBlkAllocator? allocated by different allocator?"); // No need to set in cache if it is not recovered. When recovery is complete we copy the disk_bm to cache bm. 
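// Illustrative sketch (not part of this patch; names are made up): with this change the
// fixed allocator hands out exactly one block per call, so the generic alloc() overload
// above asserts nblks == 1 and simply forwards to alloc_contiguous().
inline void example_fixed_alloc(FixedBlkAllocator* fa) {
    BlkId b;
    if (fa->alloc_contiguous(b) == BlkAllocStatus::SUCCESS) {
        // ... use the single block; b.blk_count() is expected to be 1 ...
        fa->free(b);
    }
}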
if (m_inited) { @@ -100,8 +84,8 @@ void FixedBlkAllocator::free(const BlkId& b) { } } -blk_cap_t FixedBlkAllocator::available_blks() const { return m_blk_q.sizeGuess(); } -blk_cap_t FixedBlkAllocator::get_used_blks() const { return get_total_blks() - available_blks(); } +blk_num_t FixedBlkAllocator::available_blks() const { return m_blk_q.sizeGuess(); } +blk_num_t FixedBlkAllocator::get_used_blks() const { return get_total_blks() - available_blks(); } std::string FixedBlkAllocator::to_string() const { return fmt::format("Total Blks={} Available_Blks={}", get_total_blks(), available_blks()); diff --git a/src/lib/blkalloc/varsize_blk_allocator.cpp b/src/lib/blkalloc/varsize_blk_allocator.cpp index 8380621b8..34d2e6dab 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.cpp +++ b/src/lib/blkalloc/varsize_blk_allocator.cpp @@ -33,7 +33,7 @@ SISL_LOGGING_DECL(blkalloc) template <> struct fmt::formatter< std::thread::id > { constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator { return ctx.begin(); } - auto format(const std::thread::id& i, format_context& ctx) const -> format_context::iterator { + auto format(std::thread::id const& i, format_context& ctx) const -> format_context::iterator { return fmt::format_to(ctx.out(), "{}", std::hash< std::thread::id >{}(i)); } }; @@ -50,7 +50,7 @@ std::condition_variable VarsizeBlkAllocator::s_sweeper_cv; std::queue< VarsizeBlkAllocator* > VarsizeBlkAllocator::s_sweeper_queue; std::unordered_set< VarsizeBlkAllocator* > VarsizeBlkAllocator::s_block_allocators; -VarsizeBlkAllocator::VarsizeBlkAllocator(const VarsizeBlkAllocConfig& cfg, bool init, chunk_num_t chunk_id) : +VarsizeBlkAllocator::VarsizeBlkAllocator(VarsizeBlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id) : BlkAllocator{cfg, chunk_id}, m_state{BlkAllocatorState::INIT}, m_cfg{cfg}, @@ -232,27 +232,6 @@ bool VarsizeBlkAllocator::allocator_state_machine() { return active_state; } -bool VarsizeBlkAllocator::is_blk_alloced(const BlkId& b, bool use_lock) const { - if (!m_inited) { return true; } - auto bits_set{[this, &b]() { - // No need to set in cache if it is not recovered. When recovery is complete we copy the disk_bm to cache - // bm. 
- if (!m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_REL_ASSERT(0, "Expected bits to set"); - return false; - } - return true; - }}; - if (use_lock) { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); - auto lock{portion.portion_auto_lock()}; - if (!bits_set()) return false; - } else { - if (!bits_set()) return false; - } - return true; -} - void VarsizeBlkAllocator::inited() { m_cache_bm->copy(*(get_disk_bm_const())); BlkAllocator::inited(); @@ -405,251 +384,376 @@ void VarsizeBlkAllocator::fill_cache_in_portion(blk_num_t portion_num, blk_cache fill_session.session_id, portion_num, fill_session.overall_refilled_num_blks); } -BlkAllocStatus VarsizeBlkAllocator::alloc(BlkId& out_blkid) { - static thread_local std::vector< BlkId > s_ids; - s_ids.clear(); +BlkAllocStatus VarsizeBlkAllocator::alloc_contiguous(BlkId& out_blkid) { + return alloc_contiguous(1, blk_alloc_hints{}, out_blkid); +} - auto const status = alloc(1, blk_alloc_hints{}, s_ids); - if (status == BlkAllocStatus::SUCCESS) { - out_blkid = s_ids[0]; - // we don't update realtime here; - // it is already updated at vector version of alloc; - } +BlkAllocStatus VarsizeBlkAllocator::alloc_contiguous(blk_count_t nblks, blk_alloc_hints const& hints, + BlkId& out_blkid) { + MultiBlkId mbid; + auto const status = alloc(nblks, hints, mbid); + if (status == BlkAllocStatus::SUCCESS) { out_blkid = mbid; } return status; } -BlkAllocStatus VarsizeBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids) { - BLKALLOC_LOG_ASSERT(m_inited, "Alloc before initialized"); - BLKALLOC_LOG_ASSERT_CMP(nblks % hints.multiplier, ==, 0); - BLKALLOC_LOG(TRACE, "nblks={}, hints multiplier={}", nblks, hints.multiplier); +BlkAllocStatus VarsizeBlkAllocator::alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) { + bool use_slabs = m_cfg.m_use_slabs; #ifdef _PRERELEASE - if (hints.error_simulate && iomgr_flip::instance()->test_flip("varsize_blkalloc_no_blks", nblks)) { - return BlkAllocStatus::SPACE_FULL; - } - - if (iomgr_flip::instance()->test_flip("varsize_blkalloc_bypass_cache")) { - blk_count_t num_alllocated{0}; - auto const status = alloc_blks_direct(nblks, hints, out_blkids, num_alllocated); - if (status == BlkAllocStatus::SUCCESS) { - incr_alloced_blk_count(num_alllocated); - return status; - } else { - // NOTE: There is a small chance this can fail if all the blocks have already been allocated - // to slabs. 
So clear any partial and fall through to normal routine below - if (status == BlkAllocStatus::PARTIAL) { - for (const auto& blk_id : out_blkids) { - free_on_bitmap(blk_id); - } - out_blkids.clear(); - } - } - } + if (iomgr_flip::instance()->test_flip("varsize_blkalloc_no_blks", nblks)) { return BlkAllocStatus::SPACE_FULL; } + if (iomgr_flip::instance()->test_flip("varsize_blkalloc_bypass_cache")) { use_slabs = false; } #endif - auto status = BlkAllocStatus::FAILED; - blk_count_t total_allocated{0}; - if (m_cfg.m_use_slabs) { - // Allocate from blk cache - static thread_local blk_cache_alloc_resp s_alloc_resp; - const blk_cache_alloc_req alloc_req{nblks, hints.desired_temp, hints.is_contiguous, - FreeBlkCache::find_slab(hints.multiplier), - FreeBlkCache::find_slab(hints.max_blks_per_entry)}; - COUNTER_INCREMENT(m_metrics, num_alloc, 1); - - auto free_excess_blocks{[this]() { - // put excess blocks back on bitmap - for (const auto& e : s_alloc_resp.excess_blks) { - BLKALLOC_LOG(DEBUG, "Freeing in bitmap of entry={} - excess of alloc_blks size={}", e.to_string(), - s_alloc_resp.excess_blks.size()); - free_on_bitmap(blk_cache_entry_to_blkid(e)); - } - }}; - - auto discard_current_allocation{[this, &free_excess_blocks]() { - if (!s_alloc_resp.out_blks.empty()) { - s_alloc_resp.nblks_zombied = m_fb_cache->try_free_blks(s_alloc_resp.out_blks, s_alloc_resp.excess_blks); - } - free_excess_blocks(); - s_alloc_resp.reset(); - }}; - - s_alloc_resp.reset(); - // retries must be at least two to allow slab refill logic to run - const uint32_t max_retries = - std::max< uint32_t >(HS_DYNAMIC_CONFIG(blkallocator.max_varsize_blk_alloc_attempt), 2); - for (uint32_t retry{0}; (retry < max_retries); ++retry) { - status = m_fb_cache->try_alloc_blks(alloc_req, s_alloc_resp); - if ((status == BlkAllocStatus::SUCCESS) || ((status == BlkAllocStatus::PARTIAL) && !hints.is_contiguous)) { - // If the cache has depleted a bit, kick of sweep thread to fill the cache. - if (s_alloc_resp.need_refill) { request_more_blks(nullptr, false /* fill_entire_cache */); } - BLKALLOC_LOG(TRACE, "Alloced first blk_num={}", s_alloc_resp.out_blks[0].to_string()); - - // Convert the response block cache entries to blkids - blk_cache_entries_to_blkids(s_alloc_resp.out_blks, out_blkids); - total_allocated = s_alloc_resp.nblks_alloced; - break; - } else { - discard_current_allocation(); - if ((retry + 1) < max_retries) { - COUNTER_INCREMENT(m_metrics, num_retries, 1); - auto const min_nblks = std::max< blk_count_t >(m_cfg.highest_slab_blks_count() * 2, nblks); - BLKALLOC_LOG( - DEBUG, - "Failed to allocate {} blks from blk cache, requesting refill at least {} blks and retry={}", - nblks, min_nblks, retry); - request_more_blks_wait(nullptr /* seg */, min_nblks); - } - } - } - free_excess_blocks(); + if (!hints.is_contiguous && !out_blkid.is_multi()) { + HS_DBG_ASSERT(false, "Invalid Input: Non contiguous allocation needs MultiBlkId to store"); + return BlkAllocStatus::INVALID_INPUT; } - if (hints.is_contiguous) { - // failed to allocate in slab try direct. 
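// Illustrative summary of how blk_alloc_hints drives the rewritten alloc() in this hunk
// (sketch only, not part of this patch; the allocator pointer, function name and block
// counts are made up). is_contiguous and partial_alloc_ok are the hint fields used by the
// new code; all other hint defaults are assumed.
inline void example_hints(VarsizeBlkAllocator* va) {
    blk_alloc_hints hints;

    // Contiguous request: the result is a single piece, so a plain BlkId suffices.
    hints.is_contiguous = true;
    hints.partial_alloc_ok = false;
    BlkId one_piece;
    auto const st1 = va->alloc_contiguous(16, hints, one_piece); // expect SUCCESS or FAILED

    // Non-contiguous request: multiple pieces are allowed, so a MultiBlkId must be passed
    // (a plain BlkId is rejected with INVALID_INPUT). With partial_alloc_ok the caller
    // accepts fewer blocks than requested instead of a hard failure.
    hints.is_contiguous = false;
    hints.partial_alloc_ok = true;
    MultiBlkId pieces;
    auto const st2 = va->alloc(64, hints, pieces); // SUCCESS, PARTIAL or SPACE_FULL
    (void)st1;
    (void)st2;
}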
- if (status != BlkAllocStatus::SUCCESS) { - blk_count_t num_allocated{0}; - status = alloc_blks_direct(nblks, hints, out_blkids, num_allocated); - if (status == BlkAllocStatus::SUCCESS) { - total_allocated += num_allocated; - BLKALLOC_LOG(TRACE, "Alloced blk_num={} directly", out_blkids.back().to_string()); - } - } - } else { - if (status != BlkAllocStatus::SUCCESS) { - // try to allocate remainder - const blk_count_t nblks_remaining = static_cast< blk_count_t >(nblks - total_allocated); - BLKALLOC_LOG(DEBUG, "nblks={} failed to alloc all from fb cache, trying to alloc rest from bitset directly", - nblks_remaining); - blk_count_t num_allocated{0}; - auto status2 = alloc_blks_direct(nblks_remaining, hints, out_blkids, num_allocated); - if ((status2 == BlkAllocStatus::SUCCESS) || (status2 == BlkAllocStatus::PARTIAL)) { - total_allocated += num_allocated; - BLKALLOC_LOG(TRACE, "Alloced additional blk_num={} directly", out_blkids.back().to_string()); - } else { - // failure to get more is really partial if we have some - BLKALLOC_LOG(TRACE, "Failed to alloc additional blks directly with code {}", status2); - if (status == BlkAllocStatus::PARTIAL) status2 = BlkAllocStatus::PARTIAL; - } - status = status2; + MultiBlkId tmp_blkid; + MultiBlkId& out_mbid = out_blkid.is_multi() ? r_cast< MultiBlkId& >(out_blkid) : tmp_blkid; + BlkAllocStatus status; + blk_count_t num_allocated{0}; + blk_count_t nblks_remain; + + if (use_slabs && (nblks <= m_cfg.highest_slab_blks_count())) { + num_allocated = alloc_blks_slab(nblks, hints, out_mbid); + if (num_allocated >= nblks) { + status = BlkAllocStatus::SUCCESS; + goto out; } + // Fall through to alloc_blks_direct } - switch (status) { - case BlkAllocStatus::FAILED: - case BlkAllocStatus::SPACE_FULL: - COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - BLKALLOC_LOG(ERROR, "nblks={} failed to alloc any number of blocks", nblks); - break; - case BlkAllocStatus::PARTIAL: - COUNTER_INCREMENT(m_metrics, num_alloc_partial, 1); - BLKALLOC_LOG(DEBUG, "nblks={} allocated={} partial allocation", nblks, total_allocated); - break; - case BlkAllocStatus::SUCCESS: - break; - default: - BLKALLOC_LOG(ERROR, "Unexpected status", status); + nblks_remain = nblks - num_allocated; + num_allocated += alloc_blks_direct(nblks_remain, hints, out_mbid); + if (num_allocated == nblks) { + status = BlkAllocStatus::SUCCESS; + BLKALLOC_LOG(TRACE, "Alloced blks [{}] directly", out_mbid.to_string()); + } else if ((num_allocated != 0) && hints.partial_alloc_ok) { + status = BlkAllocStatus::PARTIAL; + } else { + free_blks_direct(out_mbid); + status = hints.is_contiguous ? 
BlkAllocStatus::FAILED : BlkAllocStatus::SPACE_FULL; } +out: if ((status == BlkAllocStatus::SUCCESS) || (status == BlkAllocStatus::PARTIAL)) { - incr_alloced_blk_count(total_allocated); + incr_alloced_blk_count(num_allocated); // update real time bitmap - for (const auto& b : out_blkids) { - alloc_on_realtime(b); - } + if (realtime_bm_on()) { alloc_on_realtime(out_mbid); } #ifdef _PRERELEASE - alloc_sanity_check(total_allocated, hints, out_blkids); + alloc_sanity_check(num_allocated, hints, out_mbid); #endif } + if (!out_blkid.is_multi()) { out_blkid = out_mbid.to_single_blkid(); } return status; } -void VarsizeBlkAllocator::free(const std::vector< BlkId >& blk_ids) { - for (const auto& blk_id : blk_ids) { - free(blk_id); +BlkAllocStatus VarsizeBlkAllocator::alloc(blk_count_t nblks, blk_alloc_hints const& hints, + std::vector< BlkId >& out_blkids) { + // Regular alloc blks will allocate in MultiBlkId, but there is an upper limit on how many it can accomodate in a + // single MultiBlkId, if caller is ok to generate multiple MultiBlkids, this method is called. + auto h = hints; + h.partial_alloc_ok = true; + blk_count_t nblks_remain = nblks; + BlkAllocStatus status; + + do { + MultiBlkId mbid; + status = alloc(nblks_remain, h, mbid); + if ((status != BlkAllocStatus::SUCCESS) && (status != BlkAllocStatus::PARTIAL)) { break; } + + blk_count_t nblks_this_iter{0}; + auto it = mbid.iterate(); + while (auto const bid = it.next()) { + out_blkids.push_back(*bid); + nblks_this_iter += bid->blk_count(); + } + + if (status == BlkAllocStatus::SUCCESS) { + HS_DBG_ASSERT_GE(nblks_this_iter, nblks_remain, + "alloc_blks returned success, but return id doesn't have reqd blks"); + break; + } + + if (nblks_this_iter >= nblks_remain) { + HS_DBG_ASSERT(false, "alloc_blks returns partial, while it has fully allocated reqd blks"); + status = BlkAllocStatus::SUCCESS; + break; + } + nblks_remain -= nblks_this_iter; + } while (nblks_remain); + + return status; +} + +blk_count_t VarsizeBlkAllocator::alloc_blks_slab(blk_count_t nblks, blk_alloc_hints const& hints, + MultiBlkId& out_blkid) { + blk_count_t num_allocated{0}; + + // Allocate from blk cache + static thread_local blk_cache_alloc_resp s_alloc_resp; + const blk_cache_alloc_req alloc_req{nblks, hints.desired_temp, hints.is_contiguous, + FreeBlkCache::find_slab(hints.min_blks_per_piece), + s_cast< slab_idx_t >(m_cfg.get_slab_cnt() - 1)}; + COUNTER_INCREMENT(m_metrics, num_alloc, 1); + + auto free_excess_blocks = [this]() { + // put excess blocks back on bitmap + for (auto const& e : s_alloc_resp.excess_blks) { + BLKALLOC_LOG(DEBUG, "Freeing in bitmap of entry={} - excess of alloc_blks size={}", e.to_string(), + s_alloc_resp.excess_blks.size()); + free_blks_direct(MultiBlkId{blk_cache_entry_to_blkid(e)}); + } + }; + + auto discard_current_allocation = [this, &free_excess_blocks]() { + if (!s_alloc_resp.out_blks.empty()) { + s_alloc_resp.nblks_zombied = m_fb_cache->try_free_blks(s_alloc_resp.out_blks, s_alloc_resp.excess_blks); + } + free_excess_blocks(); + s_alloc_resp.reset(); + }; + + s_alloc_resp.reset(); + // retries must be at least two to allow slab refill logic to run + const uint32_t max_retries = std::max< uint32_t >(HS_DYNAMIC_CONFIG(blkallocator.max_varsize_blk_alloc_attempt), 2); + for (uint32_t retry{0}; ((retry < max_retries) && out_blkid.has_room()); ++retry) { + auto status = m_fb_cache->try_alloc_blks(alloc_req, s_alloc_resp); + + // If the blk allocation is only partially completed, then we are ok in proceeding further for cases where + // 
caller does not want a contiguous allocation. In that case, return these partial results and then caller will + // use direct allocation to allocate remaining blks. In case where caller is also ok with partial allocation, + // then it doesn't matter if request is for contiguous allocation or not, we can return the partial results. + if ((status == BlkAllocStatus::SUCCESS) || + ((status == BlkAllocStatus::PARTIAL) && (hints.partial_alloc_ok || !hints.is_contiguous))) { + // If the cache has depleted a bit, kick of sweep thread to fill the cache. + if (s_alloc_resp.need_refill) { request_more_blks(nullptr, false /* fill_entire_cache */); } + BLKALLOC_LOG(TRACE, "Alloced first blk_num={}", s_alloc_resp.out_blks[0].to_string()); + + // Convert the response block cache entries to blkids + for (size_t piece{0}; piece < s_alloc_resp.out_blks.size(); ++piece) { + auto& e = s_alloc_resp.out_blks[piece]; + if (out_blkid.has_room()) { + out_blkid.add(e.get_blk_num(), e.blk_count(), m_chunk_id); + num_allocated += e.blk_count(); + } else { + // We are not able to put all of the response to out_blkid, because it doesn't have room, + // If caller is ok with partial allocation, we can free remaining entry and send partial result. + // If caller is not ok with partial allocation, we should discard entire allocation and retry + if (hints.partial_alloc_ok) { + s_alloc_resp.excess_blks.insert(s_alloc_resp.excess_blks.end(), + s_alloc_resp.out_blks.begin() + piece, + s_alloc_resp.out_blks.end()); + } else { + num_allocated = 0; + out_blkid = MultiBlkId{}; + status = BlkAllocStatus::TOO_MANY_PIECES; + } + break; + } + } + + if (status != BlkAllocStatus::TOO_MANY_PIECES) { break; } + } + + discard_current_allocation(); + if ((retry + 1) < max_retries) { + COUNTER_INCREMENT(m_metrics, num_retries, 1); + auto const min_nblks = std::max< blk_count_t >(m_cfg.highest_slab_blks_count() * 2, nblks); + BLKALLOC_LOG(DEBUG, + "Failed to allocate {} blks from blk cache, requesting refill at least {} blks " + "and retry={}", + nblks, min_nblks, retry); + request_more_blks_wait(nullptr /* seg */, min_nblks); + } } + + free_excess_blocks(); + + return num_allocated; +} + +blk_count_t VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, blk_alloc_hints const& hints, + MultiBlkId& out_blkid) { + // Search all segments starting with some random portion num within each segment + static thread_local std::random_device rd{}; + static thread_local std::default_random_engine re{rd()}; + + if (m_start_portion_num == INVALID_PORTION_NUM) { m_start_portion_num = m_rand_portion_num_generator(re); } + + auto portion_num = m_start_portion_num; + auto const max_pieces = hints.is_contiguous ? 1u : MultiBlkId::max_pieces; + + blk_count_t const min_blks = hints.is_contiguous ? 
nblks : std::min< blk_count_t >(nblks, hints.min_blks_per_piece); + blk_count_t nblks_remain = nblks; + do { + BlkAllocPortion& portion = get_blk_portion(portion_num); + auto cur_blk_id = portion_num * get_blks_per_portion(); + auto const end_blk_id = cur_blk_id + get_blks_per_portion() - 1; + { + auto lock{portion.portion_auto_lock()}; + while (nblks_remain && (cur_blk_id <= end_blk_id) && portion.get_available_blocks() && + out_blkid.has_room()) { + // Get next reset bits and insert to cache and then reset those bits + auto const b = m_cache_bm->get_next_contiguous_n_reset_bits( + cur_blk_id, end_blk_id, std::min(min_blks, nblks_remain), nblks_remain); + if (b.nbits == 0) { break; } + HS_DBG_ASSERT_GE(end_blk_id, b.start_bit, "Expected start bit to be smaller than end bit"); + HS_DBG_ASSERT_LE(b.nbits, nblks_remain); + HS_DBG_ASSERT_GE(b.nbits, std::min(min_blks, nblks_remain)); + HS_DBG_ASSERT_GE(end_blk_id, (b.start_bit + b.nbits - 1), + "Expected end bit to be smaller than portion end bit"); + + nblks_remain -= b.nbits; + out_blkid.add(b.start_bit, b.nbits, m_chunk_id); + + BLKALLOC_LOG(DEBUG, "Allocated directly from portion={} nnblks={} Blk_num={} nblks={} set_bit_count={}", + portion_num, nblks, b.start_bit, b.nbits, get_alloced_blk_count()); + + // Set the bitmap indicating the blocks are allocated + m_cache_bm->set_bits(b.start_bit, b.nbits); + if (portion.decrease_available_blocks(b.nbits) == 0) break; + cur_blk_id = b.start_bit + b.nbits; + } + } + if (++portion_num == get_num_portions()) { portion_num = 0; } + BLKALLOC_LOG(TRACE, "alloc direct unable to find in prev portion, searching in portion={}, start_portion={}", + portion_num, m_start_portion_num); + } while (nblks_remain && (portion_num != m_start_portion_num) && !hints.is_contiguous && out_blkid.has_room()); + + // save which portion we were at for next allocation; + m_start_portion_num = portion_num; + + COUNTER_INCREMENT(m_metrics, num_blks_alloc_direct, 1); + return (nblks - nblks_remain); } -void VarsizeBlkAllocator::free(const BlkId& b) { +void VarsizeBlkAllocator::free(BlkId const& bid) { if (!m_inited) { - BLKALLOC_LOG(DEBUG, "Free not required for blk num = {}", b.get_blk_num()); + BLKALLOC_LOG(DEBUG, "Free not required for blk num = {}", bid.blk_num()); return; } - if (m_cfg.m_use_slabs) { - static thread_local std::vector< blk_cache_entry > excess_blks; - excess_blks.clear(); + blk_count_t n_freed = (m_cfg.m_use_slabs && (bid.blk_count() <= m_cfg.highest_slab_blks_count())) + ? 
free_blks_slab(r_cast< MultiBlkId const& >(bid)) + : free_blks_direct(r_cast< MultiBlkId const& >(bid)); + decr_alloced_blk_count(n_freed); + BLKALLOC_LOG(TRACE, "Freed blk_num={}", bid.to_string()); +} + +blk_count_t VarsizeBlkAllocator::free_blks_slab(MultiBlkId const& bid) { + static thread_local std::vector< blk_cache_entry > excess_blks; + excess_blks.clear(); - [[maybe_unused]] const blk_count_t num_zombied{ - m_fb_cache->try_free_blks(blkid_to_blk_cache_entry(b, 2), excess_blks)}; + auto const do_free = [this](BlkId const& b) { + m_fb_cache->try_free_blks(blkid_to_blk_cache_entry(b, 2), excess_blks); + return b.blk_count(); + }; - for (const auto& e : excess_blks) { - BLKALLOC_LOG(TRACE, "Freeing in bitmap of entry={} - excess of free_blks size={}", e.to_string(), - excess_blks.size()); - free_on_bitmap(blk_cache_entry_to_blkid(e)); + blk_count_t n_freed{0}; + if (bid.is_multi()) { + auto it = bid.iterate(); + while (auto const b = it.next()) { + n_freed += do_free(*b); } } else { - // free directly on bitmap - free_on_bitmap(b); + n_freed += do_free(bid); } - decr_alloced_blk_count(b.get_nblks()); - BLKALLOC_LOG(TRACE, "Freed blk_num={}", blkid_to_blk_cache_entry(b).to_string()); + for (auto const& e : excess_blks) { + BLKALLOC_LOG(TRACE, "Freeing in bitmap of entry={} - excess of free_blks size={}", e.to_string(), + excess_blks.size()); + free_blks_direct(MultiBlkId{blk_cache_entry_to_blkid(e)}); + } + return n_freed; } -blk_cap_t VarsizeBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } -blk_cap_t VarsizeBlkAllocator::get_used_blks() const { return get_alloced_blk_count(); } - -void VarsizeBlkAllocator::free_on_bitmap(const BlkId& b) { - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto const start_blk_id = portion.get_portion_num() * get_blks_per_portion(); - auto const end_blk_id = start_blk_id + get_blks_per_portion() - 1; - auto lock{portion.portion_auto_lock()}; - HS_DBG_ASSERT_LE(start_blk_id, b.get_blk_num(), "Expected start bit to be greater than portion start bit"); - HS_DBG_ASSERT_GE(end_blk_id, (b.get_blk_num() + b.get_nblks() - 1), - "Expected end bit to be smaller than portion end bit"); - BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks()), "Expected bits to be set"); - m_cache_bm->reset_bits(b.get_blk_num(), b.get_nblks()); - portion.increase_available_blocks(b.get_nblks()); +blk_count_t VarsizeBlkAllocator::free_blks_direct(MultiBlkId const& bid) { + auto const do_free = [this](BlkId const& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto const start_blk_id = portion.get_portion_num() * get_blks_per_portion(); + auto const end_blk_id = start_blk_id + get_blks_per_portion() - 1; + auto lock{portion.portion_auto_lock()}; + HS_DBG_ASSERT_LE(start_blk_id, b.blk_num(), "Expected start bit to be greater than portion start bit"); + HS_DBG_ASSERT_GE(end_blk_id, (b.blk_num() + b.blk_count() - 1), + "Expected end bit to be smaller than portion end bit"); + BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b.blk_num(), b.blk_count()), "Expected bits to be set"); + m_cache_bm->reset_bits(b.blk_num(), b.blk_count()); + portion.increase_available_blocks(b.blk_count()); + } + BLKALLOC_LOG(TRACE, "Freeing directly to portion={} blkid={} set_bits_count={}", + blknum_to_portion_num(b.blk_num()), b.to_string(), get_alloced_blk_count()); + return b.blk_count(); + }; + + blk_count_t n_freed{0}; + if (bid.is_multi()) { + auto it = bid.iterate(); + while (auto const b = it.next()) { + 
n_freed += do_free(*b); + } + } else { + n_freed += do_free(bid); } - BLKALLOC_LOG(TRACE, "Freeing directly to portion={} blkid={} set_bits_count={}", - blknum_to_portion_num(b.get_blk_num()), b.to_string(), get_alloced_blk_count()); + return n_freed; } -#ifdef _PRERELEASE -bool VarsizeBlkAllocator::is_set_on_bitmap(const BlkId& b) const { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); - { - // No need to set in cache if it is not recovered. When recovery is complete we copy the disk_bm to cache bm. - auto lock{portion.portion_auto_lock()}; - return m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks()); +bool VarsizeBlkAllocator::is_blk_alloced(BlkId const& bid, bool use_lock) const { + if (!m_inited) { return true; } + + auto check_bits_set = [this](BlkId const& b, bool use_lock) { + if (use_lock) { + BlkAllocPortion const& portion = blknum_to_portion_const(b.blk_num()); + auto lock{portion.portion_auto_lock()}; + return m_cache_bm->is_bits_set(b.blk_num(), b.blk_count()); + } else { + return m_cache_bm->is_bits_set(b.blk_num(), b.blk_count()); + } + }; + + bool ret; + if (bid.is_multi()) { + auto& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + ret = check_bits_set(*b, use_lock); + if (!ret) { break; } + } + } else { + ret = check_bits_set(bid, use_lock); } + return ret; } -void VarsizeBlkAllocator::alloc_sanity_check(blk_count_t nblks, const blk_alloc_hints& hints, - const std::vector< BlkId >& out_blkids) const { +blk_num_t VarsizeBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } +blk_num_t VarsizeBlkAllocator::get_used_blks() const { return get_alloced_blk_count(); } + +#ifdef _PRERELEASE +void VarsizeBlkAllocator::alloc_sanity_check(blk_count_t nblks, blk_alloc_hints const& hints, + MultiBlkId const& out_blkid) const { if (HS_DYNAMIC_CONFIG(generic.sanity_check_level)) { blk_count_t alloced_nblks{0}; - for (const auto& b : out_blkids) { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); + auto it = out_blkid.iterate(); + while (auto const b = it.next()) { + BlkAllocPortion const& portion = blknum_to_portion_const(b->blk_num()); auto lock{portion.portion_auto_lock()}; - BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks()), - "Expected blkid={} to be already set in cache bitmap", b.to_string()); + BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b->blk_num(), b->blk_count()), + "Expected blkid={} to be already set in cache bitmap", b->to_string()); if (get_disk_bm_const()) { - BLKALLOC_REL_ASSERT(!is_blk_alloced_on_disk(b), "Expected blkid={} to be already free in disk bitmap", - b.to_string()); + BLKALLOC_REL_ASSERT(!is_blk_alloced_on_disk(*b), "Expected blkid={} to be already free in disk bitmap", + b->to_string()); } - alloced_nblks += b.get_nblks(); + alloced_nblks += b->blk_count(); } BLKALLOC_REL_ASSERT((nblks == alloced_nblks), "Requested blks={} alloced_blks={} num_pieces={}", nblks, - alloced_nblks, out_blkids.size()); - BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkids.size() == 1)), + alloced_nblks, out_blkid.num_pieces()); + BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkid.num_pieces() == 1)), "Multiple blkids allocated for contiguous request"); } } @@ -703,70 +807,6 @@ void VarsizeBlkAllocator::request_more_blks_wait(BlkAllocSegment* seg, blk_count } } -BlkAllocStatus VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids, 
blk_count_t& num_allocated) { - // Search all segments starting with some random portion num within each segment - static thread_local std::random_device rd{}; - static thread_local std::default_random_engine re{rd()}; - - if (m_start_portion_num == INVALID_PORTION_NUM) { m_start_portion_num = m_rand_portion_num_generator(re); } - - auto portion_num = m_start_portion_num; - blk_count_t const min_blks = hints.is_contiguous ? nblks : std::min< blk_count_t >(nblks, hints.multiplier); - blk_count_t nblks_remain = nblks; - do { - BlkAllocPortion& portion = get_blk_portion(portion_num); - auto cur_blk_id = portion_num * get_blks_per_portion(); - auto const end_blk_id = cur_blk_id + get_blks_per_portion() - 1; - { - auto lock{portion.portion_auto_lock()}; - while (nblks_remain && (cur_blk_id <= end_blk_id) && (portion.get_available_blocks() > 0)) { - // Get next reset bits and insert to cache and then reset those bits - auto const b = m_cache_bm->get_next_contiguous_n_reset_bits( - cur_blk_id, end_blk_id, std::min(min_blks, nblks_remain), nblks_remain); - if (b.nbits == 0) { break; } - HS_DBG_ASSERT_GE(end_blk_id, b.start_bit, "Expected start bit to be smaller than end bit"); - HS_DBG_ASSERT_LE(b.nbits, nblks_remain); - HS_DBG_ASSERT_GE(b.nbits, std::min(min_blks, nblks_remain)); - HS_DBG_ASSERT_GE(end_blk_id, (b.start_bit + b.nbits - 1), - "Expected end bit to be smaller than portion end bit"); - - nblks_remain -= b.nbits; - out_blkids.emplace_back(b.start_bit, b.nbits, m_chunk_id); - - BLKALLOC_LOG(DEBUG, "Allocated directly from portion={} nnblks={} Blk_num={} nblks={} set_bit_count={}", - portion_num, nblks, b.start_bit, b.nbits, get_alloced_blk_count()); - - // Set the bitmap indicating the blocks are allocated - m_cache_bm->set_bits(b.start_bit, b.nbits); - if (portion.decrease_available_blocks(b.nbits) == 0) break; - cur_blk_id = b.start_bit + b.nbits; - } - } - if (++portion_num == get_num_portions()) { portion_num = 0; } - BLKALLOC_LOG(TRACE, "alloc direct unable to find in prev portion, searching in portion={}, start_portion={}", - portion_num, m_start_portion_num); - } while ((nblks_remain > 0) && (portion_num != m_start_portion_num) && !hints.is_contiguous); - - // save which portion we were at for next allocation; - m_start_portion_num = portion_num; - - COUNTER_INCREMENT(m_metrics, num_blks_alloc_direct, 1); - num_allocated = nblks - nblks_remain; - if (nblks_remain > 0) { - if (nblks_remain == nblks) { - // allocated no blocks. NOTE: if contiguous we may or may not be full. Don't really know without - // searching for a single free block - return hints.is_contiguous ? BlkAllocStatus::FAILED : BlkAllocStatus::SPACE_FULL; - } else { - // allocated some blocks - return BlkAllocStatus::PARTIAL; - } - } - - return BlkAllocStatus::SUCCESS; -} - /* This method assumes that mutex to protect state is already taken. 
*/ bool VarsizeBlkAllocator::prepare_sweep(BlkAllocSegment* seg, bool fill_entire_cache) { m_sweep_segment = seg; @@ -780,19 +820,29 @@ bool VarsizeBlkAllocator::prepare_sweep(BlkAllocSegment* seg, bool fill_entire_c } } -void VarsizeBlkAllocator::blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, - std::vector< BlkId >& out_blkids) { - for (const auto& e : entries) { - out_blkids.emplace_back(e.get_blk_num(), e.get_nblks(), m_chunk_id); +#if 0 +blk_num_t VarsizeBlkAllocator::blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, + MultiBlkId& out_blkid) { + uint32_t num_added{0}; + for (auto const& e : entries) { + if (out_blkid.has_room()) { + out_blkid.add(e.get_blk_num(), e.blk_count(), m_chunk_id); + ++num_added; + } else { + break; + } } + + return num_added; } +#endif -BlkId VarsizeBlkAllocator::blk_cache_entry_to_blkid(const blk_cache_entry& e) { - return BlkId{e.get_blk_num(), e.get_nblks(), m_chunk_id}; +BlkId VarsizeBlkAllocator::blk_cache_entry_to_blkid(blk_cache_entry const& e) { + return BlkId{e.get_blk_num(), e.blk_count(), m_chunk_id}; } -blk_cache_entry VarsizeBlkAllocator::blkid_to_blk_cache_entry(const BlkId& bid, blk_temp_t preferred_level) { - return blk_cache_entry{bid.get_blk_num(), bid.get_nblks(), preferred_level}; +blk_cache_entry VarsizeBlkAllocator::blkid_to_blk_cache_entry(BlkId const& bid, blk_temp_t preferred_level) { + return blk_cache_entry{bid.blk_num(), bid.blk_count(), preferred_level}; } std::string VarsizeBlkAllocator::to_string() const { diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 7544fac55..7e23597fd 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -45,27 +45,27 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { public: const uint32_t m_phys_page_size; const seg_num_t m_nsegments; - const blk_cap_t m_blks_per_temp_group; - blk_cap_t m_max_cache_blks; + const blk_num_t m_blks_per_temp_group; + blk_num_t m_max_cache_blks; SlabCacheConfig m_slab_config; const bool m_use_slabs{true}; // use sweeping thread pool with slabs in variable size block allocator public: VarsizeBlkAllocConfig() : VarsizeBlkAllocConfig{0, 0, 0, 0, ""} {} - VarsizeBlkAllocConfig(const std::string& name) : VarsizeBlkAllocConfig{0, 0, 0, 0, name} {} + VarsizeBlkAllocConfig(std::string const& name) : VarsizeBlkAllocConfig{0, 0, 0, 0, name} {} VarsizeBlkAllocConfig(uint32_t blk_size, uint32_t ppage_sz, uint32_t align_sz, uint64_t size, - const std::string& name, bool realtime_bm_on = true, bool use_slabs = true) : + std::string const& name, bool realtime_bm_on = true, bool use_slabs = true) : BlkAllocConfig{blk_size, align_sz, size, name, realtime_bm_on}, m_phys_page_size{ppage_sz}, m_nsegments{HS_DYNAMIC_CONFIG(blkallocator.max_segments)}, m_blks_per_temp_group{m_capacity / HS_DYNAMIC_CONFIG(blkallocator.num_blk_temperatures)}, m_use_slabs{use_slabs} { // Initialize the max cache blks as minimum dictated by the number of blks or memory limits whichever is lower - const blk_cap_t size_by_count{static_cast< blk_cap_t >( + const blk_num_t size_by_count{static_cast< blk_num_t >( std::trunc(HS_DYNAMIC_CONFIG(blkallocator.free_blk_cache_count_by_vdev_percent) * m_capacity / 100.0))}; - const blk_cap_t size_by_mem{ - static_cast< blk_cap_t >(std::trunc(HS_DYNAMIC_CONFIG(blkallocator.max_free_blk_cache_memory_percent) * + const blk_num_t size_by_mem{ + static_cast< blk_num_t 
>(std::trunc(HS_DYNAMIC_CONFIG(blkallocator.max_free_blk_cache_memory_percent) * HS_STATIC_CONFIG(input.app_mem_size) / 100.0))}; m_max_cache_blks = std::min(size_by_count, size_by_mem); @@ -81,11 +81,11 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { const auto num_temp_slab_pct{(100.0 - reuse_pct) / static_cast< double >(num_temp)}; m_slab_config.m_name = name; - for (const auto& pct : HS_DYNAMIC_CONFIG(blkallocator.free_blk_slab_distribution)) { + for (auto const& pct : HS_DYNAMIC_CONFIG(blkallocator.free_blk_slab_distribution)) { cum_pct += pct; SlabCacheConfig::_slab_config s_cfg; s_cfg.slab_size = static_cast< blk_count_t >(1) << slab_idx; - s_cfg.max_entries = static_cast< blk_cap_t >((m_max_cache_blks / s_cfg.slab_size) * (pct / 100.0)); + s_cfg.max_entries = static_cast< blk_num_t >((m_max_cache_blks / s_cfg.slab_size) * (pct / 100.0)); s_cfg.m_name = name; s_cfg.refill_threshold_pct = HS_DYNAMIC_CONFIG(blkallocator.free_blk_cache_refill_threshold_pct); @@ -108,9 +108,9 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { } } - VarsizeBlkAllocConfig(const VarsizeBlkAllocConfig& other) = default; + VarsizeBlkAllocConfig(VarsizeBlkAllocConfig const& other) = default; VarsizeBlkAllocConfig(VarsizeBlkAllocConfig&&) noexcept = delete; - VarsizeBlkAllocConfig& operator=(const VarsizeBlkAllocConfig&) = delete; + VarsizeBlkAllocConfig& operator=(VarsizeBlkAllocConfig const&) = delete; VarsizeBlkAllocConfig& operator=(VarsizeBlkAllocConfig&&) noexcept = delete; virtual ~VarsizeBlkAllocConfig() override = default; @@ -119,20 +119,20 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { //////////// Segments related getters/setters ///////////// seg_num_t get_total_segments() const { return m_nsegments; } - blk_cap_t get_blks_per_segment() const { return (m_capacity / m_nsegments); } + blk_num_t get_blks_per_segment() const { return (m_capacity / m_nsegments); } //////////// Blks related getters/setters ///////////// - blk_cap_t get_max_cache_blks() const { return m_max_cache_blks; } - blk_cap_t get_blks_per_temp_group() const { return m_blks_per_temp_group; } - blk_cap_t get_blks_per_phys_page() const { return m_phys_page_size / m_blk_size; } + blk_num_t get_max_cache_blks() const { return m_max_cache_blks; } + blk_num_t get_blks_per_temp_group() const { return m_blks_per_temp_group; } + blk_num_t get_blks_per_phys_page() const { return m_phys_page_size / m_blk_size; } //////////// Slab related getters/setters ///////////// - slab_idx_t get_slab_cnt() const { return m_slab_config.m_per_slab_cfg.size(); } + slab_idx_t get_slab_cnt() const { return s_cast< slab_idx_t >(m_slab_config.m_per_slab_cfg.size()); } blk_count_t get_slab_block_count(const slab_idx_t index) { return m_slab_config.m_per_slab_cfg[index].slab_size; } - blk_cap_t get_slab_capacity(const slab_idx_t slab_idx) const { + blk_num_t get_slab_capacity(const slab_idx_t slab_idx) const { return m_slab_config.m_per_slab_cfg[slab_idx].max_entries; } - blk_cap_t highest_slab_blks_count() const { + blk_num_t highest_slab_blks_count() const { const slab_idx_t index{get_slab_cnt()}; return (index > 0) ? 
m_slab_config.m_per_slab_cfg[index - 1].slab_size : 0; } @@ -151,12 +151,12 @@ class BlkAllocSegment { blk_num_t m_alloc_clock_hand; public: - BlkAllocSegment(const seg_num_t seg_num, const blk_num_t nportions, const std::string& seg_name) : + BlkAllocSegment(const seg_num_t seg_num, const blk_num_t nportions, std::string const& seg_name) : m_total_portions{nportions}, m_seg_num{seg_num}, m_alloc_clock_hand{0} {} - BlkAllocSegment(const BlkAllocSegment&) = delete; + BlkAllocSegment(BlkAllocSegment const&) = delete; BlkAllocSegment(BlkAllocSegment&&) noexcept = delete; - BlkAllocSegment& operator=(const BlkAllocSegment&) = delete; + BlkAllocSegment& operator=(BlkAllocSegment const&) = delete; BlkAllocSegment& operator=(BlkAllocSegment&&) noexcept = delete; virtual ~BlkAllocSegment() {} @@ -185,9 +185,9 @@ class BlkAllocMetrics : public sisl::MetricsGroup { register_me_to_farm(); } - BlkAllocMetrics(const BlkAllocMetrics&) = delete; + BlkAllocMetrics(BlkAllocMetrics const&) = delete; BlkAllocMetrics(BlkAllocMetrics&&) noexcept = delete; - BlkAllocMetrics& operator=(const BlkAllocMetrics&) = delete; + BlkAllocMetrics& operator=(BlkAllocMetrics const&) = delete; BlkAllocMetrics& operator=(BlkAllocMetrics&&) noexcept = delete; ~BlkAllocMetrics() { deregister_me_from_farm(); } }; @@ -201,24 +201,23 @@ class BlkAllocMetrics : public sisl::MetricsGroup { */ class VarsizeBlkAllocator : public BlkAllocator { public: - VarsizeBlkAllocator(const VarsizeBlkAllocConfig& cfg, bool init, chunk_num_t chunk_id); - VarsizeBlkAllocator(const VarsizeBlkAllocator&) = delete; + VarsizeBlkAllocator(VarsizeBlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id); + VarsizeBlkAllocator(VarsizeBlkAllocator const&) = delete; VarsizeBlkAllocator(VarsizeBlkAllocator&&) noexcept = delete; - VarsizeBlkAllocator& operator=(const VarsizeBlkAllocator&) = delete; + VarsizeBlkAllocator& operator=(VarsizeBlkAllocator const&) = delete; VarsizeBlkAllocator& operator=(VarsizeBlkAllocator&&) noexcept = delete; virtual ~VarsizeBlkAllocator() override; - BlkAllocStatus alloc(BlkId& bid) override; - BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) override; - void free(const std::vector< BlkId >& blk_ids) override; - void free(const BlkId& b) override; + BlkAllocStatus alloc_contiguous(BlkId& bid) override; + BlkAllocStatus alloc_contiguous(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid); + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, std::vector< BlkId >& out_blkids); + void free(BlkId const& blk_id) override; void inited() override; - BlkAllocStatus alloc_blks_direct(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkids, - blk_count_t& num_allocated); - blk_cap_t available_blks() const override; - blk_cap_t get_used_blks() const override; - bool is_blk_alloced(const BlkId& in_bid, bool use_lock = false) const override; + blk_num_t available_blks() const override; + blk_num_t get_used_blks() const override; + bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; nlohmann::json get_metrics_in_json(); @@ -256,17 +255,20 @@ class VarsizeBlkAllocator : public BlkAllocator { // TODO: this fields needs to be passed in from hints and persisted in volume's sb; blk_num_t m_start_portion_num{INVALID_PORTION_NUM}; - blk_cap_t m_blks_per_seg{1}; + 
blk_num_t m_blks_per_seg{1}; blk_num_t m_portions_per_seg{1}; private: static void sweeper_thread(size_t thread_num); bool allocator_state_machine(); + blk_count_t alloc_blks_slab(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid); + blk_count_t alloc_blks_direct(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkids); + blk_count_t free_blks_slab(MultiBlkId const& b); + blk_count_t free_blks_direct(MultiBlkId const& b); + #ifdef _PRERELEASE - bool is_set_on_bitmap(const BlkId& b) const; - void alloc_sanity_check(blk_count_t nblks, const blk_alloc_hints& hints, - const std::vector< BlkId >& out_blkids) const; + void alloc_sanity_check(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId const& out_blkids) const; #endif // Sweep and cache related functions @@ -277,7 +279,7 @@ class VarsizeBlkAllocator : public BlkAllocator { void fill_cache(BlkAllocSegment* seg, blk_cache_fill_session& fill_session); void fill_cache_in_portion(blk_num_t portion_num, blk_cache_fill_session& fill_session); - void free_on_bitmap(const BlkId& b); + void free_on_bitmap(BlkId const& b); //////////////////////////////////////////// Convenience routines /////////////////////////////////////////// ///////////////////// Physical page related routines //////////////////////// @@ -296,8 +298,8 @@ class VarsizeBlkAllocator : public BlkAllocator { } ///////////////////// Cache Entry related routines //////////////////////// - void blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, std::vector< BlkId >& out_blkids); - BlkId blk_cache_entry_to_blkid(const blk_cache_entry& e); - blk_cache_entry blkid_to_blk_cache_entry(const BlkId& bid, blk_temp_t preferred_level = 1); + // void blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, MultiBlkId& out_blkids); + BlkId blk_cache_entry_to_blkid(blk_cache_entry const& e); + blk_cache_entry blkid_to_blk_cache_entry(BlkId const& bid, blk_temp_t preferred_level = 1); }; } // namespace homestore diff --git a/src/lib/blkdata_svc/blk_read_tracker.cpp b/src/lib/blkdata_svc/blk_read_tracker.cpp index 65da3f55f..1d90618a9 100644 --- a/src/lib/blkdata_svc/blk_read_tracker.cpp +++ b/src/lib/blkdata_svc/blk_read_tracker.cpp @@ -32,18 +32,14 @@ void BlkReadTracker::merge(const BlkId& blkid, int64_t new_ref_count, // Don't move alignment handling outside of this function, because the nblks between (first and last blk num after // alignment) could be larger than 255 which exceeds a BlkId can hold; // - auto cur_blk_num_aligned = s_cast< blk_num_t >(sisl::round_down(blkid.get_blk_num(), entries_per_record())); - auto last_blk_num_aligned_up = s_cast< blk_num_t >(sisl::round_up(blkid.get_last_blk_num(), entries_per_record()) - - 1); // -1 so that it does not cover next base id; - if (blkid.get_last_blk_num() % entries_per_record() == 0) { - // if last blk num happens to be aligned, it actually belongs to next base id, so add 1 back; - last_blk_num_aligned_up += 1; - } + auto cur_blk_num_aligned = s_cast< blk_num_t >(sisl::round_down(blkid.blk_num(), entries_per_record())); + auto last_blk_num_aligned_up = + s_cast< blk_num_t >(sisl::round_up(blkid.blk_num() + blkid.blk_count() + 1, entries_per_record()) - 1); [[maybe_unused]] bool waiter_rescheduled{false}; // everything is aligned after this point, so we don't need to handle sub_range in a base blkid; while (cur_blk_num_aligned <= last_blk_num_aligned_up) { - BlkId base_blkid{cur_blk_num_aligned, entries_per_record(), blkid.get_chunk_num()}; + BlkId 
base_blkid{cur_blk_num_aligned, entries_per_record(), blkid.chunk_num()}; BlkTrackRecord rec; const auto rec_found = m_pending_reads_map.get(base_blkid, rec); @@ -98,8 +94,16 @@ void BlkReadTracker::merge(const BlkId& blkid, int64_t new_ref_count, void BlkReadTracker::insert(const BlkId& blkid) { merge(blkid, 1, nullptr); } void BlkReadTracker::remove(const BlkId& blkid) { merge(blkid, -1, nullptr); } -void BlkReadTracker::wait_on(const BlkId& blkid, after_remove_cb_t&& after_remove_cb) { - merge(blkid, 0, std::make_shared< blk_track_waiter >(std::move(after_remove_cb))); +void BlkReadTracker::wait_on(MultiBlkId const& blkids, after_remove_cb_t&& after_remove_cb) { + if (blkids.num_pieces() == 1) { + merge(blkids, 0, std::make_shared< blk_track_waiter >(std::move(after_remove_cb))); + } else { + auto waiter = std::make_shared< blk_track_waiter >(std::move(after_remove_cb)); + auto it = blkids.iterate(); + while (auto const b = it.next()) { + merge(*b, 0, waiter); + } + } } uint16_t BlkReadTracker::entries_per_record() const { diff --git a/src/lib/blkdata_svc/blk_read_tracker.hpp b/src/lib/blkdata_svc/blk_read_tracker.hpp index 10c2572e4..ec62d77c4 100644 --- a/src/lib/blkdata_svc/blk_read_tracker.hpp +++ b/src/lib/blkdata_svc/blk_read_tracker.hpp @@ -20,7 +20,7 @@ #include #include #include -#include "homestore/blk.h" +#include namespace homestore { typedef folly::Function< void(void) > after_remove_cb_t; @@ -157,7 +157,7 @@ class BlkReadTracker { * @param blkid : blkid that caller wants to wait on for pending read; * @param after_remove_cb : the callback to be sent after read on this blkid are all completed; */ - void wait_on(const BlkId& blkid, after_remove_cb_t&& after_remove_cb); + void wait_on(MultiBlkId const& blkids, after_remove_cb_t&& after_remove_cb); /** * @brief : get size of the pending map; diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 3c52f89de..33ed6fede 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -17,7 +17,7 @@ #include #include "device/chunk.h" #include "device/virtual_dev.hpp" -#include "device/physical_dev.hpp" // vdev_info_block +#include "device/physical_dev.hpp" // vdev_info_block #include "common/homestore_config.hpp" // is_data_drive_hdd #include "common/homestore_assert.hpp" #include "common/error.h" @@ -53,87 +53,161 @@ void BlkDataService::create_vdev(uint64_t size, blk_allocator_type_t alloc_type, // both first_time_boot and recovery path will come here shared< VirtualDev > BlkDataService::open_vdev(const vdev_info& vinfo, bool load_existing) { m_vdev = std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, nullptr, true /* auto_recovery */); - m_page_size = vinfo.blk_size; + m_blk_size = vinfo.blk_size; return m_vdev; } -folly::Future< bool > BlkDataService::async_read(const BlkId& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { - m_blk_read_tracker->insert(bid); - HS_DBG_ASSERT_EQ(sgs.iovs.size(), 1, "Expecting iov size to be 1 since reading on one blk."); +static auto collect_all_futures(std::vector< folly::Future< std::error_code > >& futs) { + return folly::collectAllUnsafe(futs).thenValue([](auto&& vf) { + for (auto const& err_c : vf) { + if (sisl_unlikely(err_c.value())) { + auto ec = err_c.value(); + return folly::makeFuture< std::error_code >(std::move(ec)); + } + } + return folly::makeFuture< std::error_code >(std::error_code{}); + }); +} - return m_vdev->async_readv(sgs.iovs.data(), sgs.iovs.size(), size, bid, 
part_of_batch) - .thenValue([this, bid](auto&&) { +folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, uint8_t* buf, uint32_t size, + bool part_of_batch) { + auto do_read = [this](BlkId const& bid, uint8_t* buf, uint32_t size, bool part_of_batch) { + m_blk_read_tracker->insert(bid); + + return m_vdev->async_read(r_cast< char* >(buf), size, bid, part_of_batch).thenValue([this, bid](auto&& ec) { m_blk_read_tracker->remove(bid); - return folly::makeFuture< bool >(true); + return folly::makeFuture< std::error_code >(std::move(ec)); }); + }; + + if (blkid.num_pieces() == 1) { + return do_read(blkid.to_single_blkid(), buf, size, part_of_batch); + } else { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); + + auto it = blkid.iterate(); + while (auto const bid = it.next()) { + uint32_t sz = bid->blk_count() * m_blk_size; + s_futs.emplace_back(do_read(*bid, buf, sz, part_of_batch)); + buf += sz; + } + + return collect_all_futures(s_futs); + } } -folly::Future< bool > BlkDataService::async_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - const std::vector< BlkId >& blkids, bool part_of_batch) { - if (blkids.size() == 1) { - // Shortcut to most common case - return m_vdev->async_writev(sgs.iovs.data(), sgs.iovs.size(), blkids[0], part_of_batch); +folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch) { + // TODO: sg_iovs_t should not be passed by value. We need it pass it as const&, but that is failing because + // iovs.data() will then return "const iovec*", but unfortunately all the way down to iomgr, we take iovec* + // instead it can easily take "const iovec*". Until we change this is made as copy by value + auto do_read = [this](BlkId const& bid, sisl::sg_iovs_t iovs, uint32_t size, bool part_of_batch) { + m_blk_read_tracker->insert(bid); + + return m_vdev->async_readv(iovs.data(), iovs.size(), size, bid, part_of_batch) + .thenValue([this, bid](auto&& ec) { + m_blk_read_tracker->remove(bid); + return folly::makeFuture< std::error_code >(std::move(ec)); + }); + }; + + if (blkid.num_pieces() == 1) { + return do_read(blkid.to_single_blkid(), sgs.iovs, size, part_of_batch); } else { - static thread_local std::vector< folly::Future< bool > > s_futs; + static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); + sisl::sg_iterator sg_it{sgs.iovs}; - for (const auto& bid : blkids) { - const auto iovs = sg_it.next_iovs(bid.get_nblks() * m_page_size); - s_futs.emplace_back(m_vdev->async_writev(iovs.data(), iovs.size(), bid, part_of_batch)); + auto blkid_it = blkid.iterate(); + while (auto const bid = blkid_it.next()) { + uint32_t const sz = bid->blk_count() * m_blk_size; + s_futs.emplace_back(do_read(*bid, sg_it.next_iovs(sz), sz, part_of_batch)); } - return folly::collectAllUnsafe(s_futs).thenTry([](auto&&) { return folly::makeFuture< bool >(true); }); + + return collect_all_futures(s_futs); } } -folly::Future< bool > BlkDataService::async_alloc_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids, bool part_of_batch) { - out_blkids.clear(); +folly::Future< std::error_code > BlkDataService::async_alloc_write(const sisl::sg_list& sgs, + const blk_alloc_hints& hints, MultiBlkId& out_blkids, + bool part_of_batch) { const auto status = alloc_blks(sgs.size, hints, out_blkids); if (status != BlkAllocStatus::SUCCESS) { - return folly::makeFuture< bool >( - 
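// A minimal sketch of the future-folding idiom that collect_all_futures() above
// implements: gather every per-piece folly::Future< std::error_code > and surface
// the first failure (or success) as one future. The helper name is an assumption;
// collectAllUnsafe() and Try<> behave as in folly, mirroring the code above.
#include <folly/futures/Future.h>
#include <system_error>
#include <vector>

folly::Future< std::error_code > first_error_of(std::vector< folly::Future< std::error_code > >& futs) {
    return folly::collectAllUnsafe(futs).thenValue([](auto&& results) {
        for (auto const& r : results) {          // r is a folly::Try< std::error_code >
            if (r.value()) { return r.value(); } // first non-zero error code wins
        }
        return std::error_code{};                // every piece completed cleanly
    });
}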
std::system_error(std::make_error_code(std::errc::resource_unavailable_try_again))); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again)); } - return async_write(sgs, hints, out_blkids, part_of_batch); + return async_write(sgs, out_blkids, part_of_batch); } -BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids) { - HS_DBG_ASSERT_EQ(size % m_page_size, 0, "Non aligned size requested"); - blk_count_t nblks = static_cast< blk_count_t >(size / m_page_size); +folly::Future< std::error_code > BlkDataService::async_write(const char* buf, uint32_t size, MultiBlkId const& blkid, + bool part_of_batch) { + if (blkid.num_pieces() == 1) { + // Shortcut to most common case + return m_vdev->async_write(buf, size, blkid.to_single_blkid(), part_of_batch); + } else { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); - return m_vdev->alloc_blk(nblks, hints, out_blkids); + const char* ptr = buf; + auto blkid_it = blkid.iterate(); + while (auto const bid = blkid_it.next()) { + uint32_t sz = bid->blk_count() * m_blk_size; + s_futs.emplace_back(m_vdev->async_write(ptr, sz, *bid, part_of_batch)); + ptr += sz; + } + return collect_all_futures(s_futs); + } } -void BlkDataService::commit_blk(const BlkId& bid) { m_vdev->commit_blk(bid); } - -blk_list_t BlkDataService::alloc_blks(uint32_t size) { - blk_alloc_hints hints; // default hints - std::vector< BlkId > out_blkids; - const auto status = alloc_blks(size, hints, out_blkids); +folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const& sgs, MultiBlkId const& blkid, + bool part_of_batch) { + // TODO: Async write should pass this by value the sgs.size parameter as well, currently vdev write routine + // walks through again all the iovs and then getting the len to pass it down to iomgr. 
This defeats the purpose of + // taking size parameters (which was done exactly done to avoid this walk through) + if (blkid.num_pieces() == 1) { + // Shortcut to most common case + return m_vdev->async_writev(sgs.iovs.data(), sgs.iovs.size(), blkid.to_single_blkid(), part_of_batch); + } else { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); + sisl::sg_iterator sg_it{sgs.iovs}; - blk_list_t blk_list; - if (status != BlkAllocStatus::SUCCESS) { - LOGERROR("Resouce unavailable!"); - return blk_list; + auto blkid_it = blkid.iterate(); + while (auto const bid = blkid_it.next()) { + const auto iovs = sg_it.next_iovs(bid->blk_count() * m_blk_size); + s_futs.emplace_back(m_vdev->async_writev(iovs.data(), iovs.size(), *bid, part_of_batch)); + } + return collect_all_futures(s_futs); } +} - // convert BlkId to blklist; - for (auto i = 0ul; i < out_blkids.size(); ++i) { - blk_list.emplace_back(out_blkids[i].to_integer()); - } +BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested"); + blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); + + return m_vdev->alloc_blks(nblks, hints, out_blkids); +} - return blk_list; +void BlkDataService::commit_blk(MultiBlkId const& blkid) { + if (blkid.num_pieces() == 1) { + // Shortcut to most common case + m_vdev->commit_blk(blkid); + } else { + auto it = blkid.iterate(); + while (auto const bid = it.next()) { + m_vdev->commit_blk(*bid); + } + } } -folly::Future< bool > BlkDataService::async_free_blk(const BlkId bid) { +folly::Future< std::error_code > BlkDataService::async_free_blk(MultiBlkId const& bids) { // create blk read waiter instance; - folly::Promise< bool > promise; + folly::Promise< std::error_code > promise; auto f = promise.getFuture(); - m_blk_read_tracker->wait_on(bid, [this, bid, p = std::move(promise)]() mutable { - m_vdev->free_blk(bid); - p.setValue(true); + m_blk_read_tracker->wait_on(bids, [this, bids, p = std::move(promise)]() mutable { + m_vdev->free_blk(bids); + p.setValue(std::error_code{}); }); return f; } diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index 4240fc223..1db2c55dc 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -134,9 +134,9 @@ auto JournalVirtualDev::process_pwrite_offset(size_t len, off_t offset) { } /////////////////////////////// Write Section ////////////////////////////////// -folly::Future< bool > JournalVirtualDev::async_append(const uint8_t* buf, size_t size) { +folly::Future< std::error_code > JournalVirtualDev::async_append(const uint8_t* buf, size_t size) { if (!validate_append_size(size)) { - return folly::makeFuture< bool >(std::system_error(std::make_error_code(std::errc::no_space_on_device))); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::no_space_on_device)); } else { auto const [chunk, offset_in_chunk] = process_pwrite_offset(size, m_seek_cursor); m_seek_cursor += size; @@ -157,7 +157,7 @@ folly::Future< bool > JournalVirtualDev::async_append(const uint8_t* buf, size_t * @param cb : callback after write is completed, can be null * */ -folly::Future< bool > JournalVirtualDev::async_pwrite(const uint8_t* buf, size_t size, off_t offset) { +folly::Future< std::error_code > JournalVirtualDev::async_pwrite(const uint8_t* buf, size_t size, off_t offset) { HS_REL_ASSERT_LE(size, m_reserved_sz, "Write size: larger then reserved 
size is not allowed!"); m_reserved_sz -= size; // update reserved size @@ -165,7 +165,7 @@ folly::Future< bool > JournalVirtualDev::async_pwrite(const uint8_t* buf, size_t return async_write(r_cast< const char* >(buf), size, chunk, offset_in_chunk); } -folly::Future< bool > JournalVirtualDev::async_pwritev(const iovec* iov, int iovcnt, off_t offset) { +folly::Future< std::error_code > JournalVirtualDev::async_pwritev(const iovec* iov, int iovcnt, off_t offset) { auto const size = VirtualDev::get_len(iov, iovcnt); // if size is smaller than reserved size, it means write will never be overlapping start offset; @@ -216,7 +216,9 @@ void JournalVirtualDev::sync_next_read(uint8_t* buf, size_t size_rd) { across_chunk = true; } - sync_pread(buf, size_rd, m_seek_cursor); + auto ec = sync_pread(buf, size_rd, m_seek_cursor); + // TODO: Check if we can have tolerate this error and somehow start homestore without replaying or in degraded mode? + HS_REL_ASSERT(!ec, "Error in reading next stream of bytes, proceeding could cause some inconsistency, exiting"); // Update seek cursor after read; m_seek_cursor += size_rd; @@ -224,7 +226,7 @@ void JournalVirtualDev::sync_next_read(uint8_t* buf, size_t size_rd) { m_seek_cursor = m_seek_cursor % size(); } -void JournalVirtualDev::sync_pread(uint8_t* buf, size_t size, off_t offset) { +std::error_code JournalVirtualDev::sync_pread(uint8_t* buf, size_t size, off_t offset) { auto const [chunk, offset_in_chunk] = offset_to_chunk(offset); // if the read count is acrossing chunk, only return what's left in this chunk @@ -236,7 +238,7 @@ void JournalVirtualDev::sync_pread(uint8_t* buf, size_t size, off_t offset) { return sync_read(r_cast< char* >(buf), size, chunk, offset_in_chunk); } -void JournalVirtualDev::sync_preadv(iovec* iov, int iovcnt, off_t offset) { +std::error_code JournalVirtualDev::sync_preadv(iovec* iov, int iovcnt, off_t offset) { uint64_t len = VirtualDev::get_len(iov, iovcnt); auto const [chunk, offset_in_chunk] = offset_to_chunk(offset); @@ -251,7 +253,7 @@ void JournalVirtualDev::sync_preadv(iovec* iov, int iovcnt, off_t offset) { iov[0].iov_len = len; // is this needed? } - sync_readv(iov, iovcnt, chunk, offset_in_chunk); + return sync_readv(iov, iovcnt, chunk, offset_in_chunk); } off_t JournalVirtualDev::lseek(off_t offset, int whence) { diff --git a/src/lib/device/journal_vdev.hpp b/src/lib/device/journal_vdev.hpp index 28c72a6bf..9ecac4342 100644 --- a/src/lib/device/journal_vdev.hpp +++ b/src/lib/device/journal_vdev.hpp @@ -44,7 +44,7 @@ class JournalVirtualDev : public VirtualDev { off_t m_data_start_offset{0}; // Start offset of where actual data begin for this vdev std::atomic< uint64_t > m_write_sz_in_total{0}; // this size will be decreased by truncate and increased by append; bool m_truncate_done{true}; - uint64_t m_reserved_sz{0}; // write size within chunk, used to check chunk boundary; + uint64_t m_reserved_sz{0}; // write size within chunk, used to check chunk boundary; public: /* Create a new virtual dev for these parameters */ @@ -79,7 +79,7 @@ class JournalVirtualDev : public VirtualDev { * * @return : On success, the number of bytes written is returned. On error, -1 is returned. */ - folly::Future< bool > async_append(const uint8_t* buf, size_t count); + folly::Future< std::error_code > async_append(const uint8_t* buf, size_t count); /** * @brief : writes up to count bytes from the buffer starting at buf at offset offset. 
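// A hypothetical caller-side sketch of the new sync read contract: sync_pread()
// now returns std::error_code instead of void, so callers branch on the returned
// code rather than relying on exceptions. The wrapper name and the include path
// are assumptions; only the sync_pread() signature comes from this header.
#include <cstddef>
#include <cstdint>
#include <sys/types.h>
#include <system_error>
#include "device/journal_vdev.hpp" // internal header; path assumed from this repo's layout

bool read_journal_bytes(homestore::JournalVirtualDev& jvdev, uint8_t* buf, size_t size, off_t offset) {
    std::error_code const ec = jvdev.sync_pread(buf, size, offset);
    if (ec) {
        // The caller picks the recovery policy; sync_next_read() above chooses to
        // assert instead, since a torn journal read cannot be tolerated during replay.
        return false;
    }
    return true;
}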
@@ -95,7 +95,7 @@ class JournalVirtualDev : public VirtualDev { * * @return : On success, the number of bytes read or written is returned, or -1 on error. */ - folly::Future< bool > async_pwrite(const uint8_t* buf, size_t size, off_t offset); + folly::Future< std::error_code > async_pwrite(const uint8_t* buf, size_t size, off_t offset); /** * @brief : writes iovcnt buffers of data described by iov to the offset. @@ -110,7 +110,7 @@ class JournalVirtualDev : public VirtualDev { * * @return : On success, number of bytes written. On error, -1 is returned */ - folly::Future< bool > async_pwritev(const iovec* iov, int iovcnt, off_t offset); + folly::Future< std::error_code > async_pwritev(const iovec* iov, int iovcnt, off_t offset); /// @brief writes up to count bytes from the buffer starting at buf at offset offset. The cursor is not /// changed. pwrite always use offset returned from alloc_next_append_blk to do the write;pwrite should not across @@ -145,9 +145,9 @@ class JournalVirtualDev : public VirtualDev { * @param count : size of buffer * @param offset : the start offset to do read * - * @return : On success, returns the number of bytes. On error, -1 is returned. + * @return : return the error code of the read */ - void sync_pread(uint8_t* buf, size_t count_in, off_t offset); + std::error_code sync_pread(uint8_t* buf, size_t count_in, off_t offset); /** * @brief : read at offset and save output to iov. @@ -159,9 +159,9 @@ class JournalVirtualDev : public VirtualDev { * @param iovcnt : size of iovev * @param offset : the start offset to read * - * @return : return the number of bytes read; On error, -1 is returned. + * @return : return the error code of the read */ - void sync_preadv(iovec* iov, int iovcnt, off_t offset); + std::error_code sync_preadv(iovec* iov, int iovcnt, off_t offset); /** * @brief : repositions the cusor of the device to the argument offset diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 163162efd..122f735d8 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -118,48 +118,53 @@ PhysicalDev::PhysicalDev(const dev_info& dinfo, int oflags, const pdev_info_head PhysicalDev::~PhysicalDev() { close_device(); } void PhysicalDev::write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { - m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, offset); + auto err_c = m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, offset); if (m_super_blk_in_footer) { auto t_offset = data_end_offset() + offset; - m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, t_offset); + err_c = m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, t_offset); } + + HS_REL_ASSERT(!err_c, "Super block write failed on dev={} at size={} offset={}, homestore will go down", m_devname, + sb_size, offset); } -void PhysicalDev::read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { - m_drive_iface->sync_read(m_iodev.get(), charptr_cast(buf), sb_size, offset); +std::error_code PhysicalDev::read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { + return m_drive_iface->sync_read(m_iodev.get(), charptr_cast(buf), sb_size, offset); } void PhysicalDev::close_device() { close_and_uncache_dev(m_devname, m_iodev); } -folly::Future< bool > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, 
+ bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, + bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { +folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { return m_drive_iface->async_write_zero(m_iodev.get(), size, offset); } #if 0 -folly::Future< bool > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { +folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { return m_drive_iface->async_write_zero(m_iodev.get(), size, offset).thenError([this](auto const& e) -> bool { LOGERROR("Error on async_write_zero: exception={}", e.what()); device_manager_mutable()->handle_error(this); @@ -168,62 +173,48 @@ folly::Future< bool > PhysicalDev::async_write_zero(uint64_t size, uint64_t offs } #endif -folly::Future< bool > PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); } +folly::Future< std::error_code > PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); } -void PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_write(m_iodev.get(), data, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } + +std::error_code PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + auto const start_time = get_current_time(); + auto const ret = m_drive_iface->sync_write(m_iodev.get(), data, size, offset); + HISTOGRAM_OBSERVE(m_metrics, 
drive_write_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +std::error_code PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_read(m_iodev.get(), data, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +std::error_code PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_read(m_iodev.get(), data, size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +std::error_code PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) { - m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); +std::error_code PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) { + return m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); } void 
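// A hedged sketch of the caller-side effect of the change above: with
// PhysicalDev::sync_write() returning std::error_code, the try/catch on
// std::system_error in the removed code becomes an explicit check. The wrapper
// name is an assumption; sync_write() and get_devname() come from this class.
#include <cstdint>
#include <system_error>
#include "device/physical_dev.hpp" // internal header; path as used elsewhere in this patch

std::error_code write_and_report(homestore::PhysicalDev& pdev, const char* data, uint32_t size, uint64_t offset) {
    auto const ec = pdev.sync_write(data, size, offset);
    if (ec) {
        // e.g. attach pdev.get_devname() to the error before propagating, much as the
        // removed exception path did via fmt::format
    }
    return ec;
}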
PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); } diff --git a/src/lib/device/physical_dev.hpp b/src/lib/device/physical_dev.hpp index cb74d1ff0..951e61f34 100644 --- a/src/lib/device/physical_dev.hpp +++ b/src/lib/device/physical_dev.hpp @@ -147,7 +147,7 @@ class PhysicalDev { static first_block read_first_block(const std::string& devname, int oflags); static uint64_t get_dev_size(const std::string& devname); - void read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); + std::error_code read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); void write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); void close_device(); @@ -194,20 +194,21 @@ class PhysicalDev { const std::string& get_devname() const { return m_devname; } /////////////////////////////////////// IO Methods ////////////////////////////////////////// - folly::Future< bool > async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch = false); - folly::Future< bool > async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch = false); - folly::Future< bool > async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch = false); - folly::Future< bool > async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch = false); - folly::Future< bool > async_write_zero(uint64_t size, uint64_t offset); - folly::Future< bool > queue_fsync(); - - void sync_write(const char* data, uint32_t size, uint64_t offset); - void sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset); - void sync_read(char* data, uint32_t size, uint64_t offset); - void sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset); - void sync_write_zero(uint64_t size, uint64_t offset); + folly::Future< std::error_code > async_write(const char* data, uint32_t size, uint64_t offset, + bool part_of_batch = false); + folly::Future< std::error_code > async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch = false); + folly::Future< std::error_code > async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch = false); + folly::Future< std::error_code > async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch = false); + folly::Future< std::error_code > async_write_zero(uint64_t size, uint64_t offset); + folly::Future< std::error_code > queue_fsync(); + + std::error_code sync_write(const char* data, uint32_t size, uint64_t offset); + std::error_code sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset); + std::error_code sync_read(char* data, uint32_t size, uint64_t offset); + std::error_code sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset); + std::error_code sync_write_zero(uint64_t size, uint64_t offset); void submit_batch(); ///////////// Parameters Getters /////////////////////// diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index f47c372ee..e2430219c 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -23,7 +23,7 @@ void VChunk::set_user_private(const sisl::blob& data) { m_internal_chunk->set_us const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_private(); }; -blk_cap_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } +blk_num_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } uint32_t VChunk::get_pdev_id() const 
{ return m_internal_chunk->physical_dev()->pdev_id(); } diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 0d771d4ab..6bcc4aebc 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -81,7 +81,7 @@ static std::shared_ptr< BlkAllocator > create_blk_allocator(blk_allocator_type_t } } -VirtualDev::VirtualDev(DeviceManager& dmgr, const vdev_info& vinfo, vdev_event_cb_t event_cb, bool is_auto_recovery) : +VirtualDev::VirtualDev(DeviceManager& dmgr, vdev_info const& vinfo, vdev_event_cb_t event_cb, bool is_auto_recovery) : m_vdev_info{vinfo}, m_dmgr{dmgr}, m_name{vinfo.name}, @@ -124,8 +124,8 @@ void VirtualDev::add_chunk(cshared< Chunk >& chunk, bool is_fresh_chunk) { m_chunk_selector->add_chunk(chunk); } -folly::Future< bool > VirtualDev::async_format() { - static thread_local std::vector< folly::Future< bool > > s_futs; +folly::Future< std::error_code > VirtualDev::async_format() { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); for (auto& chunk : m_all_chunks) { @@ -134,36 +134,42 @@ folly::Future< bool > VirtualDev::async_format() { chunk->start_offset()); s_futs.emplace_back(pdev->async_write_zero(chunk->size(), chunk->start_offset())); } - return folly::collectAllUnsafe(s_futs).thenTry([](auto&&) { return folly::makeFuture< bool >(true); }); + return folly::collectAllUnsafe(s_futs).thenTry([](auto&& t) { + for (const auto& err_c : t.value()) { + if (sisl_unlikely(err_c.value())) { return folly::makeFuture< std::error_code >(err_c); } + } + return folly::makeFuture< std::error_code >(std::error_code{}); + }); } /*std::shared_ptr< blkalloc_cp > VirtualDev::attach_prepare_cp(const std::shared_ptr< blkalloc_cp >& cur_ba_cp) { return (Chunk::attach_prepare_cp(cur_ba_cp)); }*/ -bool VirtualDev::is_blk_alloced(const BlkId& blkid) const { - return m_dmgr.get_chunk(blkid.get_chunk_num())->blk_allocator()->is_blk_alloced(blkid); +bool VirtualDev::is_blk_alloced(BlkId const& blkid) const { + return m_dmgr.get_chunk(blkid.chunk_num())->blk_allocator()->is_blk_alloced(blkid); } -BlkAllocStatus VirtualDev::commit_blk(const BlkId& blkid) { - Chunk* chunk = m_dmgr.get_chunk_mutable(blkid.get_chunk_num()); +BlkAllocStatus VirtualDev::commit_blk(BlkId const& blkid) { + Chunk* chunk = m_dmgr.get_chunk_mutable(blkid.chunk_num()); HS_LOG(DEBUG, device, "commit_blk: bid {}", blkid.to_string()); return chunk->blk_allocator_mutable()->alloc_on_disk(blkid); } -BlkAllocStatus VirtualDev::alloc_contiguous_blk(blk_count_t nblks, const blk_alloc_hints& hints, BlkId* out_blkid) { +BlkAllocStatus VirtualDev::alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) { BlkAllocStatus ret; try { - static thread_local std::vector< BlkId > blkid{}; - blkid.clear(); - HS_DBG_ASSERT_EQ(hints.is_contiguous, true); - ret = alloc_blk(nblks, hints, blkid); - if (ret == BlkAllocStatus::SUCCESS) { - HS_REL_ASSERT_EQ(blkid.size(), 1, "out blkid more than 1 entries({}) will lead to blk leak!", blkid.size()); - *out_blkid = std::move(blkid.front()); + MultiBlkId mbid; + if (!hints.is_contiguous) { + HS_DBG_ASSERT(false, "Expected alloc_contiguous_blk call to be with hints.is_contiguous=true"); + blk_alloc_hints adjusted_hints = hints; + adjusted_hints.is_contiguous = true; + ret = alloc_blks(nblks, adjusted_hints, mbid); } else { - HS_DBG_ASSERT_EQ(blkid.size(), 0); + ret = alloc_blks(nblks, hints, mbid); } + HS_REL_ASSERT_EQ(mbid.num_pieces(), 1, "out blkid more than 1 entries will lead to blk 
leak!"); + out_blkid = mbid.to_single_blkid(); } catch (const std::exception& e) { ret = BlkAllocStatus::FAILED; HS_DBG_ASSERT(0, "{}", e.what()); @@ -171,25 +177,7 @@ BlkAllocStatus VirtualDev::alloc_contiguous_blk(blk_count_t nblks, const blk_all return ret; } -BlkAllocStatus VirtualDev::alloc_blk(uint32_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) { - size_t start_idx = out_blkid.size(); - while (nblks != 0) { - const blk_count_t nblks_op = std::min(BlkId::max_blks_in_op(), s_cast< blk_count_t >(nblks)); - const auto ret = do_alloc_blk(nblks_op, hints, out_blkid); - if (ret != BlkAllocStatus::SUCCESS) { - for (auto i = start_idx; i < out_blkid.size(); ++i) { - free_blk(out_blkid[i]); - out_blkid.erase(out_blkid.begin() + start_idx, out_blkid.end()); - } - return ret; - } - nblks -= nblks_op; - } - return BlkAllocStatus::SUCCESS; -} - -BlkAllocStatus VirtualDev::do_alloc_blk(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkid) { +BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid) { try { // First select a chunk to allocate it from BlkAllocStatus status; @@ -198,13 +186,19 @@ BlkAllocStatus VirtualDev::do_alloc_blk(blk_count_t nblks, const blk_alloc_hints do { chunk = m_chunk_selector->select_chunk(nblks, hints).get(); - if (chunk == nullptr) { status = BlkAllocStatus::SPACE_FULL; } + if (chunk == nullptr) { + status = BlkAllocStatus::SPACE_FULL; + break; + } - status = alloc_blk_from_chunk(nblks, hints, out_blkid, chunk); - if (status == BlkAllocStatus::SUCCESS || !hints.can_look_for_other_chunk) { break; } + status = alloc_blks_from_chunk(nblks, hints, out_blkid, chunk); + if ((status == BlkAllocStatus::SUCCESS) || !hints.can_look_for_other_chunk || + (status == BlkAllocStatus::PARTIAL && hints.partial_alloc_ok)) { + break; + } } while (++attempt < m_all_chunks.size()); - if (status != BlkAllocStatus::SUCCESS) { + if ((status != BlkAllocStatus::SUCCESS) || (status != BlkAllocStatus::PARTIAL)) { LOGERROR("nblks={} failed to alloc after trying to alloc on every chunks {} and devices {}.", nblks); COUNTER_INCREMENT(m_metrics, vdev_num_alloc_failure, 1); } @@ -217,41 +211,66 @@ BlkAllocStatus VirtualDev::do_alloc_blk(blk_count_t nblks, const blk_alloc_hints } } -BlkAllocStatus VirtualDev::alloc_blk_from_chunk(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkid, Chunk* chunk) { +BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, + std::vector< BlkId >& out_blkids) { + // Regular alloc blks will allocate in MultiBlkId, but there is an upper limit on how many it can accomodate in a + // single MultiBlkId, if caller is ok to generate multiple MultiBlkids, this method is called. + auto h = hints; + h.partial_alloc_ok = true; + h.is_contiguous = true; + blk_count_t nblks_remain = nblks; + BlkAllocStatus status; + + do { + out_blkids.emplace_back(); // Put an empty MultiBlkId and use that for allocating them + BlkId& out_bid = out_blkids.back(); + status = alloc_contiguous_blks(nblks_remain, h, out_bid); + + auto nblks_this_iter = out_bid.blk_count(); + nblks_remain = (nblks_remain < nblks_this_iter) ? 
0 : (nblks_remain - nblks_this_iter); + } while (nblks_remain); + + return status; +} + +BlkAllocStatus VirtualDev::alloc_blks_from_chunk(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid, + Chunk* chunk) { #ifdef _PRERELEASE if (auto const fake_status = iomgr_flip::instance()->get_test_flip< uint32_t >("blk_allocation_flip", nblks, chunk->vdev_id())) { return static_cast< BlkAllocStatus >(fake_status.get()); } #endif - static thread_local std::vector< BlkId > chunk_blkid{}; - chunk_blkid.clear(); - auto status = chunk->blk_allocator_mutable()->alloc(nblks, hints, chunk_blkid); - if (status == BlkAllocStatus::PARTIAL) { + auto status = chunk->blk_allocator_mutable()->alloc(nblks, hints, out_blkid); + if ((status == BlkAllocStatus::PARTIAL) && (!hints.partial_alloc_ok)) { // free partial result - for (auto const b : chunk_blkid) { - auto const ret = chunk->blk_allocator_mutable()->free_on_realtime(b); + auto it = out_blkid.iterate(); + while (auto const b = it.next()) { + auto const ret = chunk->blk_allocator_mutable()->free_on_realtime(*b); HS_REL_ASSERT(ret, "failed to free on realtime"); } - chunk->blk_allocator_mutable()->free(chunk_blkid); + chunk->blk_allocator_mutable()->free(out_blkid); + out_blkid = MultiBlkId{}; status = BlkAllocStatus::FAILED; - } else if (status == BlkAllocStatus::SUCCESS) { - // append chunk blocks to out blocks - out_blkid.insert(std::end(out_blkid), std::make_move_iterator(std::begin(chunk_blkid)), - std::make_move_iterator(std::end(chunk_blkid))); } + return status; } -/*bool VirtualDev::free_on_realtime(const BlkId& b) { - Chunk* chunk = m_dmgr.get_chunk_mutable(b.get_chunk_num()); +/*bool VirtualDev::free_on_realtime(BlkId const& b) { + Chunk* chunk = m_dmgr.get_chunk_mutable(b.chunk_num()); return chunk->blk_allocator_mutable()->free_on_realtime(b); }*/ -void VirtualDev::free_blk(const BlkId& b) { - Chunk* chunk = m_dmgr.get_chunk_mutable(b.get_chunk_num()); - chunk->blk_allocator_mutable()->free(b); +void VirtualDev::free_blk(BlkId const& b) { + if (b.is_multi()) { + MultiBlkId const& mb = r_cast< MultiBlkId const& >(b); + Chunk* chunk = m_dmgr.get_chunk_mutable(mb.chunk_num()); + chunk->blk_allocator_mutable()->free(mb); + } else { + Chunk* chunk = m_dmgr.get_chunk_mutable(b.chunk_num()); + chunk->blk_allocator_mutable()->free(b); + } } void VirtualDev::recovery_done() { @@ -261,7 +280,7 @@ void VirtualDev::recovery_done() { } } -uint64_t VirtualDev::get_len(const iovec* iov, const int iovcnt) { +uint64_t VirtualDev::get_len(const iovec* iov, int iovcnt) { uint64_t len{0}; for (int i{0}; i < iovcnt; ++i) { len += iov[i].iov_len; @@ -270,7 +289,10 @@ uint64_t VirtualDev::get_len(const iovec* iov, const int iovcnt) { } ////////////////////////// async write section ////////////////////////////////// -folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, const BlkId& bid, bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32_t size, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_write needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto* pdev = chunk->physical_dev_mutable(); @@ -283,8 +305,8 @@ folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, co return pdev->async_write(buf, size, dev_offset, part_of_batch); } -folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, - 
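// An illustrative caller sketch for the std::vector< BlkId > overload of alloc_blks()
// above: when a single MultiBlkId cannot describe the whole request, the overload
// keeps allocating contiguous pieces until the count is met, and each piece is then
// handled individually. The wrapper name and default hints are assumptions.
#include <vector>
#include "device/virtual_dev.hpp" // internal header; path assumed from this repo's layout

homestore::BlkAllocStatus alloc_many(homestore::VirtualDev& vdev, homestore::blk_count_t nblks) {
    std::vector< homestore::BlkId > pieces;
    homestore::blk_alloc_hints hints; // defaults; the overload forces contiguous + partial_alloc_ok internally
    auto const status = vdev.alloc_blks(nblks, hints, pieces);
    if (status == homestore::BlkAllocStatus::SUCCESS) {
        for (auto const& b : pieces) {
            // write to, commit (vdev.commit_blk(b)) and eventually free_blk(b) each piece
        }
    }
    return status;
}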
uint64_t offset_in_chunk) { +folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, + uint64_t offset_in_chunk) { auto const dev_offset = chunk->start_offset() + offset_in_chunk; auto* pdev = chunk->physical_dev_mutable(); @@ -296,8 +318,10 @@ folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, cs return pdev->async_write(buf, size, dev_offset, false /* part_of_batch */); } -folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcnt, const BlkId& bid, - bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_writev needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto const size = get_len(iov, iovcnt); @@ -311,8 +335,8 @@ folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcn return pdev->async_writev(iov, iovcnt, size, dev_offset, part_of_batch); } -folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, - uint64_t offset_in_chunk) { +folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, + uint64_t offset_in_chunk) { auto const dev_offset = chunk->start_offset() + offset_in_chunk; auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); @@ -326,17 +350,22 @@ folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcn } ////////////////////////// sync write section ////////////////////////////////// -void VirtualDev::sync_write(const char* buf, uint32_t size, const BlkId& bid) { +std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_write needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); - chunk->physical_dev_mutable()->sync_write(buf, size, dev_offset); + return chunk->physical_dev_mutable()->sync_write(buf, size, dev_offset); } -void VirtualDev::sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { - chunk->physical_dev_mutable()->sync_write(buf, size, chunk->start_offset() + offset_in_chunk); +std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, + uint64_t offset_in_chunk) { + return chunk->physical_dev_mutable()->sync_write(buf, size, chunk->start_offset() + offset_in_chunk); } -void VirtualDev::sync_writev(const iovec* iov, int iovcnt, const BlkId& bid) { +std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_writev needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto const size = get_len(iov, iovcnt); @@ -347,10 +376,11 @@ void VirtualDev::sync_writev(const iovec* iov, int iovcnt, const BlkId& bid) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_writev(iov, iovcnt, size, dev_offset); + return pdev->sync_writev(iov, iovcnt, size, dev_offset); } -void VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { +std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, + 
uint64_t offset_in_chunk) { uint64_t const dev_offset = chunk->start_offset() + offset_in_chunk; auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); @@ -360,35 +390,44 @@ void VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chu COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_writev(iov, iovcnt, size, dev_offset); + return pdev->sync_writev(iov, iovcnt, size, dev_offset); } ////////////////////////////////// async read section /////////////////////////////////////////////// -folly::Future< bool > VirtualDev::async_read(char* buf, uint64_t size, const BlkId& bid, bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_read(char* buf, uint64_t size, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_read needs individual pieces of blkid - not MultiBlkid"); + Chunk* pchunk; uint64_t const dev_offset = to_dev_offset(bid, &pchunk); return pchunk->physical_dev_mutable()->async_read(buf, size, dev_offset, part_of_batch); } -folly::Future< bool > VirtualDev::async_readv(iovec* iovs, int iovcnt, uint64_t size, const BlkId& bid, - bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_readv(iovec* iovs, int iovcnt, uint64_t size, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_readv needs individual pieces of blkid - not MultiBlkid"); + Chunk* pchunk; uint64_t const dev_offset = to_dev_offset(bid, &pchunk); return pchunk->physical_dev_mutable()->async_readv(iovs, iovcnt, size, dev_offset, part_of_batch); } ////////////////////////////////////////// sync read section //////////////////////////////////////////// -void VirtualDev::sync_read(char* buf, uint32_t size, const BlkId& bid) { +std::error_code VirtualDev::sync_read(char* buf, uint32_t size, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_read needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); - chunk->physical_dev_mutable()->sync_read(buf, size, dev_offset); + return chunk->physical_dev_mutable()->sync_read(buf, size, dev_offset); } -void VirtualDev::sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { - chunk->physical_dev_mutable()->sync_read(buf, size, chunk->start_offset() + offset_in_chunk); +std::error_code VirtualDev::sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { + return chunk->physical_dev_mutable()->sync_read(buf, size, chunk->start_offset() + offset_in_chunk); } -void VirtualDev::sync_readv(iovec* iov, int iovcnt, const BlkId& bid) { +std::error_code VirtualDev::sync_readv(iovec* iov, int iovcnt, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_readv needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto const size = get_len(iov, iovcnt); @@ -399,10 +438,10 @@ void VirtualDev::sync_readv(iovec* iov, int iovcnt, const BlkId& bid) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_readv(iov, iovcnt, size, dev_offset); + return pdev->sync_readv(iov, iovcnt, size, dev_offset); } -void VirtualDev::sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { +std::error_code VirtualDev::sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { uint64_t const dev_offset = chunk->start_offset() + offset_in_chunk; auto 
const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); @@ -412,10 +451,10 @@ void VirtualDev::sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uin COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_readv(iov, iovcnt, size, dev_offset); + return pdev->sync_readv(iov, iovcnt, size, dev_offset); } -folly::Future< bool > VirtualDev::queue_fsync_pdevs() { +folly::Future< std::error_code > VirtualDev::queue_fsync_pdevs() { HS_DBG_ASSERT_EQ(HS_DYNAMIC_CONFIG(device->direct_io_mode), false, "Not expect to do fsync in DIRECT_IO_MODE."); assert(m_pdevs.size() > 0); @@ -424,13 +463,18 @@ folly::Future< bool > VirtualDev::queue_fsync_pdevs() { HS_LOG(TRACE, device, "Flushing pdev {}", pdev->get_devname()); return pdev->queue_fsync(); } else { - static thread_local std::vector< folly::Future< bool > > s_futs; + static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); for (auto* pdev : m_pdevs) { HS_LOG(TRACE, device, "Flushing pdev {}", pdev->get_devname()); s_futs.emplace_back(pdev->queue_fsync()); } - return folly::collectAllUnsafe(s_futs).thenTry([](auto&&) { return folly::makeFuture< bool >(true); }); + return folly::collectAllUnsafe(s_futs).thenTry([](auto&& t) { + for (const auto& err_c : t.value()) { + if (sisl_unlikely(err_c.value())) { return folly::makeFuture< std::error_code >(err_c); } + } + return folly::makeFuture< std::error_code >(std::error_code{}); + }); } } @@ -543,9 +587,9 @@ void VirtualDev::cp_cleanup(CP*) { } ///////////////////////// VirtualDev Private Methods ///////////////////////////// -uint64_t VirtualDev::to_dev_offset(const BlkId& b, Chunk** chunk) const { - *chunk = m_dmgr.get_chunk_mutable(b.get_chunk_num()); - return uint64_cast(b.get_blk_num()) * block_size() + uint64_cast((*chunk)->start_offset()); +uint64_t VirtualDev::to_dev_offset(BlkId const& b, Chunk** chunk) const { + *chunk = m_dmgr.get_chunk_mutable(b.chunk_num()); + return uint64_cast(b.blk_num()) * block_size() + uint64_cast((*chunk)->start_offset()); } } // namespace homestore diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index fe0e61ac5..0fff20026 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -56,9 +56,9 @@ class VirtualDevMetrics : public sisl::MetricsGroupWrapper { register_me_to_farm(); } - VirtualDevMetrics(const VirtualDevMetrics&) = delete; + VirtualDevMetrics(VirtualDevMetrics const&) = delete; VirtualDevMetrics(VirtualDevMetrics&&) noexcept = delete; - VirtualDevMetrics& operator=(const VirtualDevMetrics&) = delete; + VirtualDevMetrics& operator=(VirtualDevMetrics const&) = delete; VirtualDevMetrics& operator=(VirtualDevMetrics&&) noexcept = delete; ~VirtualDevMetrics() { deregister_me_from_farm(); } @@ -99,8 +99,8 @@ class VirtualDev { public: VirtualDev(DeviceManager& dmgr, const vdev_info& vinfo, vdev_event_cb_t event_cb, bool is_auto_recovery); - VirtualDev(const VirtualDev& other) = delete; - VirtualDev& operator=(const VirtualDev& other) = delete; + VirtualDev(VirtualDev const& other) = delete; + VirtualDev& operator=(VirtualDev const& other) = delete; VirtualDev(VirtualDev&&) noexcept = delete; VirtualDev& operator=(VirtualDev&&) noexcept = delete; virtual ~VirtualDev() = default; @@ -114,28 +114,31 @@ class VirtualDev { /// @brief Formats the vdev asynchronously by zeroing the entire vdev. 
It will use underlying physical device /// capabilities to zero them if fast zero is possible, otherwise will zero block by block /// @param cb Callback after formatting is completed. - virtual folly::Future< bool > async_format(); + virtual folly::Future< std::error_code > async_format(); /////////////////////// Block Allocation related methods ///////////////////////////// /// @brief This method allocates contigous blocks in the vdev /// @param nblks : Number of blocks to allocate /// @param hints : Hints about block allocation, (specific device to allocate, stream etc) - /// @param out_blkid : Pointer to where allocated BlkId to be placed + /// @param out_blkid : Reference to where allocated BlkId to be placed /// @return BlkAllocStatus : Status about the allocation - virtual BlkAllocStatus alloc_contiguous_blk(blk_count_t nblks, const blk_alloc_hints& hints, BlkId* out_blkid); + virtual BlkAllocStatus alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid); /// @brief This method allocates blocks in the vdev and it could be non-contiguous, hence multiple BlkIds are /// returned /// @param nblks : Number of blocks to allocate /// @param hints : Hints about block allocation, (specific device to allocate, stream etc) - /// @param out_blkid : Reference to the vector of blkids to be placed. It appends into the vector + /// @param out_blkid : Reference to the MultiBlkd which can hold multiple blkids. /// @return BlkAllocStatus : Status about the allocation - virtual BlkAllocStatus alloc_blk(uint32_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid); + virtual BlkAllocStatus alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid); + + virtual BlkAllocStatus alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, + std::vector< BlkId >& out_blkids); /// @brief Checks if a given block id is allocated in the in-memory version of the blk allocator /// @param blkid : BlkId to check for allocation /// @return true or false - virtual bool is_blk_alloced(const BlkId& blkid) const; + virtual bool is_blk_alloced(BlkId const& blkid) const; /// @brief Commits the blkid in on-disk version of the blk allocator. The blkid is assumed to be allocated using /// alloc_blk or alloc_contiguous_blk method earlier (either after reboot or prior to reboot). It is not required @@ -144,9 +147,9 @@ class VirtualDev { /// recover Please note that even calling this method is not guaranteed to persisted until checkpoint is taken. /// @param blkid BlkId to commit explicitly. /// @return Allocation Status - virtual BlkAllocStatus commit_blk(const BlkId& blkid); + virtual BlkAllocStatus commit_blk(BlkId const& blkid); - virtual void free_blk(const BlkId& b); + virtual void free_blk(BlkId const& b); /////////////////////// Write API related methods ///////////////////////////// /// @brief Asynchornously write the buffer to the device on a given blkid @@ -156,10 +159,11 @@ class VirtualDev { /// @param part_of_batch : Is this write part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this write request will not be queued. 
/// @return future< bool > Future result of success or failure - folly::Future< bool > async_write(const char* buf, uint32_t size, const BlkId& bid, bool part_of_batch = false); + folly::Future< std::error_code > async_write(const char* buf, uint32_t size, BlkId const& bid, + bool part_of_batch = false); - folly::Future< bool > async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, - uint64_t offset_in_chunk); + folly::Future< std::error_code > async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, + uint64_t offset_in_chunk); /// @brief Asynchornously write the buffer to the device on a given blkid from vector of buffer /// @param iov : Vector of buffer to write data from @@ -168,31 +172,32 @@ class VirtualDev { /// @param part_of_batch : Is this write part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this write request will not be queued. /// @return future< bool > Future result of success or failure - folly::Future< bool > async_writev(const iovec* iov, int iovcnt, const BlkId& bid, bool part_of_batch = false); + folly::Future< std::error_code > async_writev(const iovec* iov, int iovcnt, BlkId const& bid, + bool part_of_batch = false); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - folly::Future< bool > async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, - uint64_t offset_in_chunk); + folly::Future< std::error_code > async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, + uint64_t offset_in_chunk); /// @brief Synchronously write the buffer to the blkid /// @param buf : Buffer to write data from /// @param size : Size of the buffer /// @param bid : BlkId which was previously allocated. It is expected that entire size was allocated previously. /// @return ssize_t: Size of the data actually written. - void sync_write(const char* buf, uint32_t size, const BlkId& bid); + std::error_code sync_write(const char* buf, uint32_t size, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /// @brief Synchronously write the vector of buffers to the blkid /// @param iov : Vector of buffer to write data from /// @param iovcnt : Count of buffer /// @param bid BlkId which was previously allocated. It is expected that entire size was allocated previously. /// @return ssize_t: Size of the data actually written. - void sync_writev(const iovec* iov, int iovcnt, const BlkId& bid); + std::error_code sync_writev(const iovec* iov, int iovcnt, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /////////////////////// Read API related methods ///////////////////////////// @@ -203,7 +208,7 @@ class VirtualDev { /// @param part_of_batch : Is this read part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this read request will not be queued. 
/// @return future< bool > Future result of success or failure - folly::Future< bool > async_read(char* buf, uint64_t size, const BlkId& bid, bool part_of_batch = false); + folly::Future< std::error_code > async_read(char* buf, uint64_t size, BlkId const& bid, bool part_of_batch = false); /// @brief Asynchronously read the data for a given BlkId to the vector of buffers /// @param iov : Vector of buffer to write read to @@ -213,34 +218,34 @@ class VirtualDev { /// @param part_of_batch : Is this read part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this read request will not be queued. /// @return future< bool > Future result of success or failure - folly::Future< bool > async_readv(iovec* iovs, int iovcnt, uint64_t size, const BlkId& bid, - bool part_of_batch = false); + folly::Future< std::error_code > async_readv(iovec* iovs, int iovcnt, uint64_t size, BlkId const& bid, + bool part_of_batch = false); /// @brief Synchronously read the data for a given BlkId. /// @param buf : Buffer to read data to /// @param size : Size of the buffer /// @param bid : BlkId from data needs to be read /// @return ssize_t: Size of the data actually read. - void sync_read(char* buf, uint32_t size, const BlkId& bid); + std::error_code sync_read(char* buf, uint32_t size, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /// @brief Synchronously read the data for a given BlkId to vector of buffers /// @param iov : Vector of buffer to write read to /// @param iovcnt : Count of buffer /// @param size : Size of the actual data, it is really to optimize the iovec from iterating again to get size /// @return ssize_t: Size of the data actually read. - void sync_readv(iovec* iov, int iovcnt, const BlkId& bid); + std::error_code sync_readv(iovec* iov, int iovcnt, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /////////////////////// Other API related methods ///////////////////////////// /// @brief Fsync the underlying physical devices that vdev is sitting on asynchornously /// @return future< bool > Future result with bool to indicate when fsync is actually executed - folly::Future< bool > queue_fsync_pdevs(); + folly::Future< std::error_code > queue_fsync_pdevs(); /// @brief Submit the batch of IOs previously queued as part of async read/write APIs. 
void submit_batch(); @@ -274,7 +279,7 @@ class VirtualDev { uint32_t optimal_page_size() const; uint32_t atomic_page_size() const; - static uint64_t get_len(const iovec* iov, const int iovcnt); + static uint64_t get_len(const iovec* iov, int iovcnt); const std::set< PhysicalDev* >& get_pdevs() const { return m_pdevs; } std::vector< shared< Chunk > > get_chunks() const; shared< Chunk > get_next_chunk(cshared< Chunk >& chunk) const; @@ -283,10 +288,9 @@ class VirtualDev { void update_vdev_private(const sisl::blob& data); private: - BlkAllocStatus do_alloc_blk(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid); - uint64_t to_dev_offset(const BlkId& b, Chunk** chunk) const; - BlkAllocStatus alloc_blk_from_chunk(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkid, Chunk* chunk); + uint64_t to_dev_offset(BlkId const& b, Chunk** chunk) const; + BlkAllocStatus alloc_blks_from_chunk(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid, + Chunk* chunk); }; // place holder for future needs in which components underlying virtualdev needs cp flush context; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 033c41611..35fb79e64 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -113,7 +113,7 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format m_dev_mgr->format_devices(); hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Fast})); - std::vector< folly::Future< bool > > futs; + std::vector< folly::Future< std::error_code > > futs; for (const auto& [svc_type, fparams] : format_opts) { if (fparams.size_pct == 0) { continue; } @@ -133,10 +133,13 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format } } - try { - if (!futs.empty()) { folly::collectAllUnsafe(futs).get(); } - } catch (const std::exception& e) { HS_REL_ASSERT(false, "IO error during format of vdev, error={}", e.what()); } - + if (!futs.empty()) { + auto tlist = folly::collectAllUnsafe(futs).get(); + for (auto const& t : tlist) { + auto const err = t.value(); + HS_REL_ASSERT(!err, "IO error during format of vdev, error={}", err.message()); + } + } do_start(); } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 5b117ca03..27899db61 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -81,11 +81,9 @@ void IndexWBCache::start_flush_threads() { BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { // Alloc a block of data from underlying vdev - static thread_local std::vector< BlkId > t_blkids; - t_blkids.clear(); - auto ret = m_vdev->alloc_blk(1, blk_alloc_hints{}, t_blkids); + BlkId blkid; + auto ret = m_vdev->alloc_contiguous_blks(1, blk_alloc_hints{}, blkid); if (ret != BlkAllocStatus::SUCCESS) { return nullptr; } - BlkId blkid = t_blkids[0]; // Alloc buffer and initialize the node auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size()); diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index ec3be9da9..02f223131 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -42,7 +42,7 @@ LogStoreService::LogStoreService() : m_logstore_families{std::make_unique< LogStoreFamily >(DATA_LOG_FAMILY_IDX), std::make_unique< LogStoreFamily >(CTRL_LOG_FAMILY_IDX)} {} -folly::Future< bool > LogStoreService::create_vdev(uint64_t size, logstore_family_id_t family) { +folly::Future< 
std::error_code > LogStoreService::create_vdev(uint64_t size, logstore_family_id_t family) { const auto atomic_page_size = hs()->device_mgr()->atomic_page_size(HSDevType::Fast); hs_vdev_context hs_ctx; diff --git a/src/lib/meta/meta_blk_service.cpp b/src/lib/meta/meta_blk_service.cpp index ac9268e5f..338eb9a80 100644 --- a/src/lib/meta/meta_blk_service.cpp +++ b/src/lib/meta/meta_blk_service.cpp @@ -136,7 +136,7 @@ void MetaBlkService::cache_clear() { void MetaBlkService::read(const BlkId& bid, uint8_t* dest, size_t sz) const { sz = sisl::round_up(sz, align_size()); - HS_DBG_ASSERT_LE(sz, bid.get_nblks() * block_size()); + HS_DBG_ASSERT_LE(sz, bid.blk_count() * block_size()); try { m_sb_vdev->sync_read(r_cast< char* >(dest), sz, bid); } catch (std::exception& e) { HS_REL_ASSERT(0, "Exception: {}", e.what()); } @@ -224,7 +224,7 @@ bool MetaBlkService::scan_and_load_meta_blks(meta_blk_map_t& meta_blks, ovf_hdr_ auto self_recover{false}; while (bid.is_valid()) { - last_mblk_id->set(bid); + *last_mblk_id = bid; // TODO: add a new API in blkstore read to by pass cache; // e.g. take caller's read buf to avoid this extra memory copy; @@ -462,7 +462,7 @@ void MetaBlkService::write_ovf_blk_to_disk(meta_blk_ovf_hdr* ovf_hdr, const uint cur_ptr = const_cast< uint8_t* >(write_context_data) + size_written; if (i < ovf_hdr->h.nbids - 1) { - cur_size = data_bid[i].get_nblks() * block_size(); + cur_size = data_bid[i].blk_count() * block_size(); size_written += cur_size; } else { const size_t remain_sz_to_write = uint64_cast(write_size - size_written); @@ -549,7 +549,7 @@ meta_blk* MetaBlkService::init_meta_blk(BlkId& bid, meta_sub_type type, const ui } // point last mblk to this mblk; - m_last_mblk_id->set(bid); + *m_last_mblk_id = bid; // add to cache; HS_DBG_ASSERT(m_meta_blks.find(bid.to_integer()) == m_meta_blks.end(), @@ -573,7 +573,7 @@ void MetaBlkService::write_meta_blk_ovf(BlkId& out_obid, const uint8_t* context_ // allocate data blocks static thread_local std::vector< BlkId > context_data_blkids{}; context_data_blkids.clear(); - alloc_meta_blk(sisl::round_up(sz, block_size()), context_data_blkids); + alloc_meta_blks(sisl::round_up(sz, block_size()), context_data_blkids); HS_LOG(DEBUG, metablk, "Start to allocate nblks(data): {}, mstore used size: {}", context_data_blkids.size(), m_sb_vdev->used_size()); @@ -603,7 +603,7 @@ void MetaBlkService::write_meta_blk_ovf(BlkId& out_obid, const uint8_t* context_ uint64_t data_size{0}; auto* data_bid = ovf_hdr->get_data_bid_mutable(); for (; (j < ovf_blk_max_num_data_blk()) && (data_blkid_indx < context_data_blkids.size()); ++j) { - data_size += context_data_blkids[data_blkid_indx].data_size(block_size()); + data_size += context_data_blkids[data_blkid_indx].blk_count() * block_size(); data_bid[j] = context_data_blkids[data_blkid_indx++]; } @@ -887,7 +887,7 @@ std::error_condition MetaBlkService::remove_sub_sb(void* cookie) { HS_LOG(DEBUG, metablk, "removing last mblk, change m_last_mblk to bid: {}, [type={}]", prev_bid.to_string(), m_meta_blks[prev_bid.to_integer()]->hdr.h.type); - m_last_mblk_id->set(prev_bid); + *m_last_mblk_id = prev_bid; } // remove the in-memory handle from meta blk map; @@ -925,7 +925,7 @@ void MetaBlkService::free_ovf_blk_chain(const BlkId& obid) { auto* data_bid = ovf_hdr->get_data_bid(); for (decltype(ovf_hdr->h.nbids) i{0}; i < ovf_hdr->h.nbids; ++i) { m_sb_vdev->free_blk(data_bid[i]); - total_nblks_freed += data_bid[i].get_nblks(); + total_nblks_freed += data_bid[i].blk_count(); HS_LOG(DEBUG, metablk, "after freeing data bid: 
{}, mstore used size: {}", data_bid[i].to_string(), m_sb_vdev->used_size()); @@ -933,7 +933,7 @@ void MetaBlkService::free_ovf_blk_chain(const BlkId& obid) { // free on-disk ovf header blk m_sb_vdev->free_blk(cur_obid); - total_nblks_freed += cur_obid.get_nblks(); + total_nblks_freed += cur_obid.blk_count(); HS_LOG(DEBUG, metablk, "after freeing ovf bidid: {}, mstore used size: {}", cur_obid.to_string(), m_sb_vdev->used_size()); @@ -973,15 +973,16 @@ void MetaBlkService::free_meta_blk(meta_blk* mblk) { hs_utils::iobuf_free(uintptr_cast(mblk), sisl::buftag::metablk); } -void MetaBlkService::alloc_meta_blk(uint64_t size, std::vector< BlkId >& bid) { +void MetaBlkService::alloc_meta_blks(uint64_t size, std::vector< BlkId >& bids) { auto const nblks = uint32_cast(size / m_sb_vdev->block_size()); + try { - const auto ret = m_sb_vdev->alloc_blk(nblks, blk_alloc_hints{}, bid); + const auto ret = m_sb_vdev->alloc_blks(nblks, blk_alloc_hints{}, bids); HS_REL_ASSERT_EQ(ret, BlkAllocStatus::SUCCESS); #ifndef NDEBUG uint64_t debug_size{0}; - for (size_t i{0}; i < bid.size(); ++i) { - debug_size += bid[i].data_size(m_sb_vdev->block_size()); + for (auto const& b : bids) { + debug_size += (b.blk_count() * m_sb_vdev->block_size()); } HS_DBG_ASSERT_EQ(debug_size, size); #endif @@ -997,7 +998,7 @@ void MetaBlkService::alloc_meta_blk(BlkId& bid) { hints.is_contiguous = true; try { - const auto ret = m_sb_vdev->alloc_contiguous_blk(1, hints, &bid); + const auto ret = m_sb_vdev->alloc_contiguous_blks(1, hints, bid); HS_REL_ASSERT_EQ(ret, BlkAllocStatus::SUCCESS); } catch (const std::exception& e) { HS_REL_ASSERT(0, "{}", e.what()); } } @@ -1038,7 +1039,7 @@ sisl::byte_array MetaBlkService::read_sub_sb_internal(const meta_blk* mblk) cons for (decltype(ovf_hdr->h.nbids) i{0}; i < ovf_hdr->h.nbids; ++i) { size_t read_sz_per_db{0}; if (i < ovf_hdr->h.nbids - 1) { - read_sz_per_db = data_bid[i].get_nblks() * block_size(); + read_sz_per_db = data_bid[i].blk_count() * block_size(); } else { // it is possible user context data doesn't occupy the whole block, so we need to remember the // size that was written to the last data blk; @@ -1197,7 +1198,7 @@ uint64_t MetaBlkService::meta_size(const void* cookie) const { ++nblks; // ovf header blk; const auto* data_bid = ovf_hdr->get_data_bid(); for (decltype(ovf_hdr->h.nbids) i{0}; i < ovf_hdr->h.nbids; ++i) { - nblks += data_bid[i].get_nblks(); // data blks; + nblks += data_bid[i].blk_count(); // data blks; } obid = ovf_hdr->h.next_bid; } diff --git a/src/lib/meta/meta_sb.hpp b/src/lib/meta/meta_sb.hpp index b61e4c5fc..9eda9b017 100644 --- a/src/lib/meta/meta_sb.hpp +++ b/src/lib/meta/meta_sb.hpp @@ -96,10 +96,10 @@ struct MetaSubRegInfo { // meta blk super block put as 1st block in the block chain; #pragma pack(1) struct meta_blk_sb { - uint32_t magic; // ssb magic + uint32_t magic; // ssb magic uint32_t version; - BlkId8_t next_bid; // next metablk - BlkId8_t bid; + BlkId next_bid; // next metablk + BlkId bid; uint8_t migrated; uint8_t pad[7]; std::string to_string() const { @@ -116,14 +116,14 @@ struct meta_blk_sb { // #pragma pack(1) struct meta_blk_hdr_s { - uint32_t magic; // magic + uint32_t magic; // magic uint32_t version; - uint32_t gen_cnt; // generation count, bump on every update + uint32_t gen_cnt; // generation count, bump on every update crc32_t crc; - BlkId8_t next_bid; // next metablk - BlkId8_t prev_bid; // previous metablk - BlkId8_t ovf_bid; // overflow blk id; - BlkId8_t bid; // current blk id; might not be needd; + BlkId next_bid; // next metablk + 
BlkId prev_bid; // previous metablk + BlkId ovf_bid; // overflow blk id; + BlkId bid; // current blk id; might not be needd; uint64_t context_sz; // total size of context data; if compressed is true, it is the round up of compressed size // that is written to disk; if compressed is false, it is the original size of context data; uint64_t compressed_sz; // compressed size before round up to align_size, used for decompress @@ -171,10 +171,10 @@ struct meta_blk { // single list overflow block chain #pragma pack(1) struct meta_blk_ovf_hdr_s { - uint32_t magic; // ovf magic - uint32_t nbids; // number of data blkids stored in data_bid; - BlkId8_t next_bid; // next ovf blk id; - BlkId8_t bid; // self blkid + uint32_t magic; // ovf magic + uint32_t nbids; // number of data blkids stored in data_bid; + BlkId next_bid; // next ovf blk id; + BlkId bid; // self blkid uint64_t context_sz; }; #pragma pack() diff --git a/src/lib/replication/repl_service.cpp b/src/lib/replication/repl_service.cpp new file mode 100644 index 000000000..52cdca413 --- /dev/null +++ b/src/lib/replication/repl_service.cpp @@ -0,0 +1,122 @@ +#include + +#include +#include +#include +#include + +#include +#include "service/repl_backend.h" +#include "service/home_repl_backend.h" + +namespace homestore { +ReplicationServiceImpl::ReplicationServiceImpl(std::unique_ptr< ReplServiceCallbacks > cbs) : + m_svc_cbs{std::move(cbs)} { + m_messaging = std::make_shared< nuraft_mesg::service >(); + + // FIXME: RAFT server parameters, should be a config and reviewed!!! + nuraft::raft_params r_params; + r_params.with_election_timeout_lower(900) + .with_election_timeout_upper(1400) + .with_hb_interval(250) + .with_max_append_size(10) + .with_rpc_failure_backoff(250) + .with_auto_forwarding(true) + .with_snapshot_enabled(1); + + meta_service().register_handler( + "replication", + [this](meta_blk* mblk, sisl::byte_view buf, size_t) { rd_super_blk_found(std::move(buf), voidptr_cast(mblk)); }, + nullptr); + + // This closure is where we initialize new ReplicaSet instances. When NuRaft Messging is asked to join a new group + // either through direct creation or gRPC request it will use this callback to initialize a new state_manager and + // state_machine for the raft_server it constructs. 
+ auto group_type_params = nuraft_mesg::consensus_component::register_params{ + r_params, [this](int32_t const, std::string const& group_id) mutable { + return create_replica_dev(group_id, std::set< std::string, std::less<> >()) + .via(&folly::QueuedImmediateExecutor::instance()) + .get(); + // RELEASE_ASSERT(std::holds_alternative< shared< ReplDev > >(v), "Could Not Create ReplicaSet!"); + // return std::get< shared< ReplDev > >(v); + }}; + // m_messaging->register_mgr_type("homestore", group_type_params); +} + +void ReplicationServiceImpl::create_vdev(uint64_t size) { + auto const atomic_page_size = hs()->device_mgr()->atomic_page_size(HSDevType::Data); + hs_vdev_context vdev_ctx; + vdev_ctx.type = hs_vdev_type_t::REPL_DATA_VDEV; + + hs()->device_mgr()->create_vdev(vdev_parameters{.vdev_name = "index", + .vdev_size = size, + .num_chunks = 1, + .blk_size = atomic_page_size, + .dev_type = HSDevType::Data, + .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED, + .context_data = vdev_ctx.to_blob()}); +} + +shared< VirtualDev > ReplicationServiceImpl::open_vdev(const vdev_info& vinfo, bool load_existing) { + m_vdev = std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, m_svc_cbs->blk_allocator_type(), + m_svc_cbs->chunk_selector(), nullptr, true /* auto_recovery */); + return m_vdev; +} + +ReplAsyncResult< shared< ReplDev > > +ReplicationServiceImpl::create_replica_dev(std::string const& group_id, + std::set< std::string, std::less<> >&& members) { + superblk< repl_dev_superblk > rd_sb; + rd_sb.create(sizeof(repl_dev_superblk)); + rd_sb->gid = group_id; + return folly::makeSemiFuture< shared< ReplDev > >(open_replica_dev(rd_sb, false /* load_existing */)); +} + +folly::SemiFuture< ReplServiceError > ReplicationServiceImpl::replace_member(std::string const& group_id, + std::string const& member_out, + std::string const& member_in) const { + return folly::makeSemiFuture(ReplServiceError::CANCELLED); +} + +ReplAsyncResult< shared< ReplDev > > ReplicationServiceImpl::get_replica_dev(std::string const& group_id) const { + std::unique_lock lg(m_rd_map_mtx); + if (auto it = m_rd_map.find(group_id); it != m_rd_map.end()) { return it->second; } + return ReplServiceError::SERVER_NOT_FOUND; +} + +void ReplicationServiceImpl::iterate_replica_devs(std::function< void(cshared< ReplDev >&) > const& cb) { + std::unique_lock lg(m_rd_map_mtx); + for (const auto& [uuid, rd] : m_rd_map) { + cb(rd); + } +} + +shared< ReplDev > ReplicationServiceImpl::open_replica_dev(superblk< repl_dev_superblk > const& rd_sb, + bool load_existing) { + auto it = m_rd_map.end(); + bool happened = false; + + { + std::unique_lock lg(m_rd_map_mtx); + std::tie(it, happened) = m_rd_map.emplace(std::make_pair(rd_sb->gid, nullptr)); + } + DEBUG_ASSERT(m_rd_map.end() != it, "Could not insert into map!"); + if (!happened) { return it->second; } + + auto repl_dev = std::make_shared< ReplDevImpl >(rd_sb, load_existing); + it->second = repl_dev; + repl_dev->attach_listener(std::move(m_svc_cbs->on_repl_dev_init(repl_dev))); + + return repl_dev; +} + +void ReplicationServiceImpl::rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie) { + superblk< repl_dev_superblk > rd_sb; + rd_sb.load(buf, meta_cookie); + DEBUG_ASSERT_EQ(rd_sb->get_magic(), home_rs_superblk::REPLICA_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch"); + DEBUG_ASSERT_EQ(rd_sb->get_version(), home_rs_superblk::REPLICA_DEV_SB_VERSION, "Invalid version of rdev metablk"); + + open_replica_dev(rd_sb, true /* load_existing */); +} + +} // namespace homestore
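The hunks above move VirtualDev and its consumers from folly::Future< bool > and void returns to folly::Future< std::error_code > and std::error_code, and from std::vector< BlkId > outputs to a single BlkId or MultiBlkId. Below is a minimal caller-side sketch of the new synchronous allocate-and-write path. It is not part of this patch: the helper name, the include paths and the chosen errc value are illustrative assumptions only, and it presumes an already-constructed VirtualDev.

// Illustrative sketch only (assumptions noted above); size must fit within nblks blocks.
#include <system_error>

#include <homestore/blk.h>          // BlkId, MultiBlkId, blk_count_t (path assumed)
#include "device/virtual_dev.hpp"   // VirtualDev, blk_alloc_hints, BlkAllocStatus (internal header, path assumed)

namespace homestore {

// Allocate nblks contiguous blocks and synchronously write buf into them.
// With this patch, sync_write() reports failures via std::error_code instead of returning void.
std::error_code alloc_and_write(VirtualDev& vdev, const char* buf, uint32_t size, blk_count_t nblks) {
    BlkId bid;
    if (vdev.alloc_contiguous_blks(nblks, blk_alloc_hints{}, bid) != BlkAllocStatus::SUCCESS) {
        // errc value is an arbitrary choice for this sketch
        return std::make_error_code(std::errc::no_space_on_device);
    }

    auto const err = vdev.sync_write(buf, size, bid);
    if (err) { vdev.free_blk(bid); }   // free_blk() accepts either a BlkId or a MultiBlkId piece set
    return err;
}

} // namespace homestore

The async variants follow the same shape: async_write()/async_read() now complete a folly::Future< std::error_code >, so callers check the error inside thenValue() rather than a bool, as the updated tests later in this patch do.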
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index fdaa3fa0d..021138bd9 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -37,11 +37,6 @@ if (${build_nonio_tests}) target_link_libraries(test_mem_btree ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME MemBtree COMMAND test_mem_btree) - set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp) - add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES}) - target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) - add_test(NAME IndexBtree COMMAND test_index_btree) - add_executable(test_blk_read_tracker) target_sources(test_blk_read_tracker PRIVATE test_blk_read_tracker.cpp ../lib/blkdata_svc/blk_read_tracker.cpp ../lib/blkalloc/blk.cpp) target_link_libraries(test_blk_read_tracker ${COMMON_TEST_DEPS} GTest::gtest) @@ -61,10 +56,22 @@ if (${build_nonio_tests}) target_sources(test_append_blk_allocator PRIVATE test_append_blkalloc.cpp) target_link_libraries(test_append_blk_allocator homestore ${COMMON_TEST_DEPS} GTest::gmock) add_test(NAME AppendBlkAlloc COMMAND test_append_blk_allocator) + set_property(TEST AppendBlkAlloc PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") + + set(TEST_BLKID_SOURCES test_blkid.cpp ../lib/blkalloc/blk.cpp) + add_executable(test_blkid ${TEST_BLKID_SOURCES}) + target_link_libraries(test_blkid ${COMMON_TEST_DEPS} GTest::gtest) + add_test(NAME TestBlkid COMMAND test_blkid) + endif() can_build_io_tests(io_tests) if (${io_tests}) + set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp) + add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES}) + target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) + add_test(NAME IndexBtree COMMAND test_index_btree) + set_property(TEST IndexBtree PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") add_executable(test_data_service) target_sources(test_data_service PRIVATE test_data_service.cpp) diff --git a/src/tests/test_append_blkalloc.cpp b/src/tests/test_append_blkalloc.cpp index 27125f810..4c0c01e4a 100644 --- a/src/tests/test_append_blkalloc.cpp +++ b/src/tests/test_append_blkalloc.cpp @@ -104,28 +104,23 @@ class AppendBlkAllocatorTest : public testing::Test { auto sg_read_ptr = std::make_shared< sisl::sg_list >(); write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, sg_read_ptr, this](const std::vector< BlkId >& out_bids) mutable { + .thenValue([sg_write_ptr, sg_read_ptr, this](auto&& written_bid_ptr) mutable { // this will be called in write io completion cb; LOGINFO("after_write_cb: Write completed;"); - HS_DBG_ASSERT_EQ(out_bids.size(), 1); + iovec iov; + iov.iov_len = written_bid_ptr->blk_count() * inst().get_blk_size(); + iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); + sg_read_ptr->iovs.push_back(iov); + sg_read_ptr->size = iov.iov_len; - const auto num_iovs = out_bids.size(); - - for (auto i = 0ul; i < num_iovs; ++i) { - struct iovec iov; - iov.iov_len = out_bids[i].get_nblks() * inst().get_page_size(); - iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); - sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; - } - - LOGINFO("Step 2: async read on blkid: {}", out_bids[0].to_string()); - return inst().async_read(out_bids[0], *sg_read_ptr, sg_read_ptr->size); + LOGINFO("Step 2: async read on blkid: {}", written_bid_ptr->to_string()); + return inst().async_read(*written_bid_ptr, *sg_read_ptr, sg_read_ptr->size); }) - .thenValue([this, sg_write_ptr, sg_read_ptr](auto) mutable 
{ + .thenValue([this, sg_write_ptr, sg_read_ptr](auto err) mutable { + RELEASE_ASSERT(!err, "read failured"); const auto equal = test_common::HSTestHelper::compare(*sg_read_ptr, *sg_write_ptr); - assert(equal); + RELEASE_ASSERT(equal, "read/write mismatch"); LOGINFO("Read completed;"); free(*sg_write_ptr); @@ -138,24 +133,19 @@ class AppendBlkAllocatorTest : public testing::Test { void write_io_free_blk(const uint64_t io_size) { std::shared_ptr< sisl::sg_list > sg_write_ptr = std::make_shared< sisl::sg_list >(); - auto futs = write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, this](const std::vector< BlkId >& out_bids) { - LOGINFO("after_write_cb: Write completed;"); - free(*sg_write_ptr); - - std::vector< folly::Future< bool > > futs; - for (const auto& free_bid : out_bids) { - LOGINFO("Step 2: started async_free_blk: {}", free_bid.to_string()); - auto f = inst().async_free_blk(free_bid); - futs.emplace_back(std::move(f)); - } - return futs; - }); - - folly::collectAllUnsafe(futs).then([this](auto) { - LOGINFO("completed async_free_blks"); - this->finish_and_notify(); - }); + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) + .thenValue([sg_write_ptr, this](auto&& written_bid_ptr) { + LOGINFO("after_write_cb: Write completed;"); + free(*sg_write_ptr); + + LOGINFO("Step 2: started async_free_blk: {}", written_bid_ptr->to_string()); + return inst().async_free_blk(*written_bid_ptr); + }) + .thenValue([this](auto&& err) { + RELEASE_ASSERT(!err, "Failed to free blks"); + LOGINFO("completed async_free_blks"); + this->finish_and_notify(); + }); } private: @@ -166,7 +156,7 @@ class AppendBlkAllocatorTest : public testing::Test { // caller should be responsible to call free(sg) to free the iobuf allocated in iovs, // normally it should be freed in after_write_cb; // - folly::Future< std::vector< BlkId > > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs) { + folly::Future< shared< BlkId > > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs) { // TODO: What if iov_len is not multiple of 4Ki? 
HS_DBG_ASSERT_EQ(io_size % (4 * Ki * num_iovs), 0, "Expecting iov_len : {} to be multiple of {}.", io_size / num_iovs, 4 * Ki); @@ -180,15 +170,12 @@ class AppendBlkAllocatorTest : public testing::Test { sg->size += iov_len; } - auto out_bids_ptr = std::make_shared< std::vector< BlkId > >(); + MultiBlkId blkid; return inst() - .async_alloc_write(*(sg.get()), blk_alloc_hints{}, *out_bids_ptr, false /* part_of_batch*/) - .thenValue([sg, this, out_bids_ptr](bool success) { - assert(success); - for (const auto& bid : *out_bids_ptr) { - LOGINFO("bid: {}", bid.to_string()); - } - return folly::makeFuture< std::vector< BlkId > >(std::move(*out_bids_ptr)); + .async_alloc_write(*(sg.get()), blk_alloc_hints{}, blkid, false /* part_of_batch*/) + .thenValue([sg, this, blkid](auto err) { + RELEASE_ASSERT(!err, "Write failure"); + return folly::makeFuture< shared< MultiBlkId > >(std::make_shared< MultiBlkId >(blkid)); }); } diff --git a/src/tests/test_blk_cache_queue.cpp b/src/tests/test_blk_cache_queue.cpp index c91aa4e19..840c921af 100644 --- a/src/tests/test_blk_cache_queue.cpp +++ b/src/tests/test_blk_cache_queue.cpp @@ -77,7 +77,7 @@ struct BlkCacheQueueTest : public ::testing::Test { if (!(fill_session->slab_requirements.empty())) { uint32_t blk_id{0}; for (const auto& slab_cfg : m_cfg.m_per_slab_cfg) { - for (blk_cap_t i{0}; i < slab_cfg.max_entries; ++i) { + for (blk_num_t i{0}; i < slab_cfg.max_entries; ++i) { blk_cache_fill_req fill_req; fill_req.start_blk_num = blk_id; fill_req.nblks = slab_cfg.slab_size; diff --git a/src/tests/test_blkalloc.cpp b/src/tests/test_blkalloc.cpp index e0328c2a2..2fdf2f90c 100644 --- a/src/tests/test_blkalloc.cpp +++ b/src/tests/test_blkalloc.cpp @@ -114,8 +114,8 @@ struct BlkAllocatorTest { for (size_t slab_index{0}; slab_index < slab_distribution.size(); ++slab_index) { cum_pct += slab_distribution[slab_index]; const blk_count_t slab_size{static_cast< blk_count_t >(static_cast< blk_count_t >(1) << slab_index)}; - const blk_cap_t slab_count{ - static_cast< blk_cap_t >((m_total_count / slab_size) * (slab_distribution[slab_index] / 100.0))}; + const blk_num_t slab_count{ + static_cast< blk_num_t >((m_total_count / slab_size) * (slab_distribution[slab_index] / 100.0))}; if (slab_index == 0) { m_slab_alloced_blks[0].m_max_quota = slab_count; } else { @@ -137,7 +137,7 @@ struct BlkAllocatorTest { } [[nodiscard]] bool alloced(const BlkId& bid, const bool track_block_group) { - uint32_t blk_num{static_cast< uint32_t >(bid.get_blk_num())}; + uint32_t blk_num = bid.blk_num(); if (blk_num >= m_total_count) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -145,12 +145,12 @@ struct BlkAllocatorTest { } return false; } - m_alloced_count.fetch_add(bid.get_nblks(), std::memory_order_acq_rel); + m_alloced_count.fetch_add(bid.blk_count(), std::memory_order_acq_rel); - const slab_idx_t slab_idx{m_track_slabs ? nblks_to_idx(bid.get_nblks()) : static_cast< slab_idx_t >(0)}; + const slab_idx_t slab_idx{m_track_slabs ? 
nblks_to_idx(bid.blk_count()) : static_cast< slab_idx_t >(0)}; if (track_block_group) { // add blocks as group to each slab - if (!blk_map(slab_idx).insert(blk_num, bid.get_nblks()).second) { + if (!blk_map(slab_idx).insert(blk_num, bid.blk_count()).second) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; std::cout << "Duplicate alloc of blk=" << blk_num << std::endl; @@ -163,7 +163,7 @@ struct BlkAllocatorTest { } else { // add blocks individually to each slab - for (blk_count_t i{0}; i < bid.get_nblks(); ++i) { + for (blk_count_t i{0}; i < bid.blk_count(); ++i) { if (!blk_list(slab_idx).add(blk_num)) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -176,7 +176,7 @@ struct BlkAllocatorTest { } LOGTRACEMOD(blkalloc, "After Alloced nblks={} blk_range=[{}-{}] skip_list_size={} alloced_count={}", - bid.get_nblks(), blk_num, blk_num + bid.get_nblks() - 1, blk_list(slab_idx).size(), + bid.blk_count(), blk_num, blk_num + bid.blk_count() - 1, blk_list(slab_idx).size(), m_alloced_count.load(std::memory_order_relaxed)); return true; } @@ -381,8 +381,8 @@ struct FixedBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { virtual void SetUp() override{}; virtual void TearDown() override{}; - [[nodiscard]] bool alloc_blk(const BlkAllocStatus exp_status, BlkId& bid, const bool track_block_group) { - const auto ret{m_allocator->alloc(bid)}; + bool alloc_blk(const BlkAllocStatus exp_status, BlkId& bid, const bool track_block_group) { + const auto ret = m_allocator->alloc_contiguous(bid); if (ret != exp_status) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -442,7 +442,7 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { static thread_local std::vector< BlkId > bids; bids.clear(); - const auto ret{m_allocator->alloc(reqd_size, hints, bids)}; + const auto ret = m_allocator->alloc(reqd_size, hints, bids); if (ret != exp_status) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -465,7 +465,7 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { blk_count_t sz{0}; for (auto& bid : bids) { if (!alloced(bid, track_block_group)) { return false; } - sz += bid.get_nblks(); + sz += bid.blk_count(); } if (sz != reqd_size) { { @@ -546,8 +546,8 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { while (freed_size < rand_size) { const auto bid{ free_random_alloced_sized_blk(rand_size - freed_size, round_nblks, track_block_group)}; - freed_nblks += bid.get_nblks(); - freed_size += bid.get_nblks(); + freed_nblks += bid.blk_count(); + freed_size += bid.blk_count(); } } } diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp new file mode 100644 index 000000000..435e41784 --- /dev/null +++ b/src/tests/test_blkid.cpp @@ -0,0 +1,178 @@ +#include +#include + +#include +#include +#include + +#include + +SISL_LOGGING_INIT(test_blkid, iomgr, flip, io_wd) +SISL_OPTIONS_ENABLE(logging, test_blkid) + +SISL_OPTION_GROUP(test_blkid, + (num_iterations, "", "num_iterations", "number of iterations", + ::cxxopts::value< uint32_t >()->default_value("1"), "number")); + +using namespace homestore; +TEST(BlkIdTest, SingleBlkIdBasic) { + BlkId b1; + ASSERT_EQ(b1.is_valid(), false); + ASSERT_EQ(b1.to_integer(), 0ULL); + ASSERT_EQ(b1.to_string(), "Invalid_Blkid"); + + BlkId b2{10, 5, 1}; + ASSERT_EQ(b2.is_valid(), true); + ASSERT_EQ(b2.blk_num(), 10); + ASSERT_EQ(b2.blk_count(), 5); + ASSERT_EQ(b2.chunk_num(), 1); + ASSERT_EQ(b2.is_multi(), false); + + sisl::blob buf = b2.serialize(); + ASSERT_EQ(buf.size, 
sizeof(uint64_t)); + + BlkId b3; + b3.deserialize(buf, true); + ASSERT_EQ(b3.is_valid(), true); + ASSERT_EQ(b3, b2); + + BlkId b4{10, 6, 1}; + BlkId b5{9, 6, 1}; + BlkId b6{10, 5, 2}; + BlkId b7{10, 5, 1}; + ASSERT_LT(BlkId::compare(b2, b4), 0); + ASSERT_GT(BlkId::compare(b2, b5), 0); + ASSERT_LT(BlkId::compare(b2, b6), 0); + ASSERT_EQ(BlkId::compare(b2, b7), 0); +} + +TEST(BlkIdTest, SingleBlkIdInMap) { + std::map< int, BlkId > m1; + BlkId b1{30, 4, 2}; + m1.emplace(std::pair(84, BlkId{30, 4, 2})); + ASSERT_EQ(m1.at(84), b1); + + std::map< BlkId, int > m2; + m2.insert(std::pair(BlkId{30, 4, 2}, 94)); + m2.insert(std::pair(BlkId{30, 4, 1}, 96)); + + auto const it1 = m2.find(BlkId{30, 4, 2}); + ASSERT_EQ(it1->second, 94); + auto const it2 = m2.find(BlkId{30, 4, 3}); + ASSERT_EQ(it2, m2.cend()); +} + +TEST(BlkIdTest, MultiBlkIdTest) { + MultiBlkId mb1; + ASSERT_EQ(mb1.is_valid(), false); + ASSERT_EQ(mb1.to_string(), "MultiBlks: {}"); + ASSERT_EQ(mb1.is_multi(), true); + ASSERT_EQ(mb1.num_pieces(), 0); + + mb1.add(10, 5, 1); + ASSERT_EQ(mb1.is_valid(), true); + ASSERT_EQ(mb1.blk_num(), 10); + ASSERT_EQ(mb1.blk_count(), 5); + ASSERT_EQ(mb1.chunk_num(), 1); + ASSERT_EQ(mb1.is_multi(), true); + + std::array< BlkId, 5 > abs{BlkId{20, 8, 1}, BlkId{30, 1, 1}, BlkId{60, 9, 1}, BlkId{80, 5, 1}, BlkId{90, 2, 1}}; + for (auto const& b : abs) { + mb1.add(b); + } + ASSERT_EQ(mb1.num_pieces(), 6); + + auto it = mb1.iterate(); + uint32_t i = 0; + while (auto b = it.next()) { + if (i == 0) { + ASSERT_EQ(b->blk_num(), 10); + ASSERT_EQ(b->blk_count(), 5); + } else { + ASSERT_EQ(*b, abs[i - 1]); + } + ++i; + } + ASSERT_EQ(i, 6); + + auto bl = mb1.serialize(); + MultiBlkId mb2; + mb2.add(5, 6, 2); + mb2.add(11, 10, 2); + mb2.deserialize(bl, true); // Overwrite + ASSERT_EQ(mb1, mb2); +} + +TEST(BlkIdTest, MultiBlkIdInMap) { + std::map< MultiBlkId, int > m1; + std::unordered_map< MultiBlkId, int > m2; + + MultiBlkId mb1{30, 4, 2}; + mb1.add(90, 4, 2); + mb1.add(80, 4, 2); + mb1.add(20, 4, 2); + mb1.add(10, 4, 2); + ASSERT_EQ(mb1.num_pieces(), 5); + + m1.insert(std::pair(mb1, 92)); + m2.insert(std::pair(mb1, 92)); + + MultiBlkId mb2{30, 4, 1}; + mb2.add(90, 4, 1); + mb2.add(30, 4, 1); + mb2.add(20, 4, 1); + mb2.add(10, 4, 1); + m1.insert(std::pair(mb2, 89)); + m2.insert(std::pair(mb2, 89)); // Insert exactly same except chunk_id different + + MultiBlkId mb3{30, 4, 1}; + mb3.add(90, 4, 1); + mb3.add(30, 4, 1); + mb3.add(20, 4, 1); + mb3.add(10, 4, 1); + m1.insert_or_assign(mb3, 90); + m2.insert_or_assign(mb3, 90); // Update the value to validate == works correctly + + MultiBlkId mb4{30, 4, 2}; + mb4.add(80, 4, 2); + ASSERT_EQ(mb4.num_pieces(), 2); + m1.insert(std::pair(mb4, 93)); + m2.insert(std::pair(mb4, 93)); + + MultiBlkId mb5{30, 4, 2}; + mb5.add(10, 3, 2); + m1.insert(std::pair(mb5, 91)); + m2.insert(std::pair(mb5, 91)); + + // Validate get on both the maps + ASSERT_EQ(m1[mb1], 92); + ASSERT_EQ(m2[mb1], 92); + ASSERT_EQ(m1[mb3], 90); + ASSERT_EQ(m2[mb3], 90); + ASSERT_EQ(m1[mb4], 93); + ASSERT_EQ(m2[mb4], 93); + ASSERT_EQ(m1[mb5], 91); + ASSERT_EQ(m2[mb5], 91); + auto const it1 = m1.find(MultiBlkId{1, 1, 1}); + ASSERT_EQ(it1, m1.cend()); + auto const it2 = m2.find(MultiBlkId{100, 1, 2}); + ASSERT_EQ(it2, m2.cend()); + + // Validate sorting order of std::map + int prev_v{0}; + for (auto const [k, v] : m1) { + ASSERT_GT(v, prev_v); + prev_v = v; + } + ASSERT_EQ(m1.size(), 4u); +} + +int main(int argc, char* argv[]) { + int parsed_argc = argc; + ::testing::InitGoogleTest(&parsed_argc, argv); + 
SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_blkid); + sisl::logging::SetLogger("test_blkid"); + spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); + + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index e9e91c8a7..baf1f3d48 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -71,15 +71,7 @@ typedef std::function< void(std::error_condition err, std::shared_ptr< std::vect class BlkDataServiceTest : public testing::Test { public: - BlkDataService& inst() { // return hs()->data_service(); - return homestore::data_service(); - } - - void print_bids(const std::vector< BlkId >& out_bids) { - for (auto i = 0ul; i < out_bids.size(); ++i) { - LOGINFO("bid[{}]: {}", i, out_bids[i].to_string()); - } - } + BlkDataService& inst() { return homestore::data_service(); } void free(sisl::sg_list& sg) { test_common::HSTestHelper::free(sg); } @@ -87,37 +79,35 @@ class BlkDataServiceTest : public testing::Test { void write_read_free_blk(uint64_t io_size) { auto sg_write_ptr = std::make_shared< sisl::sg_list >(); auto sg_read_ptr = std::make_shared< sisl::sg_list >(); - auto test_blkid_ptr = std::make_shared< BlkId >(); + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); - write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([this, sg_write_ptr, test_blkid_ptr](const std::vector< BlkId >& out_bids) { + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([this, sg_write_ptr, sg_read_ptr, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); LOGINFO("after_write_cb: Write completed;"); // sg_write buffer is no longer needed; free(*sg_write_ptr); - LOGINFO("Write blk ids: "); - print_bids(out_bids); + LOGINFO("Write blk ids: {}", test_blkid_ptr->to_string()); + HS_REL_ASSERT_GE(test_blkid_ptr->num_pieces(), 1); - HS_DBG_ASSERT_GE(out_bids.size(), 1); - *test_blkid_ptr = out_bids[0]; - }) - .thenValue([this, sg_read_ptr, test_blkid_ptr](auto) { struct iovec iov; - iov.iov_len = test_blkid_ptr->get_nblks() * inst().get_page_size(); + iov.iov_len = test_blkid_ptr->blk_count() * inst().get_blk_size(); iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; + sg_read_ptr->size = iov.iov_len; LOGINFO("Step 2: async read on blkid: {}", test_blkid_ptr->to_string()); - add_read_delay(); return inst().async_read(*test_blkid_ptr, *sg_read_ptr, sg_read_ptr->size); }) - .thenValue([this, sg_read_ptr, test_blkid_ptr](auto) { + .thenValue([this, sg_read_ptr, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Read error"); LOGINFO("read completed;"); free(*sg_read_ptr); return inst().async_free_blk(*test_blkid_ptr); }) - .thenValue([this, test_blkid_ptr](auto) { + .thenValue([this, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "free_blk error"); LOGINFO("completed async_free_blk: {}", test_blkid_ptr->to_string()); this->finish_and_notify(); }); @@ -127,39 +117,35 @@ class BlkDataServiceTest : public testing::Test { void write_free_blk_before_read_comp(const uint64_t io_size) { auto sg_write_ptr = std::make_shared< sisl::sg_list >(); auto sg_read_ptr = std::make_shared< sisl::sg_list >(); - auto test_blkid_ptr = std::make_shared< BlkId >(); - - write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([this, sg_write_ptr, test_blkid_ptr](const std::vector< BlkId >& out_bids) { - // write completed, now we trigger read on a blkid and in read completion routine, 
we do - // a free blk; + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([this, sg_write_ptr, sg_read_ptr, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); LOGINFO("after_write_cb: Write completed;"); free(*sg_write_ptr); // sg_write buffer is no longer needed; - LOGINFO("Write blk ids: "); - print_bids(out_bids); + LOGINFO("Write blk ids: {}", test_blkid_ptr->to_string()); + HS_REL_ASSERT_GE(test_blkid_ptr->num_pieces(), 1); - HS_DBG_ASSERT_GE(out_bids.size(), 1); - *test_blkid_ptr = out_bids[0]; - }) - .thenValue([this, sg_read_ptr, test_blkid_ptr](auto) mutable { struct iovec iov; - iov.iov_len = test_blkid_ptr->get_nblks() * inst().get_page_size(); + iov.iov_len = test_blkid_ptr->blk_count() * inst().get_blk_size(); iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; + sg_read_ptr->size = iov.iov_len; - LOGINFO("Step 2a: inject read delay on blkid: {}", test_blkid_ptr->to_string()); - LOGINFO("Step 2b: async read on blkid: {}", test_blkid_ptr->to_string()); + LOGINFO("Step 2a: inject read delay and read on blkid: {}", test_blkid_ptr->to_string()); + add_read_delay(); inst() .async_read(*test_blkid_ptr, *sg_read_ptr, sg_read_ptr->size) - .thenValue([sg_read_ptr, this](auto) { + .thenValue([sg_read_ptr, this](auto&& err) { + RELEASE_ASSERT(!err, "Read error"); + // if we are here, free_blk callback must have been called already, because data service layer // trigger the free_blk cb firstly then send read complete cb back to caller; m_read_blk_done = true; LOGINFO("read completed;"); - HS_DBG_ASSERT_EQ(m_free_blk_done.load(), true, + HS_REL_ASSERT_EQ(m_free_blk_done.load(), true, "free blk callback should not be called before read blk completes"); free(*sg_read_ptr); @@ -167,9 +153,10 @@ class BlkDataServiceTest : public testing::Test { }); LOGINFO("Step 3: started async_free_blk: {}", test_blkid_ptr->to_string()); - inst().async_free_blk(*test_blkid_ptr).thenValue([this](auto) { + inst().async_free_blk(*test_blkid_ptr).thenValue([this](auto&& err) { + RELEASE_ASSERT(!err, "free_blk error"); LOGINFO("completed async_free_blk"); - HS_DBG_ASSERT_EQ(m_free_blk_done.load(), false, "Duplicate free blk completion"); + HS_REL_ASSERT_EQ(m_free_blk_done.load(), false, "Duplicate free blk completion"); m_free_blk_done = true; }); }); @@ -177,55 +164,52 @@ class BlkDataServiceTest : public testing::Test { void write_io_free_blk(const uint64_t io_size) { std::shared_ptr< sisl::sg_list > sg_write_ptr = std::make_shared< sisl::sg_list >(); + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); - auto futs = write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, this](const std::vector< BlkId >& out_bids) { - LOGINFO("after_write_cb: Write completed;"); - free(*sg_write_ptr); - - std::vector< folly::Future< bool > > futs; - for (const auto& free_bid : out_bids) { - LOGINFO("Step 2: started async_free_blk: {}", free_bid.to_string()); - auto f = inst().async_free_blk(free_bid); - futs.emplace_back(std::move(f)); - } - return futs; - }); - - folly::collectAllUnsafe(futs).then([this](auto) { - LOGINFO("completed async_free_blks"); - this->finish_and_notify(); - }); + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([sg_write_ptr, this, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); + LOGINFO("after_write_cb: Write completed;"); + 
free(*sg_write_ptr); + + LOGINFO("Step 2: started async_free_blk: {}", test_blkid_ptr->to_string()); + inst().async_free_blk(*test_blkid_ptr).thenValue([this](auto&& err) { + RELEASE_ASSERT(!err, "Free error"); + LOGINFO("completed async_free_blks"); + this->finish_and_notify(); + }); + }); } void write_io_verify(const uint64_t io_size) { auto sg_write_ptr = std::make_shared< sisl::sg_list >(); auto sg_read_ptr = std::make_shared< sisl::sg_list >(); + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); + + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([sg_write_ptr, sg_read_ptr, test_blkid_ptr, this](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); - write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, sg_read_ptr, this](const std::vector< BlkId >& out_bids) mutable { // this will be called in write io completion cb; LOGINFO("after_write_cb: Write completed;"); // TODO: verify multiple read blks; - HS_DBG_ASSERT_EQ(out_bids.size(), 1); - - const auto num_iovs = out_bids.size(); + HS_DBG_ASSERT_EQ(test_blkid_ptr->num_pieces(), 1); - for (auto i = 0ul; i < num_iovs; ++i) { - struct iovec iov; - iov.iov_len = out_bids[i].get_nblks() * inst().get_page_size(); - iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); - sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; - } + struct iovec iov; + iov.iov_len = test_blkid_ptr->blk_count() * inst().get_blk_size(); + iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); + sg_read_ptr->iovs.push_back(iov); + sg_read_ptr->size = iov.iov_len; - LOGINFO("Step 2: async read on blkid: {}", out_bids[0].to_string()); - return inst().async_read(out_bids[0], *sg_read_ptr, sg_read_ptr->size); + LOGINFO("Step 2: async read on blkid: {}", test_blkid_ptr->to_string()); + return inst().async_read(*test_blkid_ptr, *sg_read_ptr, sg_read_ptr->size); }) - .thenValue([this, sg_write_ptr, sg_read_ptr](auto) mutable { + .thenValue([this, sg_write_ptr, sg_read_ptr](auto&& err) mutable { + RELEASE_ASSERT(!err, "Read error"); + const auto equal = test_common::HSTestHelper::compare(*sg_read_ptr, *sg_write_ptr); - assert(equal); + RELEASE_ASSERT(equal, "Read after write data mismatch"); LOGINFO("Read completed;"); free(*sg_write_ptr); @@ -240,7 +224,8 @@ class BlkDataServiceTest : public testing::Test { // void write_io(uint64_t io_size, uint32_t num_iovs = 1) { auto sg = std::make_shared< sisl::sg_list >(); - write_sgs(io_size, sg, num_iovs).thenValue([this, sg](auto) { + MultiBlkId blkid; + write_sgs(io_size, sg, num_iovs, blkid).thenValue([this, sg](auto) { free(*sg); finish_and_notify(); }); @@ -268,7 +253,8 @@ class BlkDataServiceTest : public testing::Test { // caller should be responsible to call free(sg) to free the iobuf allocated in iovs, // normally it should be freed in after_write_cb; // - folly::Future< std::vector< BlkId > > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs) { + folly::Future< std::error_code > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs, + MultiBlkId& out_bids) { // TODO: What if iov_len is not multiple of 4Ki? 
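// Editor's note (illustrative sketch, not part of the patch): the read paths in the hunks above now
// size the read iovec from the MultiBlkId itself -- blk_count() * get_blk_size() -- in place of the
// old get_nblks() * get_page_size() pair. A minimal read-back helper following the same shape might
// look as below; `read_back` is a hypothetical name and assumes the same test fixture (inst(),
// iomanager) that the surrounding tests use.
//
//   folly::Future< std::error_code > read_back(MultiBlkId const& bid, sisl::sg_list& sg) {
//       struct iovec iov;
//       iov.iov_len = bid.blk_count() * inst().get_blk_size();   // total bytes addressed by the blkid
//       iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len);  // 512-byte aligned io buffer
//       sg.iovs.push_back(iov);
//       sg.size = iov.iov_len;
//       return inst().async_read(bid, sg, sg.size);
//   }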
HS_DBG_ASSERT_EQ(io_size % (4 * Ki * num_iovs), 0, "Expecting iov_len : {} to be multiple of {}.", io_size / num_iovs, 4 * Ki); @@ -282,16 +268,7 @@ class BlkDataServiceTest : public testing::Test { sg->size += iov_len; } - auto out_bids_ptr = std::make_shared< std::vector< BlkId > >(); - return inst() - .async_alloc_write(*(sg.get()), blk_alloc_hints{}, *out_bids_ptr, false /* part_of_batch*/) - .thenValue([sg, this, out_bids_ptr](bool success) { - assert(success); - for (const auto& bid : *out_bids_ptr) { - LOGINFO("bid: {}", bid.to_string()); - } - return folly::makeFuture< std::vector< BlkId > >(std::move(*out_bids_ptr)); - }); + return inst().async_alloc_write(*(sg.get()), blk_alloc_hints{}, out_bids, false /* part_of_batch*/); } void add_read_delay() {
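// Editor's note (illustrative sketch, not part of the patch): the hunk above collapses the old
// std::vector< BlkId > plumbing into a single async_alloc_write() call that fills a MultiBlkId
// out-parameter and completes with a std::error_code. A caller outside write_sgs() could follow the
// same pattern roughly as below, keeping the MultiBlkId alive via shared_ptr as the tests do; the
// variable names are hypothetical.
//
//   auto bids = std::make_shared< MultiBlkId >();
//   inst()
//       .async_alloc_write(sg, blk_alloc_hints{}, *bids, false /* part_of_batch */)
//       .thenValue([bids](std::error_code err) {
//           RELEASE_ASSERT(!err, "alloc+write failed");
//           LOGINFO("wrote {} piece(s): {}", bids->num_pieces(), bids->to_string());
//       });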