diff --git a/conanfile.py b/conanfile.py index 753ce2458..d4bb59224 100644 --- a/conanfile.py +++ b/conanfile.py @@ -5,7 +5,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "4.2.2" + version = "4.3.0" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" @@ -55,7 +55,7 @@ def build_requirements(self): self.build_requires("gtest/1.14.0") def requirements(self): - self.requires("iomgr/[~=9, include_prerelease=True]@oss/master") + self.requires("iomgr/[~=10, include_prerelease=True]@oss/master") self.requires("sisl/[~=10, include_prerelease=True]@oss/master") self.requires("farmhash/cci.20190513@") diff --git a/src/include/homestore/blk.h b/src/include/homestore/blk.h index 14283cd3e..c0c8b25c7 100644 --- a/src/include/homestore/blk.h +++ b/src/include/homestore/blk.h @@ -24,122 +24,162 @@ #include #include +#include #include +#include #include namespace homestore { -typedef uint32_t blk_num_t; -typedef blk_num_t blk_cap_t; -static_assert(sizeof(blk_num_t) == (BLK_NUM_BITS - 1) / 8 + 1, "Expected blk_num_t to matching BLK_NUM_BITS"); +using chunk_num_t = uint16_t; +using blk_count_t = uint16_t; +using blk_num_t = uint32_t; +using blk_temp_t = uint16_t; -typedef uint8_t blk_count_serialized_t; -typedef uint16_t blk_count_t; -static_assert(sizeof(blk_count_serialized_t) == (NBLKS_BITS - 1) / 8 + 1, - "Expected blk_count_t to matching NBLKS_BITS"); +static constexpr size_t max_addressable_chunks() { return 1UL << (8 * sizeof(chunk_num_t)); } +static constexpr size_t max_blks_per_chunk() { return 1UL << (8 * sizeof(blk_num_t)); } +static constexpr size_t max_blks_per_blkid() { return (1UL << (8 * sizeof(blk_count_t))) - 1; } -typedef uint8_t chunk_num_t; -static_assert(sizeof(chunk_num_t) == (CHUNK_NUM_BITS - 1) / 8 + 1, "Expected blk_count_t to matching CHUNK_NUM_BITS"); - -typedef uint8_t blk_temp_t; - -/* This structure represents the application wide unique block number. It also encomposses the number of blks. */ +#pragma pack(1) struct BlkId { -private: - static constexpr uint64_t s_blk_num_mask{(static_cast< uint64_t >(1) << BLK_NUM_BITS) - 1}; - static constexpr uint64_t s_nblks_mask{(static_cast< uint64_t >(1) << NBLKS_BITS) - 1}; - static constexpr uint64_t s_chunk_num_mask{(static_cast< uint64_t >(1) << CHUNK_NUM_BITS) - 1}; +protected: + struct serialized { + blk_num_t m_is_multi : 1; // Is it a part of multi blkid or not + blk_num_t m_blk_num : 31; // Block number which is unique within the chunk + blk_count_t m_nblks; // Number of blocks+1 for this blkid, don't directly acccess this - use blk_count() + chunk_num_t m_chunk_num; // Chunk number - which is unique for the entire application -public: - static constexpr blk_count_t max_blks_in_op() { return (1 << NBLKS_BITS); } - static constexpr uint64_t max_id_int() { return (1ull << (BLK_NUM_BITS + NBLKS_BITS + CHUNK_NUM_BITS)) - 1; } + serialized() : m_is_multi{0}, m_blk_num{0}, m_nblks{0}, m_chunk_num{0} {} + serialized(bool is_multi, blk_num_t blk_num, blk_count_t nblks, chunk_num_t cnum) : + m_is_multi{is_multi ? 
0x1u : 0x0u}, m_blk_num{blk_num}, m_nblks{nblks}, m_chunk_num{cnum} {} + }; + static_assert(sizeof(serialized) == sizeof(uint64_t), "Expected serialized size to 64 bits"); - static int compare(const BlkId& one, const BlkId& two); - uint64_t to_integer() const; + serialized s; +public: + BlkId() = default; explicit BlkId(uint64_t id_int); - BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num = 0); - BlkId() { invalidate(); } - BlkId(const BlkId&) = default; - BlkId& operator=(const BlkId&) = default; + BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num); + BlkId(BlkId const&) = default; + BlkId& operator=(BlkId const&) = default; BlkId(BlkId&&) noexcept = default; BlkId& operator=(BlkId&&) noexcept = default; - bool operator==(const BlkId& other) const { return (compare(*this, other) == 0); } - bool operator>(const BlkId& other) const { return (compare(*this, other) > 0); } - bool operator<(const BlkId& other) const { return (compare(*this, other) < 0); } + + bool operator==(BlkId const& other) const { return (compare(*this, other) == 0); } + bool operator>(BlkId const& other) const { return (compare(*this, other) > 0); } + bool operator<(BlkId const& other) const { return (compare(*this, other) < 0); } + + blk_num_t blk_num() const { return s.m_blk_num; } + blk_count_t blk_count() const { return s.m_nblks; } + chunk_num_t chunk_num() const { return s.m_chunk_num; } + bool is_multi() const { return s.m_is_multi; } void invalidate(); + uint64_t to_integer() const; + sisl::blob serialize(); // TODO: Consider making this const, perhaps returns const uint8_t version of blob + void deserialize(sisl::blob const& b, bool copy); + uint32_t serialized_size() const; + std::string to_string() const; bool is_valid() const; - BlkId get_blkid_at(uint32_t offset, uint32_t pagesz) const; - BlkId get_blkid_at(uint32_t offset, uint32_t size, uint32_t pagesz) const; - - void set(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num = 0); - void set(const BlkId& bid); - void set(uint64_t id_int); + static int compare(BlkId const& one, BlkId const& two); +}; +#pragma pack() - void set_blk_num(blk_num_t blk_num); - blk_num_t get_blk_num() const { return m_blk_num; } - // last blk num is the last blk num that belongs this blkid; - blk_num_t get_last_blk_num() const { return get_blk_num() + get_nblks() - 1; } +#pragma pack(1) +struct MultiBlkId : public BlkId { + static constexpr uint32_t max_addln_pieces{5}; + static constexpr uint32_t max_pieces{max_addln_pieces + 1}; - void set_nblks(blk_count_t nblks); - blk_count_t get_nblks() const { return static_cast< blk_count_t >(m_nblks) + 1; } +private: + struct chain_blkid { + blk_num_t m_blk_num; + blk_count_t m_nblks{0}; - void set_chunk_num(const chunk_num_t chunk_num); - chunk_num_t get_chunk_num() const { return m_chunk_num; } + bool is_valid() const { return (m_nblks != 0); } + }; - /* A blkID represent a page size which is assigned to a blk allocator */ - uint32_t data_size(const uint32_t page_size) const { return (get_nblks() * page_size); } + uint16_t n_addln_piece{0}; + std::array< chain_blkid, max_addln_pieces > addln_pieces; +public: + MultiBlkId(); + MultiBlkId(BlkId const& b); + MultiBlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num); + MultiBlkId(MultiBlkId const&) = default; + MultiBlkId& operator=(MultiBlkId const&) = default; + MultiBlkId(MultiBlkId&&) noexcept = default; + MultiBlkId& operator=(MultiBlkId&&) noexcept = default; + + void add(blk_num_t blk_num, blk_count_t nblks, chunk_num_t 
chunk_num); + void add(BlkId const&); + + uint16_t num_pieces() const; + blk_count_t blk_count() const; std::string to_string() const; - blk_num_t m_blk_num; // Block number which is unique within the chunk - blk_count_serialized_t m_nblks; // Number of blocks+1 for this blkid, don't directly acccess this - use get_nblks() - chunk_num_t m_chunk_num; // Chunk number - which is unique for the entire application -} __attribute__((__packed__)); - -VENUM(BlkAllocStatus, uint32_t, - BLK_ALLOC_NONE = 0, // No Action taken - SUCCESS = 1ul << 0, // Success - FAILED = 1ul << 1, // Failed to alloc/free - REQ_MORE = 1ul << 2, // Indicate that we need more - SPACE_FULL = 1ul << 3, // Space is full - INVALID_DEV = 1ul << 4, // Invalid Device provided for alloc - PARTIAL = 1ul << 5, // In case of multiple blks, only partial is alloced/freed - INVALID_THREAD = 1ul << 6 // Not possible to alloc in this thread -); - -static_assert(sizeof(BlkId) < 8); -#pragma pack(1) -struct BlkId8_t : public BlkId { - uint8_t pad[8 - sizeof(BlkId)]{}; - - BlkId8_t& operator=(const BlkId& rhs) { - BlkId::operator=(rhs); - return *this; - } + bool operator==(MultiBlkId const& other) const { return (compare(*this, other) == 0); } + bool operator>(MultiBlkId const& other) const { return (compare(*this, other) > 0); } + bool operator<(MultiBlkId const& other) const { return (compare(*this, other) < 0); } + + sisl::blob serialize(); + uint32_t serialized_size() const; + void deserialize(sisl::blob const& b, bool copy); + + bool has_room() const; + BlkId to_single_blkid() const; + + static int compare(MultiBlkId const& one, MultiBlkId const& two); + + struct iterator { + MultiBlkId const& mbid_; + uint16_t next_blk_{0}; + + iterator(MultiBlkId const& mb) : mbid_{mb} {} + std::optional< BlkId > next() { + if (next_blk_ == 0) { + auto bid = r_cast< BlkId const& >(mbid_); + ++next_blk_; + return (bid.is_valid()) ? 
std::make_optional(bid) : std::nullopt; + } else if (next_blk_ < mbid_.num_pieces()) { + auto cbid = mbid_.addln_pieces[next_blk_ - 1]; + ++next_blk_; + return std::make_optional(BlkId{cbid.m_blk_num, cbid.m_nblks, mbid_.chunk_num()}); + } else { + return std::nullopt; + } + } + }; + + iterator iterate() const; }; #pragma pack() -static_assert(sizeof(BlkId8_t) == 8); -inline blk_num_t begin_of(const BlkId& blkid) { return blkid.get_blk_num(); } -inline blk_num_t end_of(const BlkId& blkid) { return blkid.get_blk_num() + blkid.get_nblks(); } -inline size_t hash_value(const BlkId& blkid) { return std::hash< uint64_t >()(blkid.to_integer()); } } // namespace homestore -// hash function definitions +///////////////////// hash function definitions ///////////////////// namespace std { template <> struct hash< homestore::BlkId > { - typedef homestore::BlkId argument_type; - typedef size_t result_type; - result_type operator()(const argument_type& bid) const noexcept { - return std::hash< uint64_t >()(bid.to_integer()); + size_t operator()(const homestore::BlkId& bid) const noexcept { return std::hash< uint64_t >()(bid.to_integer()); } +}; + +template <> +struct hash< homestore::MultiBlkId > { + size_t operator()(const homestore::MultiBlkId& mbid) const noexcept { + static constexpr size_t s_start_seed = 0xB504F333; + size_t seed = s_start_seed; + auto it = mbid.iterate(); + while (auto b = it.next()) { + boost::hash_combine(seed, b->to_integer()); + } + return seed; } }; } // namespace std +///////////////////// formatting definitions ///////////////////// template < typename T > struct fmt::formatter< T, std::enable_if_t< std::is_base_of< homestore::BlkId, T >::value, char > > : fmt::formatter< std::string > { @@ -148,10 +188,25 @@ struct fmt::formatter< T, std::enable_if_t< std::is_base_of< homestore::BlkId, T } }; -namespace homestore { +template < typename T > +struct fmt::formatter< T, std::enable_if_t< std::is_base_of< homestore::MultiBlkId, T >::value, char > > + : fmt::formatter< std::string > { + auto format(const homestore::MultiBlkId& a, format_context& ctx) const { + return fmt::formatter< std::string >::format(a.to_string(), ctx); + } +}; -template < typename charT, typename traits > -std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, traits >& outStream, const BlkId& blk) { +namespace boost { +template <> +struct hash< homestore::BlkId > { + size_t operator()(const homestore::BlkId& bid) const noexcept { return std::hash< homestore::BlkId >()(bid); } +}; +} // namespace boost + +namespace homestore { +///////////////////// stream operation definitions ///////////////////// +template < typename charT, typename traits, typename blkidT > +std::basic_ostream< charT, traits >& stream_op(std::basic_ostream< charT, traits >& outStream, blkidT const& blk) { // copy the stream formatting std::basic_ostringstream< charT, traits > outStringStream; outStringStream.copyfmt(outStream); @@ -163,27 +218,40 @@ std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, trait return outStream; } -/* Hints for various allocators */ +template < typename charT, typename traits > +std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, traits >& outStream, BlkId const& blk) { + return stream_op< charT, traits, BlkId >(outStream, blk); +} + +template < typename charT, typename traits > +std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, traits >& outStream, MultiBlkId const& blk) { + return stream_op< charT, traits, 
MultiBlkId >(outStream, blk); +} + +///////////////////// Other common Blkd definitions ///////////////////// +VENUM(BlkAllocStatus, uint32_t, + BLK_ALLOC_NONE = 0, // No Action taken + SUCCESS = 1ul << 0, // Success + FAILED = 1ul << 1, // Failed to alloc/free + REQ_MORE = 1ul << 2, // Indicate that we need more + SPACE_FULL = 1ul << 3, // Space is full + INVALID_DEV = 1ul << 4, // Invalid Device provided for alloc + PARTIAL = 1ul << 5, // In case of multiple blks, only partial is alloced/freed + INVALID_THREAD = 1ul << 6, // Not possible to alloc in this thread + INVALID_INPUT = 1ul << 7, // Invalid input + TOO_MANY_PIECES = 1ul << 8 // Allocation results in more pieces than passed on +); + struct blk_alloc_hints { - blk_alloc_hints() : - desired_temp{0}, - dev_id_hint{INVALID_DEV_ID}, - can_look_for_other_chunk{true}, - is_contiguous{false}, - multiplier{1}, - max_blks_per_entry{BlkId::max_blks_in_op()}, - stream_info{(uintptr_t) nullptr} {} - - blk_temp_t desired_temp; // Temperature hint for the device - uint32_t dev_id_hint; // which physical device to pick (hint if any) -1 for don't care - bool can_look_for_other_chunk; // If alloc on device not available can I pick other device - bool is_contiguous; - uint32_t multiplier; // blks allocated in a blkid should be a multiple of multiplier - uint32_t max_blks_per_entry; // Number of blks on every entry - uintptr_t stream_info; -#ifdef _PRERELEASE - bool error_simulate = false; // can error simulate happen -#endif + blk_temp_t desired_temp{0}; // Temperature hint for the device + std::optional< uint32_t > pdev_id_hint; // which physical device to pick (hint if any) -1 for don't care + std::optional< chunk_num_t > chunk_id_hint; // any specific chunk id to pick for this allocation + std::optional< stream_id_t > stream_id_hint; // any specific stream to pick + bool can_look_for_other_chunk{true}; // If alloc on device not available can I pick other device + bool is_contiguous{true}; // Should the entire allocation be one contiguous block + bool partial_alloc_ok{false}; // ok to allocate only portion of nblks? Mutually exclusive with is_contiguous + uint32_t min_blks_per_piece{1}; // blks allocated in a blkid should be atleast this size per entry + uint32_t max_blks_per_piece{max_blks_per_blkid()}; // Number of blks on every entry }; } // namespace homestore diff --git a/src/include/homestore/blkdata_service.hpp b/src/include/homestore/blkdata_service.hpp index d82eaf7dd..3cb2987ca 100644 --- a/src/include/homestore/blkdata_service.hpp +++ b/src/include/homestore/blkdata_service.hpp @@ -56,7 +56,7 @@ class BlkDataService { * * @param vb : vdev info blk containing the details of this blkstore */ - shared< VirtualDev > open_vdev(const vdev_info& vinfo, bool load_existing); + shared< VirtualDev > open_vdev(vdev_info const& vinfo, bool load_existing); /** * @brief : asynchronous write without input block ids. 
Block ids will be allocated by this api and returned; @@ -67,9 +67,11 @@ class BlkDataService { * @param cb : callback that will be triggered after write completes; * @param part_of_batch : is this write part of a batch; */ - folly::Future< bool > async_alloc_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids, bool part_of_batch = false); + folly::Future< std::error_code > async_alloc_write(sisl::sg_list const& sgs, blk_alloc_hints const& hints, + MultiBlkId& out_blkids, bool part_of_batch = false); + folly::Future< std::error_code > async_write(const char* buf, uint32_t size, MultiBlkId const& bid, + bool part_of_batch); /** * @brief : asynchronous write with input block ids; * @@ -79,8 +81,11 @@ class BlkDataService { * @param cb : callback that will be triggered after write completes * @param part_of_batch : is this write part of a batch; */ - folly::Future< bool > async_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - const std::vector< BlkId >& in_blkids, bool part_of_batch = false); + folly::Future< std::error_code > async_write(sisl::sg_list const& sgs, MultiBlkId const& in_blkids, + bool part_of_batch = false); + + folly::Future< std::error_code > async_read(MultiBlkId const& bid, uint8_t* buf, uint32_t size, + bool part_of_batch = false); /** * @brief : asynchronous read @@ -91,14 +96,15 @@ class BlkDataService { * @param cb : callback that will be triggered after read completes * @param part_of_batch : is this read part of batch; */ - folly::Future< bool > async_read(const BlkId& bid, sisl::sg_list& sgs, uint32_t size, bool part_of_batch = false); + folly::Future< std::error_code > async_read(MultiBlkId const& bid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch = false); /** * @brief : commit a block, usually called during recovery * * @param bid : block id to commit; */ - void commit_blk(const BlkId& bid); + void commit_blk(MultiBlkId const& bid); /** * @brief : alloc blocks based on input size; @@ -116,14 +122,14 @@ class BlkDataService { * @param bid : the block id to free * @param cb : the callback that will be triggered after free block completes; */ - folly::Future< bool > async_free_blk(const BlkId bid); + folly::Future< std::error_code > async_free_blk(MultiBlkId const& bid); /** - * @brief : get the page size of this data service; + * @brief : get the blk size of this data service; * - * @return : page size + * @return : blk size */ - uint32_t get_page_size() const { return m_page_size; } + uint32_t get_blk_size() const { return m_blk_size; } /** * @brief : get the read block tracker handle; @@ -138,7 +144,7 @@ class BlkDataService { void start(); private: - BlkAllocStatus alloc_blks(uint32_t size, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkids); + BlkAllocStatus alloc_blks(uint32_t size, blk_alloc_hints const& hints, MultiBlkId& out_blkids); void init(); @@ -147,7 +153,7 @@ class BlkDataService { private: std::shared_ptr< VirtualDev > m_vdev; std::unique_ptr< BlkReadTracker > m_blk_read_tracker; - uint32_t m_page_size; + uint32_t m_blk_size; }; extern BlkDataService& data_service(); diff --git a/src/include/homestore/homestore_decl.hpp b/src/include/homestore/homestore_decl.hpp index 8dd859556..2b227d08b 100644 --- a/src/include/homestore/homestore_decl.hpp +++ b/src/include/homestore/homestore_decl.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -49,7 +50,10 @@ template < typename T > using cshared = const std::shared_ptr< T >; template < typename T > -using 
unique = const std::unique_ptr< T >; +using unique = std::unique_ptr< T >; + +template < typename T > +using intrusive = boost::intrusive_ptr< T >; ////////////// All Size Limits /////////////////// constexpr uint32_t BLK_NUM_BITS{32}; @@ -145,12 +149,17 @@ struct HS_SERVICE { static constexpr uint32_t LOG_LOCAL = 1 << 2; static constexpr uint32_t DATA = 1 << 3; static constexpr uint32_t INDEX = 1 << 4; + static constexpr uint32_t REPLICATION = 1 << 5; uint32_t svcs; HS_SERVICE() : svcs{META} {} HS_SERVICE(uint32_t val) : svcs{val} { svcs |= META; // Force meta to be present always + if (svcs & REPLICATION) { + svcs |= LOG_REPLICATED | LOG_LOCAL; + svcs &= ~DATA; // ReplicationDataSvc or DataSvc only one of them + } } std::string list() const { @@ -160,6 +169,7 @@ struct HS_SERVICE { if (svcs & INDEX) { str += "index,"; } if (svcs & LOG_REPLICATED) { str += "log_replicated,"; } if (svcs & LOG_LOCAL) { str += "log_local,"; } + if (svcs & REPLICATION) { str += "replication,"; } return str; } }; diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index f7eda0035..3f1d62958 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -135,7 +135,7 @@ class LogStoreService { void device_truncate(const device_truncate_cb_t& cb = nullptr, const bool wait_till_done = false, const bool dry_run = false); - folly::Future< bool > create_vdev(uint64_t size, logstore_family_id_t family); + folly::Future< std::error_code > create_vdev(uint64_t size, logstore_family_id_t family); shared< VirtualDev > open_vdev(const vdev_info& vinfo, logstore_family_id_t family, bool load_existing); shared< JournalVirtualDev > get_vdev(logstore_family_id_t family) const { return (family == DATA_LOG_FAMILY_IDX) ? 
m_data_logdev_vdev : m_ctrl_logdev_vdev; diff --git a/src/include/homestore/meta_service.hpp b/src/include/homestore/meta_service.hpp index 3ab8e1cf6..b6b5c9b4c 100644 --- a/src/include/homestore/meta_service.hpp +++ b/src/include/homestore/meta_service.hpp @@ -265,7 +265,7 @@ class MetaBlkService { * */ void alloc_meta_blk(BlkId& bid); - void alloc_meta_blk(uint64_t size, std::vector< BlkId >& bid); + void alloc_meta_blks(uint64_t size, std::vector< BlkId >& bid); void free_meta_blk(meta_blk* mblk); diff --git a/src/include/homestore/replication/repl_decls.h b/src/include/homestore/replication/repl_decls.h new file mode 100644 index 000000000..1c659d6ad --- /dev/null +++ b/src/include/homestore/replication/repl_decls.h @@ -0,0 +1,44 @@ +#pragma once +#include +#include + +#include +#include +#include +#include +#include +#include + +SISL_LOGGING_DECL(replication) + +#define REPL_LOG_MODS grpc_server, HOMESTORE_LOG_MODS, nuraft_mesg, nuraft, replication + +namespace homestore { +using blkid_list_t = folly::small_vector< BlkId, 4 >; + +// Fully qualified domain pba, unique pba id across replica set +struct RemoteBlkId { + RemoteBlkId(uint32_t s, const BlkId& b) : server_id{s}, blkid{b} {} + uint32_t server_id; + BlkId blkid; + + bool operator==(RemoteBlkId const& o) const { return (server_id == o.server_id) && (blkid == o.blkid); } +}; + +using remote_blkid_list_t = folly::small_vector< RemoteBlkId, 4 >; + +// data service api names +static std::string const SEND_DATA{"send_data"}; +static std::string const FETCH_DATA{"fetch_data"}; + +} // namespace homestore + +// hash function definitions +namespace std { +template <> +struct hash< homestore::RemoteBlkId > { + size_t operator()(homestore::RemoteBlkId const& fqbid) const noexcept { + return std::hash< uint64_t >()(fqbid.server_id) + std::hash< uint64_t >()(fqbid.blkid.to_integer()); + } +}; +} // namespace std diff --git a/src/include/homestore/replication/repl_dev.h b/src/include/homestore/replication/repl_dev.h new file mode 100644 index 000000000..6b459d10a --- /dev/null +++ b/src/include/homestore/replication/repl_dev.h @@ -0,0 +1,132 @@ +#pragma once + +#include +#include + +#include + +namespace home_replication { + +// +// Callbacks to be implemented by ReplDev users. +// +class ReplDevListener { +public: + virtual ~ReplDevListener() = default; + + /// @brief Called when the log entry has been committed in the replica set. + /// + /// This function is called from a dedicated commit thread which is different from the original thread calling + /// replica_set::write(). There is only one commit thread, and lsn is guaranteed to be monotonically increasing. + /// + /// @param lsn - The log sequence number + /// @param header - Header originally passed with replica_set::write() api + /// @param key - Key originally passed with replica_set::write() api + /// @param blkids - List of blkids where data is written to the storage engine. + /// @param ctx - User contenxt passed as part of the replica_set::write() api + /// + virtual void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, blkid_list_t const& blkids, + void* ctx) = 0; + + /// @brief Called when the log entry has been received by the replica dev. + /// + /// On recovery, this is called from a random worker thread before the raft server is started. It is + /// guaranteed to be serialized in log index order. + /// + /// On the leader, this is called from the same thread that replica_set::write() was called. 
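    // --- Illustrative sketch (not part of this patch): a minimal ReplDevListener, showing how the
    // callbacks declared in this header fit together. The class name `EchoListener` and its no-op
    // bodies are hypothetical; the sketch assumes only the declarations introduced in this file.
    //
    //   class EchoListener : public ReplDevListener {
    //   public:
    //       void on_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key,
    //                      blkid_list_t const& blkids, void* ctx) override {
    //           // Data for `lsn` is durable at `blkids`; apply it to the user's state machine here.
    //       }
    //       void on_pre_commit(int64_t lsn, sisl::blob const& header, sisl::blob const& key, void* ctx) override {}
    //       void on_rollback(int64_t lsn, sisl::blob const& header, sisl::blob const& key, void* ctx) override {}
    //       blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header) override {
    //           return blk_alloc_hints{}; // no placement preference; let the allocator decide
    //       }
    //       void on_replica_stop() override {}
    //   };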
+ /// + /// On the follower, this is called when the follower has received the log entry. It is guaranteed to be serialized + /// in log sequence order. + /// + /// NOTE: Listener can choose to ignore this pre commit, however, typical use case of maintaining this is in-case + /// replica set needs to support strong consistent reads and follower needs to ignore any keys which are not being + /// currently in pre-commit, but yet to be committed. + /// + /// @param lsn - The log sequence number + /// @param header - Header originally passed with repl_dev::write() api + /// @param key - Key originally passed with repl_dev::write() api + /// @param ctx - User contenxt passed as part of the repl_dev::write() api + virtual void on_pre_commit(int64_t lsn, const sisl::blob& header, const sisl::blob& key, void* ctx) = 0; + + /// @brief Called when the log entry has been rolled back by the replica set. + /// + /// This function is called on followers only when the log entry is going to be overwritten. This function is called + /// from a random worker thread, but is guaranteed to be serialized. + /// + /// For each log index, it is guaranteed that either on_commit() or on_rollback() is called but not both. + /// + /// NOTE: Listener should do the free any resources created as part of pre-commit. + /// + /// @param lsn - The log sequence number getting rolled back + /// @param header - Header originally passed with repl_dev::write() api + /// @param key - Key originally passed with repl_dev::write() api + /// @param ctx - User contenxt passed as part of the repl_dev::write() api + virtual void on_rollback(int64_t lsn, const sisl::blob& header, const sisl::blob& key, void* ctx) = 0; + + /// @brief Called when replication module is trying to allocate a block to write the value + /// + /// This function can be called both on leader and follower when it is trying to allocate a block to write the + /// value. Caller is expected to provide hints for allocation based on the header supplied as part of original + /// write. In cases where caller don't care about the hints can return default blk_alloc_hints. + /// + /// @param header Header originally passed with repl_dev::write() api on the leader + /// @return Expected to return blk_alloc_hints for this write + virtual blk_alloc_hints get_blk_alloc_hints(sisl::blob const& header) = 0; + + /// @brief Called when the replica set is being stopped + virtual void on_replica_stop() = 0; +}; + +class ReplDev { +public: + virtual ~ReplDev() = default; + + /// @brief Replicate the data to the replica set. This method goes through the + /// following steps: + /// Step 1: Allocates blkid from the storage engine to write the value into. Storage + /// engine returns a blkid_list in cases where single contiguous blocks are not + /// available. For convenience, the comment will continue to refer blkid_list as blkids. + /// Step 2: Uses data channel to send the to all replicas + /// Step 3: Creates a log/journal entry with and calls nuraft to + /// append the entry and replicate using nuraft channel (also called header_channel). + /// Step 4: Writes the data into the allocated blk_id + /// + /// @param header - Blob representing the header (it is opaque and will be copied + /// as-is to the journal entry) + /// @param key - Blob representing the key (it is opaque and will be copied as-is to + /// the journal entry). We are tracking this seperately to support consistent read use + /// cases + /// @param value - vector of io buffers that contain value for the key. 
It is an optional field and if the value + /// list size is 0, then only key is written to replicadev without data. + /// @param user_ctx - User supplied opaque context which will be passed to listener + /// callbacks + virtual void async_alloc_write(const sisl::blob& header, const sisl::blob& key, const sisl::sg_list& value, + void* user_ctx) = 0; + + /// @brief Reads the data and returns a future to continue on + /// @param bid Block id to read + /// @param sgs Scatter gather buffer list to which blkids are to be read into + /// @param size Total size of the data read + /// @param part_of_batch Is read is part of a batch. If part of the batch, then submit_batch needs to be called at + /// the end + /// @return A Future with bool to notify if it has successfully read the data, raises the exception in case of + /// failure + virtual folly::Future< bool > async_read(const BlkId& bid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch = false); + + /// @brief After data is replicated and on_commit to the listener is called. the blkids can be freed. + /// + /// @param lsn - LSN of the old blkids that is being freed + /// @param blkids - blkids to be freed. + virtual void async_free_blks(int64_t lsn, const blkid_list_t& blkids) = 0; + + /// @brief Checks if this replica is the leader in this ReplDev + /// @return true or false + virtual bool is_leader() const = 0; + + /// @brief Gets the group_id this repldev is working for + /// @return group_id + virtual std::string group_id() const = 0; +}; + +} // namespace home_replication diff --git a/src/include/homestore/replication_service.hpp b/src/include/homestore/replication_service.hpp new file mode 100644 index 000000000..cb27e9306 --- /dev/null +++ b/src/include/homestore/replication_service.hpp @@ -0,0 +1,52 @@ +#pragma once +#include +#include +#include +#include + +#include + +#include "repl_decls.h" +#include "repl_set.h" + +namespace nuraft { +class state_machine; +} + +namespace homestore { + +class ReplDev; +using ReplServiceError = nuraft::cmd_result_code; +using on_replica_dev_init_t = std::function< std::unique_ptr< ReplicaDevListener >(cshared< ReplDev >& rd) >; + +template < typename V, typename E > +using Result = folly::Expected< V, E >; + +template < class V, class E > +using AsyncResult = folly::SemiFuture< Result< V, E > >; + +template < class V > +using ReplResult = Result< V, ReplServiceError >; + +template < class V > +using ReplAsyncResult = AsyncResult< V, ReplServiceError >; + +class ReplicationService { +public: + ReplicationService() = default; + virtual ~ReplicationService() = default; + + // using set_var = std::variant< shared< ReplDev >, ReplServiceError >; + + /// Sync APIs + virtual shared< ReplDev > get_replica_dev(std::string const& group_id) const = 0; + virtual void iterate_replica_devs(std::function< void(cshared< ReplDev >&) > cb) const = 0; + + /// Async APIs + virtual ReplAsyncResult< shared< ReplDev > > create_replica_dev(std::string const& group_id, + std::set< std::string, std::less<> >&& members) = 0; + + virtual folly::SemiFuture< ReplServiceError > + replace_member(std::string const& group_id, std::string const& member_out, std::string const& member_in) const = 0; +}; +} // namespace homestore diff --git a/src/include/homestore/vchunk.h b/src/include/homestore/vchunk.h index 6f09786f1..11b313de7 100644 --- a/src/include/homestore/vchunk.h +++ b/src/include/homestore/vchunk.h @@ -29,7 +29,7 @@ class VChunk { void set_user_private(const sisl::blob& data); const uint8_t* get_user_private() const; - 
blk_cap_t available_blks() const; + blk_num_t available_blks() const; uint32_t get_pdev_id() const; cshared< Chunk > get_internal_chunk() const; diff --git a/src/lib/blkalloc/append_blk_allocator.cpp b/src/lib/blkalloc/append_blk_allocator.cpp index f74ed8956..2017e506d 100644 --- a/src/lib/blkalloc/append_blk_allocator.cpp +++ b/src/lib/blkalloc/append_blk_allocator.cpp @@ -67,7 +67,7 @@ void AppendBlkAllocator::on_meta_blk_found(const sisl::byte_view& buf, void* met // // alloc a single block; // -BlkAllocStatus AppendBlkAllocator::alloc(BlkId& bid) { +BlkAllocStatus AppendBlkAllocator::alloc_contiguous(BlkId& bid) { std::unique_lock lk(m_mtx); if (available_blks() < 1) { COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); @@ -75,9 +75,9 @@ BlkAllocStatus AppendBlkAllocator::alloc(BlkId& bid) { return BlkAllocStatus::SPACE_FULL; } - bid.set(m_last_append_offset, 1, m_chunk_id); + bid = BlkId{m_last_append_offset, 1, m_chunk_id}; - [[maybe_unused]] auto cur_cp = hs()->cp_mgr().cp_guard(); + auto cur_cp = hs()->cp_mgr().cp_guard(); ++m_last_append_offset; --m_freeable_nblks; set_dirty_offset(cur_cp->id() % MAX_CP_COUNT); @@ -90,28 +90,27 @@ BlkAllocStatus AppendBlkAllocator::alloc(BlkId& bid) { // For append blk allocator, the assumption is only one writer will append data on one chunk. // If we want to change above design, we can open this api for vector allocation; // -BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, - std::vector< BlkId >& out_bids) { +BlkAllocStatus AppendBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hint, BlkId& out_bid) { std::unique_lock lk(m_mtx); if (available_blks() < nblks) { COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); LOGERROR("No space left to serve request nblks: {}, available_blks: {}", nblks, available_blks()); return BlkAllocStatus::SPACE_FULL; - } else if (nblks > BlkId::max_blks_in_op()) { + } else if (nblks > max_blks_per_blkid()) { // consumer(vdev) already handles this case. COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, BlkId::max_blks_in_op()); + LOGERROR("Can't serve request nblks: {} larger than max_blks_in_op: {}", nblks, max_blks_per_blkid()); return BlkAllocStatus::FAILED; } // Push 1 blk to the vector which has all the requested nblks; - out_bids.emplace_back(m_last_append_offset, nblks, m_chunk_id); + out_bid = BlkId{m_last_append_offset, nblks, m_chunk_id}; - [[maybe_unused]] auto cur_cp = hs()->cp_mgr().cp_guard(); + auto cur_cp = hs()->cp_mgr().cp_guard(); m_last_append_offset += nblks; m_freeable_nblks -= nblks; - // it is garunteened dirty buffer always contains updates of current_cp or next_cp, it will + // it is guaranteed that dirty buffer always contains updates of current_cp or next_cp, it will // never get dirty buffer from across updates; set_dirty_offset(cur_cp->id() % MAX_CP_COUNT); @@ -155,38 +154,31 @@ void AppendBlkAllocator::clear_dirty_offset(const uint8_t idx) { m_sb[idx]->is_d // void AppendBlkAllocator::free(const BlkId& bid) { std::unique_lock lk(m_mtx); - [[maybe_unused]] auto cur_cp = hs()->cp_mgr().cp_guard(); - const auto n = bid.get_nblks(); + auto cur_cp = hs()->cp_mgr().cp_guard(); + const auto n = bid.blk_count(); m_freeable_nblks += n; - if (bid.get_blk_num() + n == m_last_append_offset) { + if (bid.blk_num() + n == m_last_append_offset) { // we are freeing the the last blk id, let's rewind. 
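        // Note (added for clarity, not in the original patch): the rewind below only happens when the
        // freed range is the most recently appended one; interior frees just grow m_freeable_nblks and
        // leave m_last_append_offset untouched.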
m_last_append_offset -= n; } set_dirty_offset(cur_cp->id() % MAX_CP_COUNT); } -void AppendBlkAllocator::free(const std::vector< BlkId >& blk_ids) { - for (const auto b : blk_ids) { - this->free(b); - } -} - -blk_cap_t AppendBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } +blk_num_t AppendBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } -blk_cap_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset; } +blk_num_t AppendBlkAllocator::get_used_blks() const { return m_last_append_offset; } bool AppendBlkAllocator::is_blk_alloced(const BlkId& in_bid, bool) const { // blk_num starts from 0; - return in_bid.get_blk_num() < get_used_blks(); + return in_bid.blk_num() < get_used_blks(); } std::string AppendBlkAllocator::get_name() const { return "AppendBlkAlloc_chunk_" + std::to_string(m_chunk_id); } std::string AppendBlkAllocator::to_string() const { - auto str = fmt::format("{}, last_append_offset: {}", get_name(), m_last_append_offset); - return str; + return fmt::format("{}, last_append_offset: {}", get_name(), m_last_append_offset); } -blk_cap_t AppendBlkAllocator::get_freeable_nblks() const { return m_freeable_nblks; } +blk_num_t AppendBlkAllocator::get_freeable_nblks() const { return m_freeable_nblks; } } // namespace homestore diff --git a/src/lib/blkalloc/append_blk_allocator.h b/src/lib/blkalloc/append_blk_allocator.h index ebfcdf61e..3c05aaabe 100644 --- a/src/lib/blkalloc/append_blk_allocator.h +++ b/src/lib/blkalloc/append_blk_allocator.h @@ -34,8 +34,8 @@ struct append_blkalloc_ctx { uint32_t version{append_blkalloc_sb_version}; bool is_dirty; // this field is needed for cp_flush, but not necessarily needed for persistence; uint64_t allocator_id; - uint64_t freeable_nblks; - uint64_t last_append_offset; + blk_num_t freeable_nblks; + blk_num_t last_append_offset; }; #pragma pack() @@ -75,15 +75,13 @@ class AppendBlkAllocator : public BlkAllocator { AppendBlkAllocator& operator=(AppendBlkAllocator&&) noexcept = delete; virtual ~AppendBlkAllocator() = default; - BlkAllocStatus alloc(BlkId& bid) override; - BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) override; + BlkAllocStatus alloc_contiguous(BlkId& bid) override; + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; + void free(BlkId const& b) override; - void free(const std::vector< BlkId >& blk_ids) override; - void free(const BlkId& b) override; - - blk_cap_t available_blks() const override; - blk_cap_t get_used_blks() const override; - blk_cap_t get_freeable_nblks() const; + blk_num_t available_blks() const override; + blk_num_t get_used_blks() const override; + blk_num_t get_freeable_nblks() const; bool is_blk_alloced(const BlkId& in_bid, bool use_lock = false) const override; std::string to_string() const override; @@ -102,9 +100,9 @@ class AppendBlkAllocator : public BlkAllocator { void on_meta_blk_found(const sisl::byte_view& buf, void* meta_cookie); private: - std::mutex m_mtx; // thread_safe, TODO: open option for consumer to choose to go lockless; - uint64_t m_last_append_offset{0}; // last appended offset in blocks; - uint64_t m_freeable_nblks{0}; + std::mutex m_mtx; // thread_safe, TODO: open option for consumer to choose to go lockless; + blk_num_t m_last_append_offset{0}; // last appended offset in blocks; + blk_num_t m_freeable_nblks{0}; AppendBlkAllocMetrics m_metrics; std::array< superblk< append_blkalloc_ctx >, MAX_CP_COUNT > 
m_sb; }; diff --git a/src/lib/blkalloc/blk.cpp b/src/lib/blkalloc/blk.cpp index affd94b83..29507dcf3 100644 --- a/src/lib/blkalloc/blk.cpp +++ b/src/lib/blkalloc/blk.cpp @@ -17,94 +17,162 @@ #include "common/homestore_assert.hpp" namespace homestore { +BlkId::BlkId(uint64_t id_int) { + *r_cast< uint64_t* >(&s) = id_int; + DEBUG_ASSERT_EQ(is_multi(), 0, "MultiBlkId is set on BlkId constructor"); +} + +BlkId::BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) : s{0x0, blk_num, nblks, chunk_num} {} + +uint64_t BlkId::to_integer() const { return *r_cast< const uint64_t* >(&s); } + +sisl::blob BlkId::serialize() { return sisl::blob{r_cast< uint8_t* >(&s), sizeof(serialized)}; } + +uint32_t BlkId::serialized_size() const { return sizeof(BlkId); } + +void BlkId::deserialize(sisl::blob const& b, bool copy) { + serialized* other = r_cast< serialized* >(b.bytes); + s = *other; +} + +void BlkId::invalidate() { s.m_nblks = 0; } + +bool BlkId::is_valid() const { return (blk_count() > 0); } + +std::string BlkId::to_string() const { + return is_valid() ? fmt::format("BlkNum={} nblks={} chunk={}", blk_num(), blk_count(), chunk_num()) + : "Invalid_Blkid"; +} + int BlkId::compare(const BlkId& one, const BlkId& two) { - if (one.m_chunk_num > two.m_chunk_num) { + if (one.chunk_num() < two.chunk_num()) { return -1; - } else if (one.m_chunk_num < two.m_chunk_num) { + } else if (one.chunk_num() > two.chunk_num()) { return 1; } - if (one.m_blk_num > two.m_blk_num) { + if (one.blk_num() < two.blk_num()) { return -1; - } else if (one.m_blk_num < two.m_blk_num) { + } else if (one.blk_num() > two.blk_num()) { return 1; } - if (one.m_nblks > two.m_nblks) { + if (one.blk_count() < two.blk_count()) { return -1; - } else if (one.m_nblks < two.m_nblks) { + } else if (one.blk_count() > two.blk_count()) { return 1; } return 0; } -uint64_t BlkId::to_integer() const { - const uint64_t val{m_blk_num | (static_cast< uint64_t >(m_nblks) << BLK_NUM_BITS) | - (static_cast< uint64_t >(m_chunk_num) << (BLK_NUM_BITS + NBLKS_BITS))}; - return val; -} - -BlkId::BlkId(uint64_t id_int) { set(id_int); } -BlkId::BlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) { set(blk_num, nblks, chunk_num); } - -void BlkId::invalidate() { set(blk_num_t{0}, blk_count_t{0}, s_chunk_num_mask); } +//////////////////////////////////// MultiBlkId Section ////////////////////////////// +MultiBlkId::MultiBlkId() : BlkId::BlkId() { s.m_is_multi = 1; } -bool BlkId::is_valid() const { return (m_chunk_num != s_chunk_num_mask); } +MultiBlkId::MultiBlkId(BlkId const& b) : BlkId::BlkId(b) { s.m_is_multi = 1; } -BlkId BlkId::get_blkid_at(uint32_t offset, uint32_t pagesz) const { - assert(offset % pagesz == 0); - const uint32_t remaining_size{((get_nblks() - (offset / pagesz)) * pagesz)}; - return (get_blkid_at(offset, remaining_size, pagesz)); +MultiBlkId::MultiBlkId(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) : + BlkId::BlkId{blk_num, nblks, chunk_num} { + s.m_is_multi = 1; } -BlkId BlkId::get_blkid_at(uint32_t offset, uint32_t size, uint32_t pagesz) const { - assert(size % pagesz == 0); - assert(offset % pagesz == 0); +void MultiBlkId::add(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) { + if (BlkId::is_valid()) { + RELEASE_ASSERT_EQ(s.m_chunk_num, chunk_num, "MultiBlkId has to be all from same chunk"); + RELEASE_ASSERT_LT(n_addln_piece, max_addln_pieces, "MultiBlkId cannot support more than {} pieces", + max_addln_pieces + 1); + addln_pieces[n_addln_piece] = chain_blkid{.m_blk_num = blk_num, .m_nblks = 
nblks}; + ++n_addln_piece; + } else { + s = BlkId::serialized{0x1, blk_num, nblks, chunk_num}; + } +} - BlkId other; +void MultiBlkId::add(BlkId const& b) { add(b.blk_num(), b.blk_count(), b.chunk_num()); } - other.set_blk_num(get_blk_num() + (offset / pagesz)); - other.set_nblks(size / pagesz); - other.set_chunk_num(get_chunk_num()); +sisl::blob MultiBlkId::serialize() { return sisl::blob{r_cast< uint8_t* >(this), serialized_size()}; } - assert(other.get_blk_num() < get_blk_num() + get_nblks()); - assert((other.get_blk_num() + other.get_nblks()) <= (get_blk_num() + get_nblks())); - return other; +uint32_t MultiBlkId::serialized_size() const { + uint32_t sz = BlkId::serialized_size(); + if (n_addln_piece != 0) { sz += sizeof(uint16_t) + (n_addln_piece * sizeof(chain_blkid)); } + return sz; } -void BlkId::set(blk_num_t blk_num, blk_count_t nblks, chunk_num_t chunk_num) { - set_blk_num(blk_num); - set_nblks(nblks); - set_chunk_num(chunk_num); +void MultiBlkId::deserialize(sisl::blob const& b, bool copy) { + MultiBlkId* other = r_cast< MultiBlkId* >(b.bytes); + s = other->s; + if (b.size == sizeof(BlkId)) { + n_addln_piece = 0; + } else { + n_addln_piece = other->n_addln_piece; + std::copy(other->addln_pieces.begin(), other->addln_pieces.begin() + other->n_addln_piece, + addln_pieces.begin()); + } } -void BlkId::set(const BlkId& bid) { set(bid.get_blk_num(), bid.get_nblks(), bid.get_chunk_num()); } +uint16_t MultiBlkId::num_pieces() const { return BlkId::is_valid() ? n_addln_piece + 1 : 0; } -void BlkId::set(uint64_t id_int) { - HS_DBG_ASSERT_LE(id_int, max_id_int()); - m_blk_num = (id_int & s_blk_num_mask); - m_nblks = static_cast< blk_count_t >((id_int >> BLK_NUM_BITS) & s_nblks_mask); - m_chunk_num = static_cast< chunk_num_t >((id_int >> (BLK_NUM_BITS + NBLKS_BITS)) & s_chunk_num_mask); -} +bool MultiBlkId::has_room() const { return (n_addln_piece < max_addln_pieces); } + +MultiBlkId::iterator MultiBlkId::iterate() const { return MultiBlkId::iterator{*this}; } -void BlkId::set_blk_num(blk_num_t blk_num) { - HS_DBG_ASSERT_LE(blk_num, s_blk_num_mask); - m_blk_num = blk_num; +std::string MultiBlkId::to_string() const { + std::string str = "MultiBlks: {"; + auto it = iterate(); + while (auto const b = it.next()) { + str += b->to_string(); + } + str += std::string("}"); + return str; } -void BlkId::set_nblks(blk_count_t nblks) { - HS_DBG_ASSERT_LE(nblks, max_blks_in_op()); - m_nblks = static_cast< blk_count_serialized_t >(nblks - 1); +blk_count_t MultiBlkId::blk_count() const { + blk_count_t nblks{0}; + auto it = iterate(); + while (auto b = it.next()) { + nblks += b->blk_count(); + } + return nblks; } -void BlkId::set_chunk_num(chunk_num_t chunk_num) { - HS_DBG_ASSERT_LE(chunk_num, s_chunk_num_mask); - m_chunk_num = chunk_num; +BlkId MultiBlkId::to_single_blkid() const { + HS_DBG_ASSERT_LE(num_pieces(), 1, "Can only MultiBlkId with one piece to BlkId"); + return BlkId{blk_num(), blk_count(), chunk_num()}; } -std::string BlkId::to_string() const { - return is_valid() ? 
fmt::format("BlkNum={} nblks={} chunk={}", get_blk_num(), get_nblks(), get_chunk_num()) - : "Invalid_Blkid"; +int MultiBlkId::compare(MultiBlkId const& left, MultiBlkId const& right) { + if (left.chunk_num() < right.chunk_num()) { + return -1; + } else if (left.chunk_num() > right.chunk_num()) { + return 1; + } + + // Shortcut path for simple BlkId search to avoid building icl set + if ((left.num_pieces() == 1) && (right.num_pieces() == 1)) { + return BlkId::compare(d_cast< BlkId const& >(left), d_cast< BlkId const& >(right)); + } + + using IntervalSet = boost::icl::interval_set< uint64_t >; + using Interval = IntervalSet::interval_type; + + IntervalSet lset; + auto lit = left.iterate(); + while (auto b = lit.next()) { + lset.insert(Interval::right_open(b->blk_num(), b->blk_num() + b->blk_count())); + } + + IntervalSet rset; + auto rit = right.iterate(); + while (auto b = rit.next()) { + rset.insert(Interval::right_open(b->blk_num(), b->blk_num() + b->blk_count())); + } + + if (lset < rset) { + return -1; + } else if (lset > rset) { + return 1; + } else { + return 0; + } } } // namespace homestore diff --git a/src/lib/blkalloc/blk_allocator.cpp b/src/lib/blkalloc/blk_allocator.cpp index 4aa3269d0..92c44df53 100644 --- a/src/lib/blkalloc/blk_allocator.cpp +++ b/src/lib/blkalloc/blk_allocator.cpp @@ -66,41 +66,54 @@ bool BlkAllocator::is_blk_alloced_on_disk(const BlkId& b, bool use_lock) const { if (!auto_recovery_on()) { return true; // nothing to compare. So always return true } - auto bits_set{[this, &b]() { - if (!get_disk_bm_const()->is_bits_set(b.get_blk_num(), b.get_nblks())) { return false; } + auto bits_set = [this](BlkId const& b) { + if (!get_disk_bm_const()->is_bits_set(b.blk_num(), b.blk_count())) { return false; } return true; - }}; + }; + if (use_lock) { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); + const BlkAllocPortion& portion = blknum_to_portion_const(b.blk_num()); auto lock{portion.portion_auto_lock()}; - return bits_set(); + return bits_set(b); } else { - return bits_set(); + return bits_set(b); } } -BlkAllocStatus BlkAllocator::alloc_on_disk(const BlkId& in_bid) { +BlkAllocStatus BlkAllocator::alloc_on_disk(BlkId const& bid) { if (!auto_recovery_on() && m_inited) { return BlkAllocStatus::FAILED; } rcu_read_lock(); auto list = get_alloc_blk_list(); if (list) { // cp has started, accumulating to the list - list->push_back(in_bid); + list->push_back(bid); } else { + auto set_on_disk_bm = [this](auto& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_reset(b.blk_num(), b.blk_count()), + "Expected disk blks to reset"); + } + get_disk_bm_mutable()->set_bits(b.blk_num(), b.blk_count()); + portion.decrease_available_blocks(b.blk_count()); + BLKALLOC_LOG(DEBUG, "blks allocated {} chunk number {}", b.to_string(), m_chunk_id); + } + }; + // cp is not started or already done, allocate on disk bm directly; /* enable this assert later when reboot is supported */ // assert(auto_recovery_on() || !m_inited); - BlkAllocPortion& portion = blknum_to_portion(in_bid.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_reset(in_bid.get_blk_num(), in_bid.get_nblks()), - "Expected disk blks to reset"); + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto b = it.next()) { + 
set_on_disk_bm(*b); } - get_disk_bm_mutable()->set_bits(in_bid.get_blk_num(), in_bid.get_nblks()); - portion.decrease_available_blocks(in_bid.get_nblks()); - BLKALLOC_LOG(DEBUG, "blks allocated {} chunk number {}", in_bid.to_string(), m_chunk_id); + } else { + set_on_disk_bm(bid); } } rcu_read_unlock(); @@ -108,29 +121,42 @@ BlkAllocStatus BlkAllocator::alloc_on_disk(const BlkId& in_bid) { return BlkAllocStatus::SUCCESS; } -BlkAllocStatus BlkAllocator::alloc_on_realtime(const BlkId& b) { +BlkAllocStatus BlkAllocator::alloc_on_realtime(BlkId const& bid) { if (!realtime_bm_on()) { return BlkAllocStatus::SUCCESS; } if (!auto_recovery_on() && m_inited) { return BlkAllocStatus::FAILED; } - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - if (!get_realtime_bm()->is_bits_reset(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_LOG(ERROR, "bit not reset {} nblks {} chunk number {}", b.get_blk_num(), b.get_nblks(), - m_chunk_id); - for (blk_count_t i{0}; i < b.get_nblks(); ++i) { - if (!get_disk_bm_const()->is_bits_reset(b.get_blk_num() + i, 1)) { - BLKALLOC_LOG(ERROR, "bit not reset {}", b.get_blk_num() + i); + + auto set_on_realtime_bm = [this](BlkId const& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + if (!get_realtime_bm()->is_bits_reset(b.blk_num(), b.blk_count())) { + BLKALLOC_LOG(ERROR, "bit not reset {} nblks {} chunk number {}", b.blk_num(), b.blk_count(), + m_chunk_id); + for (blk_count_t i{0}; i < b.blk_count(); ++i) { + if (!get_disk_bm_const()->is_bits_reset(b.blk_num() + i, 1)) { + BLKALLOC_LOG(ERROR, "bit not reset {}", b.blk_num() + i); + } } + BLKALLOC_REL_ASSERT(get_realtime_bm()->is_bits_reset(b.blk_num(), b.blk_count()), + "Expected disk bits to reset blk num {} num blks {}", b.blk_num(), + b.blk_count()); } - BLKALLOC_REL_ASSERT(get_realtime_bm()->is_bits_reset(b.get_blk_num(), b.get_nblks()), - "Expected disk bits to reset blk num {} num blks {}", b.get_blk_num(), - b.get_nblks()); } + get_realtime_bm()->set_bits(b.blk_num(), b.blk_count()); + BLKALLOC_LOG(DEBUG, "realtime blks allocated {} chunk number {}", b.to_string(), m_chunk_id); + } + }; + + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + set_on_realtime_bm(*b); } - get_realtime_bm()->set_bits(b.get_blk_num(), b.get_nblks()); - BLKALLOC_LOG(DEBUG, "realtime blks allocated {} chunk number {}", b.to_string(), m_chunk_id); + } else { + set_on_realtime_bm(bid); } return BlkAllocStatus::SUCCESS; @@ -139,60 +165,90 @@ BlkAllocStatus BlkAllocator::alloc_on_realtime(const BlkId& b) { // // Caller should consume the return value and print context when return false; // -bool BlkAllocator::free_on_realtime(const BlkId& b) { +bool BlkAllocator::free_on_realtime(BlkId const& bid) { if (!realtime_bm_on()) { return true; } /* this api should be called only when auto recovery is enabled */ assert(auto_recovery_on()); - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - /* During recovery we might try to free the entry which is already freed while replaying the journal, - * This assert is valid only post recovery. 
- */ - if (!get_realtime_bm()->is_bits_set(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_LOG(ERROR, "{}, bit not set {} nblks{} chunk number {}", b.to_string(), b.get_blk_num(), - b.get_nblks(), m_chunk_id); - for (blk_count_t i{0}; i < b.get_nblks(); ++i) { - if (!get_realtime_bm()->is_bits_set(b.get_blk_num() + i, 1)) { - BLKALLOC_LOG(ERROR, "bit not set {}", b.get_blk_num() + i); + + auto unset_on_realtime_bm = [this](BlkId const& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + /* During recovery we might try to free the entry which is already freed while replaying the journal, + * This assert is valid only post recovery. + */ + if (!get_realtime_bm()->is_bits_set(b.blk_num(), b.blk_count())) { + BLKALLOC_LOG(ERROR, "{}, bit not set {} nblks{} chunk number {}", b.to_string(), b.blk_num(), + b.blk_count(), m_chunk_id); + for (blk_count_t i{0}; i < b.blk_count(); ++i) { + if (!get_realtime_bm()->is_bits_set(b.blk_num() + i, 1)) { + BLKALLOC_LOG(ERROR, "bit not set {}", b.blk_num() + i); + } } + return false; } - return false; } + + BLKALLOC_LOG(DEBUG, "realtime: free bid: {}", b.to_string()); + get_realtime_bm()->reset_bits(b.blk_num(), b.blk_count()); + return true; } + }; - BLKALLOC_LOG(DEBUG, "realtime: free bid: {}", b.to_string()); - get_realtime_bm()->reset_bits(b.get_blk_num(), b.get_nblks()); - return true; + bool ret{true}; + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + if (!unset_on_realtime_bm(*b)) { + ret = false; + break; + } + } + } else { + ret = unset_on_realtime_bm(bid); } + return ret; } -void BlkAllocator::free_on_disk(const BlkId& b) { +void BlkAllocator::free_on_disk(BlkId const& bid) { /* this api should be called only when auto recovery is enabled */ assert(auto_recovery_on()); - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto lock{portion.portion_auto_lock()}; - if (m_inited) { - /* During recovery we might try to free the entry which is already freed while replaying the journal, - * This assert is valid only post recovery. - */ - if (!get_disk_bm_const()->is_bits_set(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_LOG(ERROR, "bit not set {} nblks {} chunk number {}", b.get_blk_num(), b.get_nblks(), - m_chunk_id); - for (blk_count_t i{0}; i < b.get_nblks(); ++i) { - if (!get_disk_bm_const()->is_bits_set(b.get_blk_num() + i, 1)) { - BLKALLOC_LOG(ERROR, "bit not set {}", b.get_blk_num() + i); + auto unset_on_disk_bm = [this](auto& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto lock{portion.portion_auto_lock()}; + if (m_inited) { + /* During recovery we might try to free the entry which is already freed while replaying the journal, + * This assert is valid only post recovery. 
+ */ + if (!get_disk_bm_const()->is_bits_set(b.blk_num(), b.blk_count())) { + BLKALLOC_LOG(ERROR, "bit not set {} nblks {} chunk number {}", b.blk_num(), b.blk_count(), + m_chunk_id); + for (blk_count_t i{0}; i < b.blk_count(); ++i) { + if (!get_disk_bm_const()->is_bits_set(b.blk_num() + i, 1)) { + BLKALLOC_LOG(ERROR, "bit not set {}", b.blk_num() + i); + } } + BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(b.blk_num(), b.blk_count()), + "Expected disk bits to set blk num {} num blks {}", b.blk_num(), b.blk_count()); } - BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(b.get_blk_num(), b.get_nblks()), - "Expected disk bits to set blk num {} num blks {}", b.get_blk_num(), b.get_nblks()); } + get_disk_bm_mutable()->reset_bits(b.blk_num(), b.blk_count()); + portion.increase_available_blocks(b.blk_count()); + } + }; + + if (bid.is_multi()) { + MultiBlkId const& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + unset_on_disk_bm(*b); } - get_disk_bm_mutable()->reset_bits(b.get_blk_num(), b.get_nblks()); - portion.increase_available_blocks(b.get_nblks()); + } else { + unset_on_disk_bm(bid); } } @@ -229,9 +285,9 @@ void BlkAllocator::create_debug_bm() { } void BlkAllocator::update_debug_bm(const BlkId& bid) { - BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(bid.get_blk_num(), bid.get_nblks()), - "Expected disk bits to set blk num {} num blks {}", bid.get_blk_num(), bid.get_nblks()); - get_debug_bm()->set_bits(bid.get_blk_num(), bid.get_nblks()); + BLKALLOC_REL_ASSERT(get_disk_bm_const()->is_bits_set(bid.blk_num(), bid.blk_count()), + "Expected disk bits to set blk num {} num blks {}", bid.blk_num(), bid.blk_count()); + get_debug_bm()->set_bits(bid.blk_num(), bid.blk_count()); } bool BlkAllocator::verify_debug_bm(bool free_debug_bm) { diff --git a/src/lib/blkalloc/blk_allocator.h b/src/lib/blkalloc/blk_allocator.h index 43f49842e..fb75bc0f4 100644 --- a/src/lib/blkalloc/blk_allocator.h +++ b/src/lib/blkalloc/blk_allocator.h @@ -63,8 +63,8 @@ struct BlkAllocConfig { public: const uint32_t m_blk_size; const uint32_t m_align_size; - const blk_cap_t m_capacity; - const blk_cap_t m_blks_per_portion; + const blk_num_t m_capacity; + const blk_num_t m_blks_per_portion; const std::string m_unique_name; bool m_auto_recovery{false}; bool m_realtime_bm_on{false}; // only specifically turn off in BlkAlloc Test; @@ -74,7 +74,7 @@ struct BlkAllocConfig { bool realtime_bm_on = true) : m_blk_size{blk_size}, m_align_size{align_size}, - m_capacity{static_cast< blk_cap_t >(size / blk_size)}, + m_capacity{static_cast< blk_num_t >(size / blk_size)}, m_blks_per_portion{std::min(HS_DYNAMIC_CONFIG(blkallocator.num_blks_per_portion), m_capacity)}, m_unique_name{name} { #ifdef _PRERELEASE @@ -86,9 +86,9 @@ struct BlkAllocConfig { #endif } - BlkAllocConfig(const BlkAllocConfig&) = default; + BlkAllocConfig(BlkAllocConfig const&) = default; BlkAllocConfig(BlkAllocConfig&&) noexcept = delete; - BlkAllocConfig& operator=(const BlkAllocConfig&) = default; + BlkAllocConfig& operator=(BlkAllocConfig const&) = default; BlkAllocConfig& operator=(BlkAllocConfig&&) noexcept = delete; virtual ~BlkAllocConfig() = default; void set_auto_recovery(bool is_auto_recovery) { m_auto_recovery = is_auto_recovery; } @@ -118,9 +118,9 @@ class BlkAllocPortion { public: BlkAllocPortion(blk_temp_t temp = default_temperature()) : m_temperature(temp) {} ~BlkAllocPortion() = default; - BlkAllocPortion(const BlkAllocPortion&) = delete; + BlkAllocPortion(BlkAllocPortion const&) = 
delete; BlkAllocPortion(BlkAllocPortion&&) noexcept = delete; - BlkAllocPortion& operator=(const BlkAllocPortion&) = delete; + BlkAllocPortion& operator=(BlkAllocPortion const&) = delete; BlkAllocPortion& operator=(BlkAllocPortion&&) noexcept = delete; auto portion_auto_lock() const { return std::scoped_lock< std::mutex >(m_blk_lock); } @@ -175,20 +175,19 @@ class BlkAllocPortion { class CP; class BlkAllocator { public: - BlkAllocator(const BlkAllocConfig& cfg, chunk_num_t id = 0); - BlkAllocator(const BlkAllocator&) = delete; + BlkAllocator(BlkAllocConfig const& cfg, chunk_num_t id = 0); + BlkAllocator(BlkAllocator const&) = delete; BlkAllocator(BlkAllocator&&) noexcept = delete; - BlkAllocator& operator=(const BlkAllocator&) = delete; + BlkAllocator& operator=(BlkAllocator const&) = delete; BlkAllocator& operator=(BlkAllocator&&) noexcept = delete; virtual ~BlkAllocator() = default; - virtual BlkAllocStatus alloc(BlkId& bid) = 0; - virtual BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) = 0; - virtual void free(const std::vector< BlkId >& blk_ids) = 0; - virtual void free(const BlkId& id) = 0; - virtual blk_cap_t available_blks() const = 0; - virtual blk_cap_t get_used_blks() const = 0; - virtual bool is_blk_alloced(const BlkId& b, bool use_lock = false) const = 0; + virtual BlkAllocStatus alloc_contiguous(BlkId& bid) = 0; + virtual BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) = 0; + virtual void free(BlkId const& id) = 0; + virtual blk_num_t available_blks() const = 0; + virtual blk_num_t get_used_blks() const = 0; + virtual bool is_blk_alloced(BlkId const& b, bool use_lock = false) const = 0; virtual std::string to_string() const = 0; virtual void cp_flush(CP* cp); // TODO: it needs to be a pure virtual function after bitmap blkallocator is derived @@ -217,30 +216,23 @@ class BlkAllocator { void decr_alloced_blk_count(blk_count_t nblks) { m_alloced_blk_count.fetch_sub(nblks, std::memory_order_relaxed); } int64_t get_alloced_blk_count() const { return m_alloced_blk_count.load(std::memory_order_acquire); } - bool is_blk_alloced_on_disk(const BlkId& b, bool use_lock = false) const; /* It is used during recovery in both mode :- auto recovery and manual recovery * It is also used in normal IO during auto recovery mode. */ - BlkAllocStatus alloc_on_disk(const BlkId& in_bid); + BlkAllocStatus alloc_on_disk(BlkId const& in_bid); - BlkAllocStatus alloc_on_realtime(const BlkId& b); + BlkAllocStatus alloc_on_realtime(BlkId const& b); + + bool is_blk_alloced_on_disk(BlkId const& b, bool use_lock = false) const; // // Caller should consume the return value and print context when return false; // + [[nodiscard]] bool free_on_realtime(BlkId const& b); - bool free_on_realtime(const BlkId& b); - - void free_on_disk(const BlkId& b); - - // Acquire the underlying bitmap buffer and while the caller has acquired, all the new allocations - // will be captured in a separate list and then pushes into buffer once released. - // NOTE: THIS IS NON-THREAD SAFE METHOD. Caller is expected to ensure synchronization between multiple - // acquires/releases - sisl::byte_array acquire_underlying_buffer(); - void release_underlying_buffer(); + void free_on_disk(BlkId const& b); /* CP start is called when all its consumers have purged their free lists and now want to persist the * disk bitmap. 
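The free-side declarations above take a plain BlkId that may in fact be a MultiBlkId; the matching .cpp changes earlier in this patch cast and walk the pieces via iterate()/next(). A minimal caller-side sketch of the [[nodiscard]] free_on_realtime() contract follows; it is illustrative only, the helper name is invented, and it assumes the declarations in this header plus the BLKALLOC_LOG macro used elsewhere in this patch are visible at the call site.

    // Sketch only, not part of the patch: consume the bool result and log caller context on failure,
    // as the "Caller should consume the return value" comment above asks.
    inline void free_realtime_checked(BlkAllocator& ba, BlkId const& bid) {
        if (!ba.free_on_realtime(bid)) {
            BLKALLOC_LOG(ERROR, "realtime bitmap free failed for blkid={}", bid.to_string());
        }
    }
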
@@ -250,21 +242,21 @@ class BlkAllocator { // void cp_done(); uint32_t get_align_size() const { return m_align_size; } - blk_cap_t get_total_blks() const { return m_num_blks; } - blk_cap_t get_blks_per_portion() const { return m_blks_per_portion; } - blk_cap_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } + blk_num_t get_total_blks() const { return m_num_blks; } + blk_num_t get_blks_per_portion() const { return m_blks_per_portion; } + blk_num_t get_num_portions() const { return (m_num_blks - 1) / m_blks_per_portion + 1; } const std::string& get_name() const { return m_name; } bool auto_recovery_on() const { return m_auto_recovery; } uint32_t get_blk_size() const { return m_blk_size; } blk_num_t blknum_to_portion_num(const blk_num_t blknum) const { return blknum / m_blks_per_portion; } BlkAllocPortion& blknum_to_portion(blk_num_t blknum) { return m_blk_portions[blknum_to_portion_num(blknum)]; } - const BlkAllocPortion& blknum_to_portion_const(blk_num_t blknum) const { + BlkAllocPortion const& blknum_to_portion_const(blk_num_t blknum) const { return m_blk_portions[blknum_to_portion_num(blknum)]; } void create_debug_bm(); - void update_debug_bm(const BlkId& bid); + void update_debug_bm(BlkId const& bid); bool verify_debug_bm(bool free_debug_bm); /* Get status */ @@ -278,12 +270,19 @@ class BlkAllocator { sisl::ThreadVector< BlkId >* get_alloc_blk_list(); void set_disk_bm_dirty() { is_disk_bm_dirty = true; } + // Acquire the underlying bitmap buffer and while the caller has acquired, all the new allocations + // will be captured in a separate list and then pushes into buffer once released. + // NOTE: THIS IS NON-THREAD SAFE METHOD. Caller is expected to ensure synchronization between multiple + // acquires/releases + sisl::byte_array acquire_underlying_buffer(); + void release_underlying_buffer(); + protected: const std::string m_name; const uint32_t m_blk_size; const uint32_t m_align_size; - const blk_cap_t m_num_blks; - blk_cap_t m_blks_per_portion; + const blk_num_t m_num_blks; + blk_num_t m_blks_per_portion; const bool m_auto_recovery{false}; const bool m_realtime_bm_on{false}; // only specifically turn off in BlkAlloc Test; bool m_inited{false}; @@ -307,22 +306,21 @@ class BlkAllocator { */ class FixedBlkAllocator : public BlkAllocator { public: - FixedBlkAllocator(const BlkAllocConfig& cfg, bool init, chunk_num_t chunk_id); - FixedBlkAllocator(const FixedBlkAllocator&) = delete; + FixedBlkAllocator(BlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id); + FixedBlkAllocator(FixedBlkAllocator const&) = delete; FixedBlkAllocator(FixedBlkAllocator&&) noexcept = delete; - FixedBlkAllocator& operator=(const FixedBlkAllocator&) = delete; + FixedBlkAllocator& operator=(FixedBlkAllocator const&) = delete; FixedBlkAllocator& operator=(FixedBlkAllocator&&) noexcept = delete; ~FixedBlkAllocator() override = default; - BlkAllocStatus alloc(BlkId& bid) override; - BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) override; - void free(const std::vector< BlkId >& blk_ids) override; - void free(const BlkId& b) override; + BlkAllocStatus alloc_contiguous(BlkId& bid) override; + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; + void free(BlkId const& b) override; void inited() override; - blk_cap_t available_blks() const override; - blk_cap_t get_used_blks() const override; - bool is_blk_alloced(const BlkId& in_bid, bool use_lock = false) const override; + blk_num_t 
available_blks() const override; + blk_num_t get_used_blks() const override; + bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; private: diff --git a/src/lib/blkalloc/blk_cache.h b/src/lib/blkalloc/blk_cache.h index 7b0c5d0e2..132230465 100644 --- a/src/lib/blkalloc/blk_cache.h +++ b/src/lib/blkalloc/blk_cache.h @@ -30,7 +30,7 @@ #include "common/homestore_assert.hpp" namespace homestore { -typedef blk_count_t slab_idx_t; +using slab_idx_t = blk_count_t; static constexpr uint16_t slab_tbl_size{257}; @@ -77,30 +77,30 @@ struct blk_cache_entry { blk_cache_entry() : blk_cache_entry{0, 0, 0} {} blk_cache_entry(const blk_num_t blk_num, const blk_count_t nblks, const blk_temp_t temp) { set_blk_num(blk_num); - set_nblks(nblks); + set_blk_count(nblks); set_temperature(temp); } void set_blk_num(const blk_num_t blk_num) { m_blk_num = blk_num; } [[nodiscard]] blk_num_t get_blk_num() const { return m_blk_num; } - void set_nblks(const blk_count_t nblks) { - HS_DBG_ASSERT_LE(nblks, BlkId::max_blks_in_op()); - m_nblks = static_cast< blk_count_serialized_t >(nblks - 1); + void set_blk_count(const blk_count_t nblks) { + HS_DBG_ASSERT_LE(nblks, max_blks_per_blkid()); + m_nblks = nblks; } - [[nodiscard]] blk_count_t get_nblks() const { return static_cast< blk_count_t >(m_nblks) + 1; } + [[nodiscard]] blk_count_t blk_count() const { return m_nblks; } void set_temperature(const blk_temp_t temp) { m_temp = temp; } [[nodiscard]] blk_temp_t get_temperature() const { return m_temp; } [[nodiscard]] std::string to_string() const { - return fmt::format("BlkNum={} nblks={} temp={}", get_blk_num(), get_nblks(), get_temperature()); + return fmt::format("BlkNum={} nblks={} temp={}", get_blk_num(), blk_count(), get_temperature()); } private: - blk_num_t m_blk_num; // Blk number within the chunk - blk_count_serialized_t m_nblks; // Total number of blocks - blk_temp_t m_temp; // Temperature of each page + blk_num_t m_blk_num; // Blk number within the chunk + blk_count_t m_nblks; // Total number of blocks + blk_temp_t m_temp; // Temperature of each page }; #pragma pack() @@ -143,8 +143,8 @@ struct blk_cache_fill_req { }; struct blk_cache_refill_status { - blk_cap_t slab_required_count{0}; - blk_cap_t slab_refilled_count{0}; + blk_num_t slab_required_count{0}; + blk_num_t slab_refilled_count{0}; [[nodiscard]] bool need_refill() const { return (slab_required_count && (slab_refilled_count != slab_required_count)); @@ -160,9 +160,9 @@ struct blk_cache_refill_status { struct blk_cache_fill_session { uint64_t session_id; std::vector< blk_cache_refill_status > slab_requirements; // A slot for each slab about count of required/refilled - blk_cap_t overall_refilled_num_blks{0}; + blk_num_t overall_refilled_num_blks{0}; bool overall_refill_done{false}; - std::atomic< blk_cap_t > urgent_refill_blks_count{0}; // Send notification after approx this much blks refilled + std::atomic< blk_num_t > urgent_refill_blks_count{0}; // Send notification after approx this much blks refilled [[nodiscard]] static uint64_t gen_session_id() { static std::atomic< uint64_t > s_session_id{1}; @@ -179,7 +179,7 @@ struct blk_cache_fill_session { slab_requirements.reserve(num_slabs); } - void urgent_need_atleast(const blk_cap_t wait_count) { + void urgent_need_atleast(const blk_num_t wait_count) { urgent_refill_blks_count.store(overall_refilled_num_blks + wait_count, std::memory_order_release); } @@ -211,7 +211,7 @@ struct blk_cache_fill_session { struct SlabCacheConfig { struct 
_slab_config { blk_count_t slab_size; // Size of this slab (in terms of number of blks) - blk_cap_t max_entries; // Max entries allowed in this slab + blk_num_t max_entries; // Max entries allowed in this slab float refill_threshold_pct; // At what percentage empty should we start refilling this slab cache std::vector< float > m_level_distribution_pct; // How to distribute entries into multiple levels std::string m_name; // Name of the base blk allocator @@ -257,7 +257,7 @@ class FreeBlkCache { std::vector< blk_cache_entry >& excess_blks) = 0; [[maybe_unused]] virtual blk_count_t try_free_blks(const std::vector< blk_cache_entry >& blks, std::vector< blk_cache_entry >& excess_blks) = 0; - [[nodiscard]] virtual blk_cap_t try_fill_cache(const blk_cache_fill_req& fill_req, + [[nodiscard]] virtual blk_num_t try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) = 0; [[nodiscard]] virtual std::shared_ptr< blk_cache_fill_session > @@ -268,8 +268,7 @@ class FreeBlkCache { [[nodiscard]] static slab_idx_t find_slab(const blk_count_t nblks) { if (sisl_unlikely(nblks >= slab_tbl_size)) { - return static_cast< slab_idx_t >((nblks > 1) ? sisl::logBase2(static_cast< blk_count_t >(nblks - 1)) + 1 - : 0); + return s_cast< slab_idx_t >((nblks > 1) ? sisl::logBase2(s_cast< blk_count_t >(nblks - 1)) + 1 : 0); } return nblks_to_slab_tbl[nblks]; } diff --git a/src/lib/blkalloc/blk_cache_queue.cpp b/src/lib/blkalloc/blk_cache_queue.cpp index 5478ce487..87cda9f3e 100644 --- a/src/lib/blkalloc/blk_cache_queue.cpp +++ b/src/lib/blkalloc/blk_cache_queue.cpp @@ -26,16 +26,16 @@ FreeBlkCacheQueue::FreeBlkCacheQueue(const SlabCacheConfig& cfg, BlkAllocMetrics m_slab_queues.reserve(cfg.m_per_slab_cfg.size()); for (const auto& slab_cfg : cfg.m_per_slab_cfg) { - std::vector< blk_cap_t > level_limits; + std::vector< blk_num_t > level_limits; level_limits.reserve(slab_cfg.m_level_distribution_pct.size()); #ifndef NDEBUG HS_DBG_ASSERT_EQ(slab_cfg.slab_size, slab_size, "Slab config size is not contiguous power of 2"); slab_size *= 2; #endif - blk_cap_t sum{0}; + blk_num_t sum{0}; for (const auto& p : slab_cfg.m_level_distribution_pct) { - const blk_cap_t limit{static_cast< blk_cap_t >((static_cast< double >(slab_cfg.max_entries) * p) / 100.0)}; + const blk_num_t limit{static_cast< blk_num_t >((static_cast< double >(slab_cfg.max_entries) * p) / 100.0)}; sum += limit; level_limits.push_back(limit); } @@ -51,10 +51,10 @@ FreeBlkCacheQueue::FreeBlkCacheQueue(const SlabCacheConfig& cfg, BlkAllocMetrics } BlkAllocStatus FreeBlkCacheQueue::try_alloc_blks(const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp) { - const auto slab_idx{std::min(FreeBlkCache::find_slab(req.nblks), req.max_slab_idx)}; + const auto slab_idx = std::min(FreeBlkCache::find_slab(req.nblks), req.max_slab_idx); COUNTER_INCREMENT(slab_metrics(slab_idx), num_slab_alloc, 1); - BlkAllocStatus status{try_alloc_in_slab(slab_idx, req, resp)}; + BlkAllocStatus status = try_alloc_in_slab(slab_idx, req, resp); if (status == BlkAllocStatus::SUCCESS) { BLKALLOC_LOG(TRACE, "Alloced in slab {}", resp.out_blks.front().to_string()); return status; @@ -95,8 +95,8 @@ blk_count_t FreeBlkCacheQueue::try_free_blks(const blk_cache_entry& entry, blk_cache_entry e{entry}; blk_count_t num_zombied{0}; - while (e.get_nblks() > 0) { - const auto [slab_idx, excess]{FreeBlkCache::find_round_down_slab(e.get_nblks())}; + while (e.blk_count() > 0) { + const auto [slab_idx, excess]{FreeBlkCache::find_round_down_slab(e.blk_count())}; #ifndef NDEBUG if 
(slab_idx >= m_slab_queues.size()) { BLKALLOC_LOG(ERROR, "Entry=[{}] slab_idx={} exceeds max slab queues {}", entry.to_string(), slab_idx, @@ -104,15 +104,15 @@ blk_count_t FreeBlkCacheQueue::try_free_blks(const blk_cache_entry& entry, } #endif - e.set_nblks(m_slab_queues[slab_idx]->get_slab_size()); + e.set_blk_count(m_slab_queues[slab_idx]->get_slab_size()); if (!push_slab(slab_idx, e, false /* only_this_level */)) { excess_blks.push_back(e); - num_zombied += e.get_nblks(); + num_zombied += e.blk_count(); } if (excess == 0) { break; } e.set_blk_num(e.get_blk_num() + m_slab_queues[slab_idx]->get_slab_size()); - e.set_nblks(excess); + e.set_blk_count(excess); } return num_zombied; @@ -129,8 +129,8 @@ blk_count_t FreeBlkCacheQueue::try_free_blks(const std::vector< blk_cache_entry return num_zombied; } -blk_cap_t FreeBlkCacheQueue::try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) { - blk_cap_t nblks_added{0}; +blk_num_t FreeBlkCacheQueue::try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) { + blk_num_t nblks_added{0}; slab_idx_t slabs_pending_refill{static_cast< slab_idx_t >(m_slab_queues.size())}; auto slab_idx{FreeBlkCache::find_slab(fill_req.nblks)}; @@ -167,8 +167,8 @@ blk_cap_t FreeBlkCacheQueue::try_fill_cache(const blk_cache_fill_req& fill_req, return (fill_req.nblks - nblks_remain); } -blk_cap_t FreeBlkCacheQueue::total_free_blks() const { - blk_cap_t count{0}; +blk_num_t FreeBlkCacheQueue::total_free_blks() const { + blk_num_t count{0}; for (const auto& sq : m_slab_queues) { count += sq->entry_count() * sq->slab_size(); } @@ -194,13 +194,13 @@ BlkAllocStatus FreeBlkCacheQueue::try_alloc_in_slab(const slab_idx_t slab_idx, c "Residue block count are not expected to exceed last entry"); const blk_count_t needed_blocks{ static_cast< blk_count_t >(m_slab_queues[slab_idx]->slab_size() - residue_nblks)}; - resp.out_blks.back().set_nblks(needed_blocks); + resp.out_blks.back().set_blk_count(needed_blocks); resp.nblks_alloced -= residue_nblks; // Create the trail residue entry and use that to free them. auto residue_e{resp.out_blks.back()}; residue_e.set_blk_num(residue_e.get_blk_num() + needed_blocks); - residue_e.set_nblks(residue_nblks); + residue_e.set_blk_count(residue_nblks); BLKALLOC_LOG(TRACE, "Residue blocks {}", residue_e.to_string()); resp.nblks_zombied += try_free_blks(residue_e, resp.excess_blks); } @@ -292,7 +292,7 @@ std::optional< blk_temp_t > FreeBlkCacheQueue::pop_slab(const slab_idx_t slab_id return ret; } -SlabCacheQueue::SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_cap_t >& level_limits, +SlabCacheQueue::SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_num_t >& level_limits, const float refill_pct, BlkAllocMetrics* parent_metrics) : m_slab_size{slab_size}, m_metrics{m_slab_size, this, parent_metrics} { for (auto& limit : level_limits) { @@ -305,8 +305,8 @@ SlabCacheQueue::SlabCacheQueue(const blk_count_t slab_size, const std::vector< b } std::optional< blk_temp_t > SlabCacheQueue::push(const blk_cache_entry& entry, const bool only_this_level) { - const blk_temp_t start_level{ - static_cast< blk_temp_t >((entry.get_temperature() >= m_level_queues.size()) ? m_level_queues.size() - 1 : entry.get_temperature())}; + const blk_temp_t start_level{static_cast< blk_temp_t >( + (entry.get_temperature() >= m_level_queues.size()) ? 
m_level_queues.size() - 1 : entry.get_temperature())}; blk_temp_t level{start_level}; bool pushed{m_level_queues[start_level]->write(entry)}; @@ -337,20 +337,20 @@ std::optional< blk_temp_t > SlabCacheQueue::pop(const blk_temp_t input_level, co return popped ? std::optional< blk_temp_t >{start_level} : std::nullopt; } -blk_cap_t SlabCacheQueue::entry_count() const { - blk_cap_t sz{0}; +blk_num_t SlabCacheQueue::entry_count() const { + blk_num_t sz{0}; for (size_t l{0}; l < m_level_queues.size(); ++l) { sz += num_level_entries(l); } return sz; } -blk_cap_t SlabCacheQueue::entry_capacity() const { return m_total_capacity; } +blk_num_t SlabCacheQueue::entry_capacity() const { return m_total_capacity; } -blk_cap_t SlabCacheQueue::num_level_entries(const blk_temp_t level) const { return m_level_queues[level]->sizeGuess(); } +blk_num_t SlabCacheQueue::num_level_entries(const blk_temp_t level) const { return m_level_queues[level]->sizeGuess(); } -blk_cap_t SlabCacheQueue::open_session(const uint64_t session_id, const bool fill_entire_cache) { - blk_cap_t count{0}; +blk_num_t SlabCacheQueue::open_session(const uint64_t session_id, const bool fill_entire_cache) { + blk_num_t count{0}; uint64_t id{m_refill_session.load(std::memory_order_acquire)}; if (id == 0) { diff --git a/src/lib/blkalloc/blk_cache_queue.h b/src/lib/blkalloc/blk_cache_queue.h index f9f51315d..87ac901bf 100644 --- a/src/lib/blkalloc/blk_cache_queue.h +++ b/src/lib/blkalloc/blk_cache_queue.h @@ -47,7 +47,7 @@ class SlabMetrics : public sisl::MetricsGroup { class SlabCacheQueue { public: - SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_cap_t >& level_limits, const float refill_pct, + SlabCacheQueue(const blk_count_t slab_size, const std::vector< blk_num_t >& level_limits, const float refill_pct, BlkAllocMetrics* metrics); SlabCacheQueue(const SlabCacheQueue&) = delete; SlabCacheQueue(SlabCacheQueue&&) noexcept = delete; @@ -58,15 +58,15 @@ class SlabCacheQueue { [[nodiscard]] std::optional< blk_temp_t > push(const blk_cache_entry& entry, const bool only_this_level); [[nodiscard]] std::optional< blk_temp_t > pop(const blk_temp_t level, const bool only_this_level, blk_cache_entry& out_entry); - [[nodiscard]] blk_cap_t entry_count() const; - [[nodiscard]] blk_cap_t entry_capacity() const; - [[nodiscard]] blk_cap_t num_level_entries(const blk_temp_t level) const; + [[nodiscard]] blk_num_t entry_count() const; + [[nodiscard]] blk_num_t entry_capacity() const; + [[nodiscard]] blk_num_t num_level_entries(const blk_temp_t level) const; [[nodiscard]] blk_num_t entries_needed(const blk_num_t nblks) const { return (nblks - 1) / m_slab_size + 1; } [[nodiscard]] blk_count_t slab_size() const { return m_slab_size; } void refilled(); - [[nodiscard]] blk_cap_t open_session(const uint64_t session_id, const bool fill_entire_cache); + [[nodiscard]] blk_num_t open_session(const uint64_t session_id, const bool fill_entire_cache); void close_session(const uint64_t session_id); [[nodiscard]] SlabMetrics& metrics() { return m_metrics; } @@ -74,11 +74,11 @@ class SlabCacheQueue { blk_count_t get_slab_size() const { return m_slab_size; } private: - blk_count_t m_slab_size; // Slab size in-terms of number of pages + blk_count_t m_slab_size; // Slab size in-terms of number of pages std::vector< std::unique_ptr< folly::MPMCQueue< blk_cache_entry > > > m_level_queues; std::atomic< uint64_t > m_refill_session{0}; // Is a refill pending for this slab - blk_cap_t m_total_capacity{0}; - blk_cap_t m_refill_threshold_limits; // For every level 
whats their threshold limit size + blk_num_t m_total_capacity{0}; + blk_num_t m_refill_threshold_limits; // For every level whats their threshold limit size SlabMetrics m_metrics; }; @@ -90,35 +90,29 @@ class FreeBlkCacheQueue : public FreeBlkCache { FreeBlkCacheQueue& operator=(const FreeBlkCacheQueue&) = delete; FreeBlkCacheQueue& operator=(FreeBlkCacheQueue&&) noexcept = delete; - [[nodiscard]] BlkAllocStatus try_alloc_blks(const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp) override; - [[maybe_unused]] blk_count_t try_free_blks(const blk_cache_entry& entry, - std::vector< blk_cache_entry >& excess_blks) override; - [[maybe_unused]] blk_count_t try_free_blks(const std::vector< blk_cache_entry >& blks, - std::vector< blk_cache_entry >& excess_blks) override; - [[nodiscard]] blk_cap_t try_fill_cache(const blk_cache_fill_req& fill_req, - blk_cache_fill_session& fill_session) override; + BlkAllocStatus try_alloc_blks(const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp) override; + blk_count_t try_free_blks(const blk_cache_entry& entry, std::vector< blk_cache_entry >& excess_blks) override; + blk_count_t try_free_blks(const std::vector< blk_cache_entry >& blks, + std::vector< blk_cache_entry >& excess_blks) override; + blk_num_t try_fill_cache(const blk_cache_fill_req& fill_req, blk_cache_fill_session& fill_session) override; - [[nodiscard]] blk_cap_t total_free_blks() const override; + blk_num_t total_free_blks() const override; - [[nodiscard]] std::shared_ptr< blk_cache_fill_session > create_cache_fill_session(const bool fill_entire_cache); + std::shared_ptr< blk_cache_fill_session > create_cache_fill_session(const bool fill_entire_cache); void close_cache_fill_session(blk_cache_fill_session& fill_session); private: - [[nodiscard]] BlkAllocStatus break_up(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, - blk_cache_alloc_resp& resp); - [[nodiscard]] BlkAllocStatus merge_down(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, - blk_cache_alloc_resp& resp); - [[nodiscard]] BlkAllocStatus try_alloc_in_slab(const slab_idx_t slab_num, const blk_cache_alloc_req& req, - blk_cache_alloc_resp& resp); - - [[nodiscard]] std::optional< blk_temp_t > push_slab(const slab_idx_t slab_idx, const blk_cache_entry& entry, - const bool only_this_level); - [[nodiscard]] std::optional< blk_temp_t > pop_slab(const slab_idx_t slab_idx, const blk_temp_t level, - const bool only_this_level, blk_cache_entry& out_entry); - - [[nodiscard]] inline SlabMetrics& slab_metrics(const slab_idx_t slab_idx) const { - return m_slab_queues[slab_idx]->metrics(); - } + BlkAllocStatus break_up(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp); + BlkAllocStatus merge_down(const slab_idx_t slab_idx, const blk_cache_alloc_req& req, blk_cache_alloc_resp& resp); + BlkAllocStatus try_alloc_in_slab(const slab_idx_t slab_num, const blk_cache_alloc_req& req, + blk_cache_alloc_resp& resp); + + std::optional< blk_temp_t > push_slab(const slab_idx_t slab_idx, const blk_cache_entry& entry, + const bool only_this_level); + std::optional< blk_temp_t > pop_slab(const slab_idx_t slab_idx, const blk_temp_t level, const bool only_this_level, + blk_cache_entry& out_entry); + + inline SlabMetrics& slab_metrics(const slab_idx_t slab_idx) const { return m_slab_queues[slab_idx]->metrics(); } std::string get_name() { return m_cfg.get_name(); } diff --git a/src/lib/blkalloc/fixed_blk_allocator.cpp b/src/lib/blkalloc/fixed_blk_allocator.cpp index 134c2736f..d922edf03 100644 --- 
a/src/lib/blkalloc/fixed_blk_allocator.cpp +++ b/src/lib/blkalloc/fixed_blk_allocator.cpp @@ -20,7 +20,7 @@ #include "blk_allocator.h" namespace homestore { -FixedBlkAllocator::FixedBlkAllocator(const BlkAllocConfig& cfg, bool init, chunk_num_t chunk_id) : +FixedBlkAllocator::FixedBlkAllocator(BlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id) : BlkAllocator(cfg, chunk_id), m_blk_q{get_total_blks()} { LOGINFO("total blks: {}", get_total_blks()); if (init) { inited(); } @@ -53,45 +53,29 @@ blk_num_t FixedBlkAllocator::init_portion(BlkAllocPortion& portion, blk_num_t st return blk_num; } -bool FixedBlkAllocator::is_blk_alloced(const BlkId& b, bool use_lock) const { return true; } +bool FixedBlkAllocator::is_blk_alloced(BlkId const& b, bool use_lock) const { return true; } -BlkAllocStatus FixedBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkid) { - /* TODO:If it is more then 1 then we need to make sure that we never allocate across the portions. As of now - * we don't support the vector of blkids in fixed blk allocator */ +BlkAllocStatus FixedBlkAllocator::alloc([[maybe_unused]] blk_count_t nblks, blk_alloc_hints const&, BlkId& out_blkid) { HS_DBG_ASSERT_EQ(nblks, 1, "FixedBlkAllocator does not support multiple blk allocation yet"); - - BlkId bid; - const auto status = alloc(bid); - if (status == BlkAllocStatus::SUCCESS) { - out_blkid.push_back(bid); - // no need to update real time bm as it is already updated in alloc of single blkid api; - } - return status; + return alloc_contiguous(r_cast< BlkId& >(out_blkid)); } -BlkAllocStatus FixedBlkAllocator::alloc(BlkId& out_blkid) { +BlkAllocStatus FixedBlkAllocator::alloc_contiguous(BlkId& out_blkid) { #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("fixed_blkalloc_no_blks")) { return BlkAllocStatus::SPACE_FULL; } #endif const auto ret = m_blk_q.read(out_blkid); if (ret) { // update real time bitmap; - alloc_on_realtime(out_blkid); + if (realtime_bm_on()) { alloc_on_realtime(out_blkid); } return BlkAllocStatus::SUCCESS; } else { return BlkAllocStatus::SPACE_FULL; } } -void FixedBlkAllocator::free(const std::vector< BlkId >& blk_ids) { - for (const auto& blk_id : blk_ids) { - free(blk_id); - } -} - -void FixedBlkAllocator::free(const BlkId& b) { - HS_DBG_ASSERT_EQ(b.get_nblks(), 1, "Multiple blk free for FixedBlkAllocator? allocated by different allocator?"); +void FixedBlkAllocator::free(BlkId const& b) { + HS_DBG_ASSERT_EQ(b.blk_count(), 1, "Multiple blk free for FixedBlkAllocator? allocated by different allocator?"); // No need to set in cache if it is not recovered. When recovery is complete we copy the disk_bm to cache bm. 
if (m_inited) { @@ -100,8 +84,8 @@ void FixedBlkAllocator::free(const BlkId& b) { } } -blk_cap_t FixedBlkAllocator::available_blks() const { return m_blk_q.sizeGuess(); } -blk_cap_t FixedBlkAllocator::get_used_blks() const { return get_total_blks() - available_blks(); } +blk_num_t FixedBlkAllocator::available_blks() const { return m_blk_q.sizeGuess(); } +blk_num_t FixedBlkAllocator::get_used_blks() const { return get_total_blks() - available_blks(); } std::string FixedBlkAllocator::to_string() const { return fmt::format("Total Blks={} Available_Blks={}", get_total_blks(), available_blks()); diff --git a/src/lib/blkalloc/varsize_blk_allocator.cpp b/src/lib/blkalloc/varsize_blk_allocator.cpp index 8380621b8..34d2e6dab 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.cpp +++ b/src/lib/blkalloc/varsize_blk_allocator.cpp @@ -33,7 +33,7 @@ SISL_LOGGING_DECL(blkalloc) template <> struct fmt::formatter< std::thread::id > { constexpr auto parse(format_parse_context& ctx) -> format_parse_context::iterator { return ctx.begin(); } - auto format(const std::thread::id& i, format_context& ctx) const -> format_context::iterator { + auto format(std::thread::id const& i, format_context& ctx) const -> format_context::iterator { return fmt::format_to(ctx.out(), "{}", std::hash< std::thread::id >{}(i)); } }; @@ -50,7 +50,7 @@ std::condition_variable VarsizeBlkAllocator::s_sweeper_cv; std::queue< VarsizeBlkAllocator* > VarsizeBlkAllocator::s_sweeper_queue; std::unordered_set< VarsizeBlkAllocator* > VarsizeBlkAllocator::s_block_allocators; -VarsizeBlkAllocator::VarsizeBlkAllocator(const VarsizeBlkAllocConfig& cfg, bool init, chunk_num_t chunk_id) : +VarsizeBlkAllocator::VarsizeBlkAllocator(VarsizeBlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id) : BlkAllocator{cfg, chunk_id}, m_state{BlkAllocatorState::INIT}, m_cfg{cfg}, @@ -232,27 +232,6 @@ bool VarsizeBlkAllocator::allocator_state_machine() { return active_state; } -bool VarsizeBlkAllocator::is_blk_alloced(const BlkId& b, bool use_lock) const { - if (!m_inited) { return true; } - auto bits_set{[this, &b]() { - // No need to set in cache if it is not recovered. When recovery is complete we copy the disk_bm to cache - // bm. 
- if (!m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks())) { - BLKALLOC_REL_ASSERT(0, "Expected bits to set"); - return false; - } - return true; - }}; - if (use_lock) { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); - auto lock{portion.portion_auto_lock()}; - if (!bits_set()) return false; - } else { - if (!bits_set()) return false; - } - return true; -} - void VarsizeBlkAllocator::inited() { m_cache_bm->copy(*(get_disk_bm_const())); BlkAllocator::inited(); @@ -405,251 +384,376 @@ void VarsizeBlkAllocator::fill_cache_in_portion(blk_num_t portion_num, blk_cache fill_session.session_id, portion_num, fill_session.overall_refilled_num_blks); } -BlkAllocStatus VarsizeBlkAllocator::alloc(BlkId& out_blkid) { - static thread_local std::vector< BlkId > s_ids; - s_ids.clear(); +BlkAllocStatus VarsizeBlkAllocator::alloc_contiguous(BlkId& out_blkid) { + return alloc_contiguous(1, blk_alloc_hints{}, out_blkid); +} - auto const status = alloc(1, blk_alloc_hints{}, s_ids); - if (status == BlkAllocStatus::SUCCESS) { - out_blkid = s_ids[0]; - // we don't update realtime here; - // it is already updated at vector version of alloc; - } +BlkAllocStatus VarsizeBlkAllocator::alloc_contiguous(blk_count_t nblks, blk_alloc_hints const& hints, + BlkId& out_blkid) { + MultiBlkId mbid; + auto const status = alloc(nblks, hints, mbid); + if (status == BlkAllocStatus::SUCCESS) { out_blkid = mbid; } return status; } -BlkAllocStatus VarsizeBlkAllocator::alloc(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids) { - BLKALLOC_LOG_ASSERT(m_inited, "Alloc before initialized"); - BLKALLOC_LOG_ASSERT_CMP(nblks % hints.multiplier, ==, 0); - BLKALLOC_LOG(TRACE, "nblks={}, hints multiplier={}", nblks, hints.multiplier); +BlkAllocStatus VarsizeBlkAllocator::alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) { + bool use_slabs = m_cfg.m_use_slabs; #ifdef _PRERELEASE - if (hints.error_simulate && iomgr_flip::instance()->test_flip("varsize_blkalloc_no_blks", nblks)) { - return BlkAllocStatus::SPACE_FULL; - } - - if (iomgr_flip::instance()->test_flip("varsize_blkalloc_bypass_cache")) { - blk_count_t num_alllocated{0}; - auto const status = alloc_blks_direct(nblks, hints, out_blkids, num_alllocated); - if (status == BlkAllocStatus::SUCCESS) { - incr_alloced_blk_count(num_alllocated); - return status; - } else { - // NOTE: There is a small chance this can fail if all the blocks have already been allocated - // to slabs. 
So clear any partial and fall through to normal routine below - if (status == BlkAllocStatus::PARTIAL) { - for (const auto& blk_id : out_blkids) { - free_on_bitmap(blk_id); - } - out_blkids.clear(); - } - } - } + if (iomgr_flip::instance()->test_flip("varsize_blkalloc_no_blks", nblks)) { return BlkAllocStatus::SPACE_FULL; } + if (iomgr_flip::instance()->test_flip("varsize_blkalloc_bypass_cache")) { use_slabs = false; } #endif - auto status = BlkAllocStatus::FAILED; - blk_count_t total_allocated{0}; - if (m_cfg.m_use_slabs) { - // Allocate from blk cache - static thread_local blk_cache_alloc_resp s_alloc_resp; - const blk_cache_alloc_req alloc_req{nblks, hints.desired_temp, hints.is_contiguous, - FreeBlkCache::find_slab(hints.multiplier), - FreeBlkCache::find_slab(hints.max_blks_per_entry)}; - COUNTER_INCREMENT(m_metrics, num_alloc, 1); - - auto free_excess_blocks{[this]() { - // put excess blocks back on bitmap - for (const auto& e : s_alloc_resp.excess_blks) { - BLKALLOC_LOG(DEBUG, "Freeing in bitmap of entry={} - excess of alloc_blks size={}", e.to_string(), - s_alloc_resp.excess_blks.size()); - free_on_bitmap(blk_cache_entry_to_blkid(e)); - } - }}; - - auto discard_current_allocation{[this, &free_excess_blocks]() { - if (!s_alloc_resp.out_blks.empty()) { - s_alloc_resp.nblks_zombied = m_fb_cache->try_free_blks(s_alloc_resp.out_blks, s_alloc_resp.excess_blks); - } - free_excess_blocks(); - s_alloc_resp.reset(); - }}; - - s_alloc_resp.reset(); - // retries must be at least two to allow slab refill logic to run - const uint32_t max_retries = - std::max< uint32_t >(HS_DYNAMIC_CONFIG(blkallocator.max_varsize_blk_alloc_attempt), 2); - for (uint32_t retry{0}; (retry < max_retries); ++retry) { - status = m_fb_cache->try_alloc_blks(alloc_req, s_alloc_resp); - if ((status == BlkAllocStatus::SUCCESS) || ((status == BlkAllocStatus::PARTIAL) && !hints.is_contiguous)) { - // If the cache has depleted a bit, kick of sweep thread to fill the cache. - if (s_alloc_resp.need_refill) { request_more_blks(nullptr, false /* fill_entire_cache */); } - BLKALLOC_LOG(TRACE, "Alloced first blk_num={}", s_alloc_resp.out_blks[0].to_string()); - - // Convert the response block cache entries to blkids - blk_cache_entries_to_blkids(s_alloc_resp.out_blks, out_blkids); - total_allocated = s_alloc_resp.nblks_alloced; - break; - } else { - discard_current_allocation(); - if ((retry + 1) < max_retries) { - COUNTER_INCREMENT(m_metrics, num_retries, 1); - auto const min_nblks = std::max< blk_count_t >(m_cfg.highest_slab_blks_count() * 2, nblks); - BLKALLOC_LOG( - DEBUG, - "Failed to allocate {} blks from blk cache, requesting refill at least {} blks and retry={}", - nblks, min_nblks, retry); - request_more_blks_wait(nullptr /* seg */, min_nblks); - } - } - } - free_excess_blocks(); + if (!hints.is_contiguous && !out_blkid.is_multi()) { + HS_DBG_ASSERT(false, "Invalid Input: Non contiguous allocation needs MultiBlkId to store"); + return BlkAllocStatus::INVALID_INPUT; } - if (hints.is_contiguous) { - // failed to allocate in slab try direct. 
- if (status != BlkAllocStatus::SUCCESS) { - blk_count_t num_allocated{0}; - status = alloc_blks_direct(nblks, hints, out_blkids, num_allocated); - if (status == BlkAllocStatus::SUCCESS) { - total_allocated += num_allocated; - BLKALLOC_LOG(TRACE, "Alloced blk_num={} directly", out_blkids.back().to_string()); - } - } - } else { - if (status != BlkAllocStatus::SUCCESS) { - // try to allocate remainder - const blk_count_t nblks_remaining = static_cast< blk_count_t >(nblks - total_allocated); - BLKALLOC_LOG(DEBUG, "nblks={} failed to alloc all from fb cache, trying to alloc rest from bitset directly", - nblks_remaining); - blk_count_t num_allocated{0}; - auto status2 = alloc_blks_direct(nblks_remaining, hints, out_blkids, num_allocated); - if ((status2 == BlkAllocStatus::SUCCESS) || (status2 == BlkAllocStatus::PARTIAL)) { - total_allocated += num_allocated; - BLKALLOC_LOG(TRACE, "Alloced additional blk_num={} directly", out_blkids.back().to_string()); - } else { - // failure to get more is really partial if we have some - BLKALLOC_LOG(TRACE, "Failed to alloc additional blks directly with code {}", status2); - if (status == BlkAllocStatus::PARTIAL) status2 = BlkAllocStatus::PARTIAL; - } - status = status2; + MultiBlkId tmp_blkid; + MultiBlkId& out_mbid = out_blkid.is_multi() ? r_cast< MultiBlkId& >(out_blkid) : tmp_blkid; + BlkAllocStatus status; + blk_count_t num_allocated{0}; + blk_count_t nblks_remain; + + if (use_slabs && (nblks <= m_cfg.highest_slab_blks_count())) { + num_allocated = alloc_blks_slab(nblks, hints, out_mbid); + if (num_allocated >= nblks) { + status = BlkAllocStatus::SUCCESS; + goto out; } + // Fall through to alloc_blks_direct } - switch (status) { - case BlkAllocStatus::FAILED: - case BlkAllocStatus::SPACE_FULL: - COUNTER_INCREMENT(m_metrics, num_alloc_failure, 1); - BLKALLOC_LOG(ERROR, "nblks={} failed to alloc any number of blocks", nblks); - break; - case BlkAllocStatus::PARTIAL: - COUNTER_INCREMENT(m_metrics, num_alloc_partial, 1); - BLKALLOC_LOG(DEBUG, "nblks={} allocated={} partial allocation", nblks, total_allocated); - break; - case BlkAllocStatus::SUCCESS: - break; - default: - BLKALLOC_LOG(ERROR, "Unexpected status", status); + nblks_remain = nblks - num_allocated; + num_allocated += alloc_blks_direct(nblks_remain, hints, out_mbid); + if (num_allocated == nblks) { + status = BlkAllocStatus::SUCCESS; + BLKALLOC_LOG(TRACE, "Alloced blks [{}] directly", out_mbid.to_string()); + } else if ((num_allocated != 0) && hints.partial_alloc_ok) { + status = BlkAllocStatus::PARTIAL; + } else { + free_blks_direct(out_mbid); + status = hints.is_contiguous ? 
BlkAllocStatus::FAILED : BlkAllocStatus::SPACE_FULL; } +out: if ((status == BlkAllocStatus::SUCCESS) || (status == BlkAllocStatus::PARTIAL)) { - incr_alloced_blk_count(total_allocated); + incr_alloced_blk_count(num_allocated); // update real time bitmap - for (const auto& b : out_blkids) { - alloc_on_realtime(b); - } + if (realtime_bm_on()) { alloc_on_realtime(out_mbid); } #ifdef _PRERELEASE - alloc_sanity_check(total_allocated, hints, out_blkids); + alloc_sanity_check(num_allocated, hints, out_mbid); #endif } + if (!out_blkid.is_multi()) { out_blkid = out_mbid.to_single_blkid(); } return status; } -void VarsizeBlkAllocator::free(const std::vector< BlkId >& blk_ids) { - for (const auto& blk_id : blk_ids) { - free(blk_id); +BlkAllocStatus VarsizeBlkAllocator::alloc(blk_count_t nblks, blk_alloc_hints const& hints, + std::vector< BlkId >& out_blkids) { + // Regular alloc blks will allocate in MultiBlkId, but there is an upper limit on how many it can accomodate in a + // single MultiBlkId, if caller is ok to generate multiple MultiBlkids, this method is called. + auto h = hints; + h.partial_alloc_ok = true; + blk_count_t nblks_remain = nblks; + BlkAllocStatus status; + + do { + MultiBlkId mbid; + status = alloc(nblks_remain, h, mbid); + if ((status != BlkAllocStatus::SUCCESS) && (status != BlkAllocStatus::PARTIAL)) { break; } + + blk_count_t nblks_this_iter{0}; + auto it = mbid.iterate(); + while (auto const bid = it.next()) { + out_blkids.push_back(*bid); + nblks_this_iter += bid->blk_count(); + } + + if (status == BlkAllocStatus::SUCCESS) { + HS_DBG_ASSERT_GE(nblks_this_iter, nblks_remain, + "alloc_blks returned success, but return id doesn't have reqd blks"); + break; + } + + if (nblks_this_iter >= nblks_remain) { + HS_DBG_ASSERT(false, "alloc_blks returns partial, while it has fully allocated reqd blks"); + status = BlkAllocStatus::SUCCESS; + break; + } + nblks_remain -= nblks_this_iter; + } while (nblks_remain); + + return status; +} + +blk_count_t VarsizeBlkAllocator::alloc_blks_slab(blk_count_t nblks, blk_alloc_hints const& hints, + MultiBlkId& out_blkid) { + blk_count_t num_allocated{0}; + + // Allocate from blk cache + static thread_local blk_cache_alloc_resp s_alloc_resp; + const blk_cache_alloc_req alloc_req{nblks, hints.desired_temp, hints.is_contiguous, + FreeBlkCache::find_slab(hints.min_blks_per_piece), + s_cast< slab_idx_t >(m_cfg.get_slab_cnt() - 1)}; + COUNTER_INCREMENT(m_metrics, num_alloc, 1); + + auto free_excess_blocks = [this]() { + // put excess blocks back on bitmap + for (auto const& e : s_alloc_resp.excess_blks) { + BLKALLOC_LOG(DEBUG, "Freeing in bitmap of entry={} - excess of alloc_blks size={}", e.to_string(), + s_alloc_resp.excess_blks.size()); + free_blks_direct(MultiBlkId{blk_cache_entry_to_blkid(e)}); + } + }; + + auto discard_current_allocation = [this, &free_excess_blocks]() { + if (!s_alloc_resp.out_blks.empty()) { + s_alloc_resp.nblks_zombied = m_fb_cache->try_free_blks(s_alloc_resp.out_blks, s_alloc_resp.excess_blks); + } + free_excess_blocks(); + s_alloc_resp.reset(); + }; + + s_alloc_resp.reset(); + // retries must be at least two to allow slab refill logic to run + const uint32_t max_retries = std::max< uint32_t >(HS_DYNAMIC_CONFIG(blkallocator.max_varsize_blk_alloc_attempt), 2); + for (uint32_t retry{0}; ((retry < max_retries) && out_blkid.has_room()); ++retry) { + auto status = m_fb_cache->try_alloc_blks(alloc_req, s_alloc_resp); + + // If the blk allocation is only partially completed, then we are ok in proceeding further for cases where + // 
caller does not want a contiguous allocation. In that case, return these partial results and then caller will + // use direct allocation to allocate remaining blks. In case where caller is also ok with partial allocation, + // then it doesn't matter if request is for contiguous allocation or not, we can return the partial results. + if ((status == BlkAllocStatus::SUCCESS) || + ((status == BlkAllocStatus::PARTIAL) && (hints.partial_alloc_ok || !hints.is_contiguous))) { + // If the cache has depleted a bit, kick of sweep thread to fill the cache. + if (s_alloc_resp.need_refill) { request_more_blks(nullptr, false /* fill_entire_cache */); } + BLKALLOC_LOG(TRACE, "Alloced first blk_num={}", s_alloc_resp.out_blks[0].to_string()); + + // Convert the response block cache entries to blkids + for (size_t piece{0}; piece < s_alloc_resp.out_blks.size(); ++piece) { + auto& e = s_alloc_resp.out_blks[piece]; + if (out_blkid.has_room()) { + out_blkid.add(e.get_blk_num(), e.blk_count(), m_chunk_id); + num_allocated += e.blk_count(); + } else { + // We are not able to put all of the response to out_blkid, because it doesn't have room, + // If caller is ok with partial allocation, we can free remaining entry and send partial result. + // If caller is not ok with partial allocation, we should discard entire allocation and retry + if (hints.partial_alloc_ok) { + s_alloc_resp.excess_blks.insert(s_alloc_resp.excess_blks.end(), + s_alloc_resp.out_blks.begin() + piece, + s_alloc_resp.out_blks.end()); + } else { + num_allocated = 0; + out_blkid = MultiBlkId{}; + status = BlkAllocStatus::TOO_MANY_PIECES; + } + break; + } + } + + if (status != BlkAllocStatus::TOO_MANY_PIECES) { break; } + } + + discard_current_allocation(); + if ((retry + 1) < max_retries) { + COUNTER_INCREMENT(m_metrics, num_retries, 1); + auto const min_nblks = std::max< blk_count_t >(m_cfg.highest_slab_blks_count() * 2, nblks); + BLKALLOC_LOG(DEBUG, + "Failed to allocate {} blks from blk cache, requesting refill at least {} blks " + "and retry={}", + nblks, min_nblks, retry); + request_more_blks_wait(nullptr /* seg */, min_nblks); + } } + + free_excess_blocks(); + + return num_allocated; +} + +blk_count_t VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, blk_alloc_hints const& hints, + MultiBlkId& out_blkid) { + // Search all segments starting with some random portion num within each segment + static thread_local std::random_device rd{}; + static thread_local std::default_random_engine re{rd()}; + + if (m_start_portion_num == INVALID_PORTION_NUM) { m_start_portion_num = m_rand_portion_num_generator(re); } + + auto portion_num = m_start_portion_num; + auto const max_pieces = hints.is_contiguous ? 1u : MultiBlkId::max_pieces; + + blk_count_t const min_blks = hints.is_contiguous ? 
nblks : std::min< blk_count_t >(nblks, hints.min_blks_per_piece); + blk_count_t nblks_remain = nblks; + do { + BlkAllocPortion& portion = get_blk_portion(portion_num); + auto cur_blk_id = portion_num * get_blks_per_portion(); + auto const end_blk_id = cur_blk_id + get_blks_per_portion() - 1; + { + auto lock{portion.portion_auto_lock()}; + while (nblks_remain && (cur_blk_id <= end_blk_id) && portion.get_available_blocks() && + out_blkid.has_room()) { + // Get next reset bits and insert to cache and then reset those bits + auto const b = m_cache_bm->get_next_contiguous_n_reset_bits( + cur_blk_id, end_blk_id, std::min(min_blks, nblks_remain), nblks_remain); + if (b.nbits == 0) { break; } + HS_DBG_ASSERT_GE(end_blk_id, b.start_bit, "Expected start bit to be smaller than end bit"); + HS_DBG_ASSERT_LE(b.nbits, nblks_remain); + HS_DBG_ASSERT_GE(b.nbits, std::min(min_blks, nblks_remain)); + HS_DBG_ASSERT_GE(end_blk_id, (b.start_bit + b.nbits - 1), + "Expected end bit to be smaller than portion end bit"); + + nblks_remain -= b.nbits; + out_blkid.add(b.start_bit, b.nbits, m_chunk_id); + + BLKALLOC_LOG(DEBUG, "Allocated directly from portion={} nnblks={} Blk_num={} nblks={} set_bit_count={}", + portion_num, nblks, b.start_bit, b.nbits, get_alloced_blk_count()); + + // Set the bitmap indicating the blocks are allocated + m_cache_bm->set_bits(b.start_bit, b.nbits); + if (portion.decrease_available_blocks(b.nbits) == 0) break; + cur_blk_id = b.start_bit + b.nbits; + } + } + if (++portion_num == get_num_portions()) { portion_num = 0; } + BLKALLOC_LOG(TRACE, "alloc direct unable to find in prev portion, searching in portion={}, start_portion={}", + portion_num, m_start_portion_num); + } while (nblks_remain && (portion_num != m_start_portion_num) && !hints.is_contiguous && out_blkid.has_room()); + + // save which portion we were at for next allocation; + m_start_portion_num = portion_num; + + COUNTER_INCREMENT(m_metrics, num_blks_alloc_direct, 1); + return (nblks - nblks_remain); } -void VarsizeBlkAllocator::free(const BlkId& b) { +void VarsizeBlkAllocator::free(BlkId const& bid) { if (!m_inited) { - BLKALLOC_LOG(DEBUG, "Free not required for blk num = {}", b.get_blk_num()); + BLKALLOC_LOG(DEBUG, "Free not required for blk num = {}", bid.blk_num()); return; } - if (m_cfg.m_use_slabs) { - static thread_local std::vector< blk_cache_entry > excess_blks; - excess_blks.clear(); + blk_count_t n_freed = (m_cfg.m_use_slabs && (bid.blk_count() <= m_cfg.highest_slab_blks_count())) + ? 
free_blks_slab(r_cast< MultiBlkId const& >(bid)) + : free_blks_direct(r_cast< MultiBlkId const& >(bid)); + decr_alloced_blk_count(n_freed); + BLKALLOC_LOG(TRACE, "Freed blk_num={}", bid.to_string()); +} + +blk_count_t VarsizeBlkAllocator::free_blks_slab(MultiBlkId const& bid) { + static thread_local std::vector< blk_cache_entry > excess_blks; + excess_blks.clear(); - [[maybe_unused]] const blk_count_t num_zombied{ - m_fb_cache->try_free_blks(blkid_to_blk_cache_entry(b, 2), excess_blks)}; + auto const do_free = [this](BlkId const& b) { + m_fb_cache->try_free_blks(blkid_to_blk_cache_entry(b, 2), excess_blks); + return b.blk_count(); + }; - for (const auto& e : excess_blks) { - BLKALLOC_LOG(TRACE, "Freeing in bitmap of entry={} - excess of free_blks size={}", e.to_string(), - excess_blks.size()); - free_on_bitmap(blk_cache_entry_to_blkid(e)); + blk_count_t n_freed{0}; + if (bid.is_multi()) { + auto it = bid.iterate(); + while (auto const b = it.next()) { + n_freed += do_free(*b); } } else { - // free directly on bitmap - free_on_bitmap(b); + n_freed += do_free(bid); } - decr_alloced_blk_count(b.get_nblks()); - BLKALLOC_LOG(TRACE, "Freed blk_num={}", blkid_to_blk_cache_entry(b).to_string()); + for (auto const& e : excess_blks) { + BLKALLOC_LOG(TRACE, "Freeing in bitmap of entry={} - excess of free_blks size={}", e.to_string(), + excess_blks.size()); + free_blks_direct(MultiBlkId{blk_cache_entry_to_blkid(e)}); + } + return n_freed; } -blk_cap_t VarsizeBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } -blk_cap_t VarsizeBlkAllocator::get_used_blks() const { return get_alloced_blk_count(); } - -void VarsizeBlkAllocator::free_on_bitmap(const BlkId& b) { - BlkAllocPortion& portion = blknum_to_portion(b.get_blk_num()); - { - auto const start_blk_id = portion.get_portion_num() * get_blks_per_portion(); - auto const end_blk_id = start_blk_id + get_blks_per_portion() - 1; - auto lock{portion.portion_auto_lock()}; - HS_DBG_ASSERT_LE(start_blk_id, b.get_blk_num(), "Expected start bit to be greater than portion start bit"); - HS_DBG_ASSERT_GE(end_blk_id, (b.get_blk_num() + b.get_nblks() - 1), - "Expected end bit to be smaller than portion end bit"); - BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks()), "Expected bits to be set"); - m_cache_bm->reset_bits(b.get_blk_num(), b.get_nblks()); - portion.increase_available_blocks(b.get_nblks()); +blk_count_t VarsizeBlkAllocator::free_blks_direct(MultiBlkId const& bid) { + auto const do_free = [this](BlkId const& b) { + BlkAllocPortion& portion = blknum_to_portion(b.blk_num()); + { + auto const start_blk_id = portion.get_portion_num() * get_blks_per_portion(); + auto const end_blk_id = start_blk_id + get_blks_per_portion() - 1; + auto lock{portion.portion_auto_lock()}; + HS_DBG_ASSERT_LE(start_blk_id, b.blk_num(), "Expected start bit to be greater than portion start bit"); + HS_DBG_ASSERT_GE(end_blk_id, (b.blk_num() + b.blk_count() - 1), + "Expected end bit to be smaller than portion end bit"); + BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b.blk_num(), b.blk_count()), "Expected bits to be set"); + m_cache_bm->reset_bits(b.blk_num(), b.blk_count()); + portion.increase_available_blocks(b.blk_count()); + } + BLKALLOC_LOG(TRACE, "Freeing directly to portion={} blkid={} set_bits_count={}", + blknum_to_portion_num(b.blk_num()), b.to_string(), get_alloced_blk_count()); + return b.blk_count(); + }; + + blk_count_t n_freed{0}; + if (bid.is_multi()) { + auto it = bid.iterate(); + while (auto const b = it.next()) { + 
n_freed += do_free(*b); + } + } else { + n_freed += do_free(bid); } - BLKALLOC_LOG(TRACE, "Freeing directly to portion={} blkid={} set_bits_count={}", - blknum_to_portion_num(b.get_blk_num()), b.to_string(), get_alloced_blk_count()); + return n_freed; } -#ifdef _PRERELEASE -bool VarsizeBlkAllocator::is_set_on_bitmap(const BlkId& b) const { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); - { - // No need to set in cache if it is not recovered. When recovery is complete we copy the disk_bm to cache bm. - auto lock{portion.portion_auto_lock()}; - return m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks()); +bool VarsizeBlkAllocator::is_blk_alloced(BlkId const& bid, bool use_lock) const { + if (!m_inited) { return true; } + + auto check_bits_set = [this](BlkId const& b, bool use_lock) { + if (use_lock) { + BlkAllocPortion const& portion = blknum_to_portion_const(b.blk_num()); + auto lock{portion.portion_auto_lock()}; + return m_cache_bm->is_bits_set(b.blk_num(), b.blk_count()); + } else { + return m_cache_bm->is_bits_set(b.blk_num(), b.blk_count()); + } + }; + + bool ret; + if (bid.is_multi()) { + auto& mbid = r_cast< MultiBlkId const& >(bid); + auto it = mbid.iterate(); + while (auto const b = it.next()) { + ret = check_bits_set(*b, use_lock); + if (!ret) { break; } + } + } else { + ret = check_bits_set(bid, use_lock); } + return ret; } -void VarsizeBlkAllocator::alloc_sanity_check(blk_count_t nblks, const blk_alloc_hints& hints, - const std::vector< BlkId >& out_blkids) const { +blk_num_t VarsizeBlkAllocator::available_blks() const { return get_total_blks() - get_used_blks(); } +blk_num_t VarsizeBlkAllocator::get_used_blks() const { return get_alloced_blk_count(); } + +#ifdef _PRERELEASE +void VarsizeBlkAllocator::alloc_sanity_check(blk_count_t nblks, blk_alloc_hints const& hints, + MultiBlkId const& out_blkid) const { if (HS_DYNAMIC_CONFIG(generic.sanity_check_level)) { blk_count_t alloced_nblks{0}; - for (const auto& b : out_blkids) { - const BlkAllocPortion& portion = blknum_to_portion_const(b.get_blk_num()); + auto it = out_blkid.iterate(); + while (auto const b = it.next()) { + BlkAllocPortion const& portion = blknum_to_portion_const(b->blk_num()); auto lock{portion.portion_auto_lock()}; - BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b.get_blk_num(), b.get_nblks()), - "Expected blkid={} to be already set in cache bitmap", b.to_string()); + BLKALLOC_REL_ASSERT(m_cache_bm->is_bits_set(b->blk_num(), b->blk_count()), + "Expected blkid={} to be already set in cache bitmap", b->to_string()); if (get_disk_bm_const()) { - BLKALLOC_REL_ASSERT(!is_blk_alloced_on_disk(b), "Expected blkid={} to be already free in disk bitmap", - b.to_string()); + BLKALLOC_REL_ASSERT(!is_blk_alloced_on_disk(*b), "Expected blkid={} to be already free in disk bitmap", + b->to_string()); } - alloced_nblks += b.get_nblks(); + alloced_nblks += b->blk_count(); } BLKALLOC_REL_ASSERT((nblks == alloced_nblks), "Requested blks={} alloced_blks={} num_pieces={}", nblks, - alloced_nblks, out_blkids.size()); - BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkids.size() == 1)), + alloced_nblks, out_blkid.num_pieces()); + BLKALLOC_REL_ASSERT((!hints.is_contiguous || (out_blkid.num_pieces() == 1)), "Multiple blkids allocated for contiguous request"); } } @@ -703,70 +807,6 @@ void VarsizeBlkAllocator::request_more_blks_wait(BlkAllocSegment* seg, blk_count } } -BlkAllocStatus VarsizeBlkAllocator::alloc_blks_direct(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids, 
blk_count_t& num_allocated) { - // Search all segments starting with some random portion num within each segment - static thread_local std::random_device rd{}; - static thread_local std::default_random_engine re{rd()}; - - if (m_start_portion_num == INVALID_PORTION_NUM) { m_start_portion_num = m_rand_portion_num_generator(re); } - - auto portion_num = m_start_portion_num; - blk_count_t const min_blks = hints.is_contiguous ? nblks : std::min< blk_count_t >(nblks, hints.multiplier); - blk_count_t nblks_remain = nblks; - do { - BlkAllocPortion& portion = get_blk_portion(portion_num); - auto cur_blk_id = portion_num * get_blks_per_portion(); - auto const end_blk_id = cur_blk_id + get_blks_per_portion() - 1; - { - auto lock{portion.portion_auto_lock()}; - while (nblks_remain && (cur_blk_id <= end_blk_id) && (portion.get_available_blocks() > 0)) { - // Get next reset bits and insert to cache and then reset those bits - auto const b = m_cache_bm->get_next_contiguous_n_reset_bits( - cur_blk_id, end_blk_id, std::min(min_blks, nblks_remain), nblks_remain); - if (b.nbits == 0) { break; } - HS_DBG_ASSERT_GE(end_blk_id, b.start_bit, "Expected start bit to be smaller than end bit"); - HS_DBG_ASSERT_LE(b.nbits, nblks_remain); - HS_DBG_ASSERT_GE(b.nbits, std::min(min_blks, nblks_remain)); - HS_DBG_ASSERT_GE(end_blk_id, (b.start_bit + b.nbits - 1), - "Expected end bit to be smaller than portion end bit"); - - nblks_remain -= b.nbits; - out_blkids.emplace_back(b.start_bit, b.nbits, m_chunk_id); - - BLKALLOC_LOG(DEBUG, "Allocated directly from portion={} nnblks={} Blk_num={} nblks={} set_bit_count={}", - portion_num, nblks, b.start_bit, b.nbits, get_alloced_blk_count()); - - // Set the bitmap indicating the blocks are allocated - m_cache_bm->set_bits(b.start_bit, b.nbits); - if (portion.decrease_available_blocks(b.nbits) == 0) break; - cur_blk_id = b.start_bit + b.nbits; - } - } - if (++portion_num == get_num_portions()) { portion_num = 0; } - BLKALLOC_LOG(TRACE, "alloc direct unable to find in prev portion, searching in portion={}, start_portion={}", - portion_num, m_start_portion_num); - } while ((nblks_remain > 0) && (portion_num != m_start_portion_num) && !hints.is_contiguous); - - // save which portion we were at for next allocation; - m_start_portion_num = portion_num; - - COUNTER_INCREMENT(m_metrics, num_blks_alloc_direct, 1); - num_allocated = nblks - nblks_remain; - if (nblks_remain > 0) { - if (nblks_remain == nblks) { - // allocated no blocks. NOTE: if contiguous we may or may not be full. Don't really know without - // searching for a single free block - return hints.is_contiguous ? BlkAllocStatus::FAILED : BlkAllocStatus::SPACE_FULL; - } else { - // allocated some blocks - return BlkAllocStatus::PARTIAL; - } - } - - return BlkAllocStatus::SUCCESS; -} - /* This method assumes that mutex to protect state is already taken. 
*/ bool VarsizeBlkAllocator::prepare_sweep(BlkAllocSegment* seg, bool fill_entire_cache) { m_sweep_segment = seg; @@ -780,19 +820,29 @@ bool VarsizeBlkAllocator::prepare_sweep(BlkAllocSegment* seg, bool fill_entire_c } } -void VarsizeBlkAllocator::blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, - std::vector< BlkId >& out_blkids) { - for (const auto& e : entries) { - out_blkids.emplace_back(e.get_blk_num(), e.get_nblks(), m_chunk_id); +#if 0 +blk_num_t VarsizeBlkAllocator::blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, + MultiBlkId& out_blkid) { + uint32_t num_added{0}; + for (auto const& e : entries) { + if (out_blkid.has_room()) { + out_blkid.add(e.get_blk_num(), e.blk_count(), m_chunk_id); + ++num_added; + } else { + break; + } } + + return num_added; } +#endif -BlkId VarsizeBlkAllocator::blk_cache_entry_to_blkid(const blk_cache_entry& e) { - return BlkId{e.get_blk_num(), e.get_nblks(), m_chunk_id}; +BlkId VarsizeBlkAllocator::blk_cache_entry_to_blkid(blk_cache_entry const& e) { + return BlkId{e.get_blk_num(), e.blk_count(), m_chunk_id}; } -blk_cache_entry VarsizeBlkAllocator::blkid_to_blk_cache_entry(const BlkId& bid, blk_temp_t preferred_level) { - return blk_cache_entry{bid.get_blk_num(), bid.get_nblks(), preferred_level}; +blk_cache_entry VarsizeBlkAllocator::blkid_to_blk_cache_entry(BlkId const& bid, blk_temp_t preferred_level) { + return blk_cache_entry{bid.blk_num(), bid.blk_count(), preferred_level}; } std::string VarsizeBlkAllocator::to_string() const { diff --git a/src/lib/blkalloc/varsize_blk_allocator.h b/src/lib/blkalloc/varsize_blk_allocator.h index 7544fac55..7e23597fd 100644 --- a/src/lib/blkalloc/varsize_blk_allocator.h +++ b/src/lib/blkalloc/varsize_blk_allocator.h @@ -45,27 +45,27 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { public: const uint32_t m_phys_page_size; const seg_num_t m_nsegments; - const blk_cap_t m_blks_per_temp_group; - blk_cap_t m_max_cache_blks; + const blk_num_t m_blks_per_temp_group; + blk_num_t m_max_cache_blks; SlabCacheConfig m_slab_config; const bool m_use_slabs{true}; // use sweeping thread pool with slabs in variable size block allocator public: VarsizeBlkAllocConfig() : VarsizeBlkAllocConfig{0, 0, 0, 0, ""} {} - VarsizeBlkAllocConfig(const std::string& name) : VarsizeBlkAllocConfig{0, 0, 0, 0, name} {} + VarsizeBlkAllocConfig(std::string const& name) : VarsizeBlkAllocConfig{0, 0, 0, 0, name} {} VarsizeBlkAllocConfig(uint32_t blk_size, uint32_t ppage_sz, uint32_t align_sz, uint64_t size, - const std::string& name, bool realtime_bm_on = true, bool use_slabs = true) : + std::string const& name, bool realtime_bm_on = true, bool use_slabs = true) : BlkAllocConfig{blk_size, align_sz, size, name, realtime_bm_on}, m_phys_page_size{ppage_sz}, m_nsegments{HS_DYNAMIC_CONFIG(blkallocator.max_segments)}, m_blks_per_temp_group{m_capacity / HS_DYNAMIC_CONFIG(blkallocator.num_blk_temperatures)}, m_use_slabs{use_slabs} { // Initialize the max cache blks as minimum dictated by the number of blks or memory limits whichever is lower - const blk_cap_t size_by_count{static_cast< blk_cap_t >( + const blk_num_t size_by_count{static_cast< blk_num_t >( std::trunc(HS_DYNAMIC_CONFIG(blkallocator.free_blk_cache_count_by_vdev_percent) * m_capacity / 100.0))}; - const blk_cap_t size_by_mem{ - static_cast< blk_cap_t >(std::trunc(HS_DYNAMIC_CONFIG(blkallocator.max_free_blk_cache_memory_percent) * + const blk_num_t size_by_mem{ + static_cast< blk_num_t 
>(std::trunc(HS_DYNAMIC_CONFIG(blkallocator.max_free_blk_cache_memory_percent) * HS_STATIC_CONFIG(input.app_mem_size) / 100.0))}; m_max_cache_blks = std::min(size_by_count, size_by_mem); @@ -81,11 +81,11 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { const auto num_temp_slab_pct{(100.0 - reuse_pct) / static_cast< double >(num_temp)}; m_slab_config.m_name = name; - for (const auto& pct : HS_DYNAMIC_CONFIG(blkallocator.free_blk_slab_distribution)) { + for (auto const& pct : HS_DYNAMIC_CONFIG(blkallocator.free_blk_slab_distribution)) { cum_pct += pct; SlabCacheConfig::_slab_config s_cfg; s_cfg.slab_size = static_cast< blk_count_t >(1) << slab_idx; - s_cfg.max_entries = static_cast< blk_cap_t >((m_max_cache_blks / s_cfg.slab_size) * (pct / 100.0)); + s_cfg.max_entries = static_cast< blk_num_t >((m_max_cache_blks / s_cfg.slab_size) * (pct / 100.0)); s_cfg.m_name = name; s_cfg.refill_threshold_pct = HS_DYNAMIC_CONFIG(blkallocator.free_blk_cache_refill_threshold_pct); @@ -108,9 +108,9 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { } } - VarsizeBlkAllocConfig(const VarsizeBlkAllocConfig& other) = default; + VarsizeBlkAllocConfig(VarsizeBlkAllocConfig const& other) = default; VarsizeBlkAllocConfig(VarsizeBlkAllocConfig&&) noexcept = delete; - VarsizeBlkAllocConfig& operator=(const VarsizeBlkAllocConfig&) = delete; + VarsizeBlkAllocConfig& operator=(VarsizeBlkAllocConfig const&) = delete; VarsizeBlkAllocConfig& operator=(VarsizeBlkAllocConfig&&) noexcept = delete; virtual ~VarsizeBlkAllocConfig() override = default; @@ -119,20 +119,20 @@ struct VarsizeBlkAllocConfig : public BlkAllocConfig { //////////// Segments related getters/setters ///////////// seg_num_t get_total_segments() const { return m_nsegments; } - blk_cap_t get_blks_per_segment() const { return (m_capacity / m_nsegments); } + blk_num_t get_blks_per_segment() const { return (m_capacity / m_nsegments); } //////////// Blks related getters/setters ///////////// - blk_cap_t get_max_cache_blks() const { return m_max_cache_blks; } - blk_cap_t get_blks_per_temp_group() const { return m_blks_per_temp_group; } - blk_cap_t get_blks_per_phys_page() const { return m_phys_page_size / m_blk_size; } + blk_num_t get_max_cache_blks() const { return m_max_cache_blks; } + blk_num_t get_blks_per_temp_group() const { return m_blks_per_temp_group; } + blk_num_t get_blks_per_phys_page() const { return m_phys_page_size / m_blk_size; } //////////// Slab related getters/setters ///////////// - slab_idx_t get_slab_cnt() const { return m_slab_config.m_per_slab_cfg.size(); } + slab_idx_t get_slab_cnt() const { return s_cast< slab_idx_t >(m_slab_config.m_per_slab_cfg.size()); } blk_count_t get_slab_block_count(const slab_idx_t index) { return m_slab_config.m_per_slab_cfg[index].slab_size; } - blk_cap_t get_slab_capacity(const slab_idx_t slab_idx) const { + blk_num_t get_slab_capacity(const slab_idx_t slab_idx) const { return m_slab_config.m_per_slab_cfg[slab_idx].max_entries; } - blk_cap_t highest_slab_blks_count() const { + blk_num_t highest_slab_blks_count() const { const slab_idx_t index{get_slab_cnt()}; return (index > 0) ? 
m_slab_config.m_per_slab_cfg[index - 1].slab_size : 0; } @@ -151,12 +151,12 @@ class BlkAllocSegment { blk_num_t m_alloc_clock_hand; public: - BlkAllocSegment(const seg_num_t seg_num, const blk_num_t nportions, const std::string& seg_name) : + BlkAllocSegment(const seg_num_t seg_num, const blk_num_t nportions, std::string const& seg_name) : m_total_portions{nportions}, m_seg_num{seg_num}, m_alloc_clock_hand{0} {} - BlkAllocSegment(const BlkAllocSegment&) = delete; + BlkAllocSegment(BlkAllocSegment const&) = delete; BlkAllocSegment(BlkAllocSegment&&) noexcept = delete; - BlkAllocSegment& operator=(const BlkAllocSegment&) = delete; + BlkAllocSegment& operator=(BlkAllocSegment const&) = delete; BlkAllocSegment& operator=(BlkAllocSegment&&) noexcept = delete; virtual ~BlkAllocSegment() {} @@ -185,9 +185,9 @@ class BlkAllocMetrics : public sisl::MetricsGroup { register_me_to_farm(); } - BlkAllocMetrics(const BlkAllocMetrics&) = delete; + BlkAllocMetrics(BlkAllocMetrics const&) = delete; BlkAllocMetrics(BlkAllocMetrics&&) noexcept = delete; - BlkAllocMetrics& operator=(const BlkAllocMetrics&) = delete; + BlkAllocMetrics& operator=(BlkAllocMetrics const&) = delete; BlkAllocMetrics& operator=(BlkAllocMetrics&&) noexcept = delete; ~BlkAllocMetrics() { deregister_me_from_farm(); } }; @@ -201,24 +201,23 @@ class BlkAllocMetrics : public sisl::MetricsGroup { */ class VarsizeBlkAllocator : public BlkAllocator { public: - VarsizeBlkAllocator(const VarsizeBlkAllocConfig& cfg, bool init, chunk_num_t chunk_id); - VarsizeBlkAllocator(const VarsizeBlkAllocator&) = delete; + VarsizeBlkAllocator(VarsizeBlkAllocConfig const& cfg, bool init, chunk_num_t chunk_id); + VarsizeBlkAllocator(VarsizeBlkAllocator const&) = delete; VarsizeBlkAllocator(VarsizeBlkAllocator&&) noexcept = delete; - VarsizeBlkAllocator& operator=(const VarsizeBlkAllocator&) = delete; + VarsizeBlkAllocator& operator=(VarsizeBlkAllocator const&) = delete; VarsizeBlkAllocator& operator=(VarsizeBlkAllocator&&) noexcept = delete; virtual ~VarsizeBlkAllocator() override; - BlkAllocStatus alloc(BlkId& bid) override; - BlkAllocStatus alloc(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) override; - void free(const std::vector< BlkId >& blk_ids) override; - void free(const BlkId& b) override; + BlkAllocStatus alloc_contiguous(BlkId& bid) override; + BlkAllocStatus alloc_contiguous(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid); + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) override; + BlkAllocStatus alloc(blk_count_t nblks, blk_alloc_hints const& hints, std::vector< BlkId >& out_blkids); + void free(BlkId const& blk_id) override; void inited() override; - BlkAllocStatus alloc_blks_direct(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkids, - blk_count_t& num_allocated); - blk_cap_t available_blks() const override; - blk_cap_t get_used_blks() const override; - bool is_blk_alloced(const BlkId& in_bid, bool use_lock = false) const override; + blk_num_t available_blks() const override; + blk_num_t get_used_blks() const override; + bool is_blk_alloced(BlkId const& in_bid, bool use_lock = false) const override; std::string to_string() const override; nlohmann::json get_metrics_in_json(); @@ -256,17 +255,20 @@ class VarsizeBlkAllocator : public BlkAllocator { // TODO: this fields needs to be passed in from hints and persisted in volume's sb; blk_num_t m_start_portion_num{INVALID_PORTION_NUM}; - blk_cap_t m_blks_per_seg{1}; + 
blk_num_t m_blks_per_seg{1}; blk_num_t m_portions_per_seg{1}; private: static void sweeper_thread(size_t thread_num); bool allocator_state_machine(); + blk_count_t alloc_blks_slab(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid); + blk_count_t alloc_blks_direct(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkids); + blk_count_t free_blks_slab(MultiBlkId const& b); + blk_count_t free_blks_direct(MultiBlkId const& b); + #ifdef _PRERELEASE - bool is_set_on_bitmap(const BlkId& b) const; - void alloc_sanity_check(blk_count_t nblks, const blk_alloc_hints& hints, - const std::vector< BlkId >& out_blkids) const; + void alloc_sanity_check(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId const& out_blkids) const; #endif // Sweep and cache related functions @@ -277,7 +279,7 @@ class VarsizeBlkAllocator : public BlkAllocator { void fill_cache(BlkAllocSegment* seg, blk_cache_fill_session& fill_session); void fill_cache_in_portion(blk_num_t portion_num, blk_cache_fill_session& fill_session); - void free_on_bitmap(const BlkId& b); + void free_on_bitmap(BlkId const& b); //////////////////////////////////////////// Convenience routines /////////////////////////////////////////// ///////////////////// Physical page related routines //////////////////////// @@ -296,8 +298,8 @@ class VarsizeBlkAllocator : public BlkAllocator { } ///////////////////// Cache Entry related routines //////////////////////// - void blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, std::vector< BlkId >& out_blkids); - BlkId blk_cache_entry_to_blkid(const blk_cache_entry& e); - blk_cache_entry blkid_to_blk_cache_entry(const BlkId& bid, blk_temp_t preferred_level = 1); + // void blk_cache_entries_to_blkids(const std::vector< blk_cache_entry >& entries, MultiBlkId& out_blkids); + BlkId blk_cache_entry_to_blkid(blk_cache_entry const& e); + blk_cache_entry blkid_to_blk_cache_entry(BlkId const& bid, blk_temp_t preferred_level = 1); }; } // namespace homestore diff --git a/src/lib/blkdata_svc/blk_read_tracker.cpp b/src/lib/blkdata_svc/blk_read_tracker.cpp index 65da3f55f..1d90618a9 100644 --- a/src/lib/blkdata_svc/blk_read_tracker.cpp +++ b/src/lib/blkdata_svc/blk_read_tracker.cpp @@ -32,18 +32,14 @@ void BlkReadTracker::merge(const BlkId& blkid, int64_t new_ref_count, // Don't move alignment handling outside of this function, because the nblks between (first and last blk num after // alignment) could be larger than 255 which exceeds a BlkId can hold; // - auto cur_blk_num_aligned = s_cast< blk_num_t >(sisl::round_down(blkid.get_blk_num(), entries_per_record())); - auto last_blk_num_aligned_up = s_cast< blk_num_t >(sisl::round_up(blkid.get_last_blk_num(), entries_per_record()) - - 1); // -1 so that it does not cover next base id; - if (blkid.get_last_blk_num() % entries_per_record() == 0) { - // if last blk num happens to be aligned, it actually belongs to next base id, so add 1 back; - last_blk_num_aligned_up += 1; - } + auto cur_blk_num_aligned = s_cast< blk_num_t >(sisl::round_down(blkid.blk_num(), entries_per_record())); + auto last_blk_num_aligned_up = + s_cast< blk_num_t >(sisl::round_up(blkid.blk_num() + blkid.blk_count() + 1, entries_per_record()) - 1); [[maybe_unused]] bool waiter_rescheduled{false}; // everything is aligned after this point, so we don't need to handle sub_range in a base blkid; while (cur_blk_num_aligned <= last_blk_num_aligned_up) { - BlkId base_blkid{cur_blk_num_aligned, entries_per_record(), blkid.get_chunk_num()}; + BlkId 
base_blkid{cur_blk_num_aligned, entries_per_record(), blkid.chunk_num()}; BlkTrackRecord rec; const auto rec_found = m_pending_reads_map.get(base_blkid, rec); @@ -98,8 +94,16 @@ void BlkReadTracker::merge(const BlkId& blkid, int64_t new_ref_count, void BlkReadTracker::insert(const BlkId& blkid) { merge(blkid, 1, nullptr); } void BlkReadTracker::remove(const BlkId& blkid) { merge(blkid, -1, nullptr); } -void BlkReadTracker::wait_on(const BlkId& blkid, after_remove_cb_t&& after_remove_cb) { - merge(blkid, 0, std::make_shared< blk_track_waiter >(std::move(after_remove_cb))); +void BlkReadTracker::wait_on(MultiBlkId const& blkids, after_remove_cb_t&& after_remove_cb) { + if (blkids.num_pieces() == 1) { + merge(blkids, 0, std::make_shared< blk_track_waiter >(std::move(after_remove_cb))); + } else { + auto waiter = std::make_shared< blk_track_waiter >(std::move(after_remove_cb)); + auto it = blkids.iterate(); + while (auto const b = it.next()) { + merge(*b, 0, waiter); + } + } } uint16_t BlkReadTracker::entries_per_record() const { diff --git a/src/lib/blkdata_svc/blk_read_tracker.hpp b/src/lib/blkdata_svc/blk_read_tracker.hpp index 10c2572e4..ec62d77c4 100644 --- a/src/lib/blkdata_svc/blk_read_tracker.hpp +++ b/src/lib/blkdata_svc/blk_read_tracker.hpp @@ -20,7 +20,7 @@ #include #include #include -#include "homestore/blk.h" +#include namespace homestore { typedef folly::Function< void(void) > after_remove_cb_t; @@ -157,7 +157,7 @@ class BlkReadTracker { * @param blkid : blkid that caller wants to wait on for pending read; * @param after_remove_cb : the callback to be sent after read on this blkid are all completed; */ - void wait_on(const BlkId& blkid, after_remove_cb_t&& after_remove_cb); + void wait_on(MultiBlkId const& blkids, after_remove_cb_t&& after_remove_cb); /** * @brief : get size of the pending map; diff --git a/src/lib/blkdata_svc/blkdata_service.cpp b/src/lib/blkdata_svc/blkdata_service.cpp index 3c52f89de..33ed6fede 100644 --- a/src/lib/blkdata_svc/blkdata_service.cpp +++ b/src/lib/blkdata_svc/blkdata_service.cpp @@ -17,7 +17,7 @@ #include #include "device/chunk.h" #include "device/virtual_dev.hpp" -#include "device/physical_dev.hpp" // vdev_info_block +#include "device/physical_dev.hpp" // vdev_info_block #include "common/homestore_config.hpp" // is_data_drive_hdd #include "common/homestore_assert.hpp" #include "common/error.h" @@ -53,87 +53,161 @@ void BlkDataService::create_vdev(uint64_t size, blk_allocator_type_t alloc_type, // both first_time_boot and recovery path will come here shared< VirtualDev > BlkDataService::open_vdev(const vdev_info& vinfo, bool load_existing) { m_vdev = std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, nullptr, true /* auto_recovery */); - m_page_size = vinfo.blk_size; + m_blk_size = vinfo.blk_size; return m_vdev; } -folly::Future< bool > BlkDataService::async_read(const BlkId& bid, sisl::sg_list& sgs, uint32_t size, - bool part_of_batch) { - m_blk_read_tracker->insert(bid); - HS_DBG_ASSERT_EQ(sgs.iovs.size(), 1, "Expecting iov size to be 1 since reading on one blk."); +static auto collect_all_futures(std::vector< folly::Future< std::error_code > >& futs) { + return folly::collectAllUnsafe(futs).thenValue([](auto&& vf) { + for (auto const& err_c : vf) { + if (sisl_unlikely(err_c.value())) { + auto ec = err_c.value(); + return folly::makeFuture< std::error_code >(std::move(ec)); + } + } + return folly::makeFuture< std::error_code >(std::error_code{}); + }); +} - return m_vdev->async_readv(sgs.iovs.data(), sgs.iovs.size(), size, bid, 
part_of_batch) - .thenValue([this, bid](auto&&) { +folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, uint8_t* buf, uint32_t size, + bool part_of_batch) { + auto do_read = [this](BlkId const& bid, uint8_t* buf, uint32_t size, bool part_of_batch) { + m_blk_read_tracker->insert(bid); + + return m_vdev->async_read(r_cast< char* >(buf), size, bid, part_of_batch).thenValue([this, bid](auto&& ec) { m_blk_read_tracker->remove(bid); - return folly::makeFuture< bool >(true); + return folly::makeFuture< std::error_code >(std::move(ec)); }); + }; + + if (blkid.num_pieces() == 1) { + return do_read(blkid.to_single_blkid(), buf, size, part_of_batch); + } else { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); + + auto it = blkid.iterate(); + while (auto const bid = it.next()) { + uint32_t sz = bid->blk_count() * m_blk_size; + s_futs.emplace_back(do_read(*bid, buf, sz, part_of_batch)); + buf += sz; + } + + return collect_all_futures(s_futs); + } } -folly::Future< bool > BlkDataService::async_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - const std::vector< BlkId >& blkids, bool part_of_batch) { - if (blkids.size() == 1) { - // Shortcut to most common case - return m_vdev->async_writev(sgs.iovs.data(), sgs.iovs.size(), blkids[0], part_of_batch); +folly::Future< std::error_code > BlkDataService::async_read(MultiBlkId const& blkid, sisl::sg_list& sgs, uint32_t size, + bool part_of_batch) { + // TODO: sg_iovs_t should not be passed by value. We need it pass it as const&, but that is failing because + // iovs.data() will then return "const iovec*", but unfortunately all the way down to iomgr, we take iovec* + // instead it can easily take "const iovec*". Until we change this is made as copy by value + auto do_read = [this](BlkId const& bid, sisl::sg_iovs_t iovs, uint32_t size, bool part_of_batch) { + m_blk_read_tracker->insert(bid); + + return m_vdev->async_readv(iovs.data(), iovs.size(), size, bid, part_of_batch) + .thenValue([this, bid](auto&& ec) { + m_blk_read_tracker->remove(bid); + return folly::makeFuture< std::error_code >(std::move(ec)); + }); + }; + + if (blkid.num_pieces() == 1) { + return do_read(blkid.to_single_blkid(), sgs.iovs, size, part_of_batch); } else { - static thread_local std::vector< folly::Future< bool > > s_futs; + static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); + sisl::sg_iterator sg_it{sgs.iovs}; - for (const auto& bid : blkids) { - const auto iovs = sg_it.next_iovs(bid.get_nblks() * m_page_size); - s_futs.emplace_back(m_vdev->async_writev(iovs.data(), iovs.size(), bid, part_of_batch)); + auto blkid_it = blkid.iterate(); + while (auto const bid = blkid_it.next()) { + uint32_t const sz = bid->blk_count() * m_blk_size; + s_futs.emplace_back(do_read(*bid, sg_it.next_iovs(sz), sz, part_of_batch)); } - return folly::collectAllUnsafe(s_futs).thenTry([](auto&&) { return folly::makeFuture< bool >(true); }); + + return collect_all_futures(s_futs); } } -folly::Future< bool > BlkDataService::async_alloc_write(const sisl::sg_list& sgs, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids, bool part_of_batch) { - out_blkids.clear(); +folly::Future< std::error_code > BlkDataService::async_alloc_write(const sisl::sg_list& sgs, + const blk_alloc_hints& hints, MultiBlkId& out_blkids, + bool part_of_batch) { const auto status = alloc_blks(sgs.size, hints, out_blkids); if (status != BlkAllocStatus::SUCCESS) { - return folly::makeFuture< bool >( - 
std::system_error(std::make_error_code(std::errc::resource_unavailable_try_again))); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::resource_unavailable_try_again)); } - return async_write(sgs, hints, out_blkids, part_of_batch); + return async_write(sgs, out_blkids, part_of_batch); } -BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkids) { - HS_DBG_ASSERT_EQ(size % m_page_size, 0, "Non aligned size requested"); - blk_count_t nblks = static_cast< blk_count_t >(size / m_page_size); +folly::Future< std::error_code > BlkDataService::async_write(const char* buf, uint32_t size, MultiBlkId const& blkid, + bool part_of_batch) { + if (blkid.num_pieces() == 1) { + // Shortcut to most common case + return m_vdev->async_write(buf, size, blkid.to_single_blkid(), part_of_batch); + } else { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); - return m_vdev->alloc_blk(nblks, hints, out_blkids); + const char* ptr = buf; + auto blkid_it = blkid.iterate(); + while (auto const bid = blkid_it.next()) { + uint32_t sz = bid->blk_count() * m_blk_size; + s_futs.emplace_back(m_vdev->async_write(ptr, sz, *bid, part_of_batch)); + ptr += sz; + } + return collect_all_futures(s_futs); + } } -void BlkDataService::commit_blk(const BlkId& bid) { m_vdev->commit_blk(bid); } - -blk_list_t BlkDataService::alloc_blks(uint32_t size) { - blk_alloc_hints hints; // default hints - std::vector< BlkId > out_blkids; - const auto status = alloc_blks(size, hints, out_blkids); +folly::Future< std::error_code > BlkDataService::async_write(sisl::sg_list const& sgs, MultiBlkId const& blkid, + bool part_of_batch) { + // TODO: Async write should pass this by value the sgs.size parameter as well, currently vdev write routine + // walks through again all the iovs and then getting the len to pass it down to iomgr. 
This defeats the purpose of + // taking size parameters (which was done exactly done to avoid this walk through) + if (blkid.num_pieces() == 1) { + // Shortcut to most common case + return m_vdev->async_writev(sgs.iovs.data(), sgs.iovs.size(), blkid.to_single_blkid(), part_of_batch); + } else { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; + s_futs.clear(); + sisl::sg_iterator sg_it{sgs.iovs}; - blk_list_t blk_list; - if (status != BlkAllocStatus::SUCCESS) { - LOGERROR("Resouce unavailable!"); - return blk_list; + auto blkid_it = blkid.iterate(); + while (auto const bid = blkid_it.next()) { + const auto iovs = sg_it.next_iovs(bid->blk_count() * m_blk_size); + s_futs.emplace_back(m_vdev->async_writev(iovs.data(), iovs.size(), *bid, part_of_batch)); + } + return collect_all_futures(s_futs); } +} - // convert BlkId to blklist; - for (auto i = 0ul; i < out_blkids.size(); ++i) { - blk_list.emplace_back(out_blkids[i].to_integer()); - } +BlkAllocStatus BlkDataService::alloc_blks(uint32_t size, const blk_alloc_hints& hints, MultiBlkId& out_blkids) { + HS_DBG_ASSERT_EQ(size % m_blk_size, 0, "Non aligned size requested"); + blk_count_t nblks = static_cast< blk_count_t >(size / m_blk_size); + + return m_vdev->alloc_blks(nblks, hints, out_blkids); +} - return blk_list; +void BlkDataService::commit_blk(MultiBlkId const& blkid) { + if (blkid.num_pieces() == 1) { + // Shortcut to most common case + m_vdev->commit_blk(blkid); + } else { + auto it = blkid.iterate(); + while (auto const bid = it.next()) { + m_vdev->commit_blk(*bid); + } + } } -folly::Future< bool > BlkDataService::async_free_blk(const BlkId bid) { +folly::Future< std::error_code > BlkDataService::async_free_blk(MultiBlkId const& bids) { // create blk read waiter instance; - folly::Promise< bool > promise; + folly::Promise< std::error_code > promise; auto f = promise.getFuture(); - m_blk_read_tracker->wait_on(bid, [this, bid, p = std::move(promise)]() mutable { - m_vdev->free_blk(bid); - p.setValue(true); + m_blk_read_tracker->wait_on(bids, [this, bids, p = std::move(promise)]() mutable { + m_vdev->free_blk(bids); + p.setValue(std::error_code{}); }); return f; } diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index 4240fc223..1db2c55dc 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -134,9 +134,9 @@ auto JournalVirtualDev::process_pwrite_offset(size_t len, off_t offset) { } /////////////////////////////// Write Section ////////////////////////////////// -folly::Future< bool > JournalVirtualDev::async_append(const uint8_t* buf, size_t size) { +folly::Future< std::error_code > JournalVirtualDev::async_append(const uint8_t* buf, size_t size) { if (!validate_append_size(size)) { - return folly::makeFuture< bool >(std::system_error(std::make_error_code(std::errc::no_space_on_device))); + return folly::makeFuture< std::error_code >(std::make_error_code(std::errc::no_space_on_device)); } else { auto const [chunk, offset_in_chunk] = process_pwrite_offset(size, m_seek_cursor); m_seek_cursor += size; @@ -157,7 +157,7 @@ folly::Future< bool > JournalVirtualDev::async_append(const uint8_t* buf, size_t * @param cb : callback after write is completed, can be null * */ -folly::Future< bool > JournalVirtualDev::async_pwrite(const uint8_t* buf, size_t size, off_t offset) { +folly::Future< std::error_code > JournalVirtualDev::async_pwrite(const uint8_t* buf, size_t size, off_t offset) { HS_REL_ASSERT_LE(size, m_reserved_sz, "Write size: larger then reserved 
size is not allowed!"); m_reserved_sz -= size; // update reserved size @@ -165,7 +165,7 @@ folly::Future< bool > JournalVirtualDev::async_pwrite(const uint8_t* buf, size_t return async_write(r_cast< const char* >(buf), size, chunk, offset_in_chunk); } -folly::Future< bool > JournalVirtualDev::async_pwritev(const iovec* iov, int iovcnt, off_t offset) { +folly::Future< std::error_code > JournalVirtualDev::async_pwritev(const iovec* iov, int iovcnt, off_t offset) { auto const size = VirtualDev::get_len(iov, iovcnt); // if size is smaller than reserved size, it means write will never be overlapping start offset; @@ -216,7 +216,9 @@ void JournalVirtualDev::sync_next_read(uint8_t* buf, size_t size_rd) { across_chunk = true; } - sync_pread(buf, size_rd, m_seek_cursor); + auto ec = sync_pread(buf, size_rd, m_seek_cursor); + // TODO: Check if we can have tolerate this error and somehow start homestore without replaying or in degraded mode? + HS_REL_ASSERT(!ec, "Error in reading next stream of bytes, proceeding could cause some inconsistency, exiting"); // Update seek cursor after read; m_seek_cursor += size_rd; @@ -224,7 +226,7 @@ void JournalVirtualDev::sync_next_read(uint8_t* buf, size_t size_rd) { m_seek_cursor = m_seek_cursor % size(); } -void JournalVirtualDev::sync_pread(uint8_t* buf, size_t size, off_t offset) { +std::error_code JournalVirtualDev::sync_pread(uint8_t* buf, size_t size, off_t offset) { auto const [chunk, offset_in_chunk] = offset_to_chunk(offset); // if the read count is acrossing chunk, only return what's left in this chunk @@ -236,7 +238,7 @@ void JournalVirtualDev::sync_pread(uint8_t* buf, size_t size, off_t offset) { return sync_read(r_cast< char* >(buf), size, chunk, offset_in_chunk); } -void JournalVirtualDev::sync_preadv(iovec* iov, int iovcnt, off_t offset) { +std::error_code JournalVirtualDev::sync_preadv(iovec* iov, int iovcnt, off_t offset) { uint64_t len = VirtualDev::get_len(iov, iovcnt); auto const [chunk, offset_in_chunk] = offset_to_chunk(offset); @@ -251,7 +253,7 @@ void JournalVirtualDev::sync_preadv(iovec* iov, int iovcnt, off_t offset) { iov[0].iov_len = len; // is this needed? } - sync_readv(iov, iovcnt, chunk, offset_in_chunk); + return sync_readv(iov, iovcnt, chunk, offset_in_chunk); } off_t JournalVirtualDev::lseek(off_t offset, int whence) { diff --git a/src/lib/device/journal_vdev.hpp b/src/lib/device/journal_vdev.hpp index 28c72a6bf..9ecac4342 100644 --- a/src/lib/device/journal_vdev.hpp +++ b/src/lib/device/journal_vdev.hpp @@ -44,7 +44,7 @@ class JournalVirtualDev : public VirtualDev { off_t m_data_start_offset{0}; // Start offset of where actual data begin for this vdev std::atomic< uint64_t > m_write_sz_in_total{0}; // this size will be decreased by truncate and increased by append; bool m_truncate_done{true}; - uint64_t m_reserved_sz{0}; // write size within chunk, used to check chunk boundary; + uint64_t m_reserved_sz{0}; // write size within chunk, used to check chunk boundary; public: /* Create a new virtual dev for these parameters */ @@ -79,7 +79,7 @@ class JournalVirtualDev : public VirtualDev { * * @return : On success, the number of bytes written is returned. On error, -1 is returned. */ - folly::Future< bool > async_append(const uint8_t* buf, size_t count); + folly::Future< std::error_code > async_append(const uint8_t* buf, size_t count); /** * @brief : writes up to count bytes from the buffer starting at buf at offset offset. 
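// --- Editor's illustrative sketch (not part of the patch) -------------------------------------
// The JournalVirtualDev hunks above move failure reporting from bool/void (and exceptions)
// to std::error_code: the async_* futures now carry std::error_code, and sync_pread()/
// sync_preadv() return one. A minimal caller-side sketch of that convention follows, assuming
// the journal_vdev/logging headers of this codebase are included; the helper name, the 4 KB
// buffer and the zero offset are made up for the example, only sync_pread() and LOGERROR
// come from this patch.
#include <array>
#include <system_error>

void read_journal_bytes_example(homestore::JournalVirtualDev* jvdev) {
    std::array< uint8_t, 4096 > buf;
    // The read no longer throws on I/O failure; callers branch on the returned error code.
    std::error_code ec = jvdev->sync_pread(buf.data(), buf.size(), 0 /* offset */);
    if (ec) { LOGERROR("journal sync_pread failed: {}", ec.message()); }
}
// --- end of editor's sketch ---------------------------------------------------------------------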
@@ -95,7 +95,7 @@ class JournalVirtualDev : public VirtualDev { * * @return : On success, the number of bytes read or written is returned, or -1 on error. */ - folly::Future< bool > async_pwrite(const uint8_t* buf, size_t size, off_t offset); + folly::Future< std::error_code > async_pwrite(const uint8_t* buf, size_t size, off_t offset); /** * @brief : writes iovcnt buffers of data described by iov to the offset. @@ -110,7 +110,7 @@ class JournalVirtualDev : public VirtualDev { * * @return : On success, number of bytes written. On error, -1 is returned */ - folly::Future< bool > async_pwritev(const iovec* iov, int iovcnt, off_t offset); + folly::Future< std::error_code > async_pwritev(const iovec* iov, int iovcnt, off_t offset); /// @brief writes up to count bytes from the buffer starting at buf at offset offset. The cursor is not /// changed. pwrite always use offset returned from alloc_next_append_blk to do the write;pwrite should not across @@ -145,9 +145,9 @@ class JournalVirtualDev : public VirtualDev { * @param count : size of buffer * @param offset : the start offset to do read * - * @return : On success, returns the number of bytes. On error, -1 is returned. + * @return : return the error code of the read */ - void sync_pread(uint8_t* buf, size_t count_in, off_t offset); + std::error_code sync_pread(uint8_t* buf, size_t count_in, off_t offset); /** * @brief : read at offset and save output to iov. @@ -159,9 +159,9 @@ class JournalVirtualDev : public VirtualDev { * @param iovcnt : size of iovev * @param offset : the start offset to read * - * @return : return the number of bytes read; On error, -1 is returned. + * @return : return the error code of the read */ - void sync_preadv(iovec* iov, int iovcnt, off_t offset); + std::error_code sync_preadv(iovec* iov, int iovcnt, off_t offset); /** * @brief : repositions the cusor of the device to the argument offset diff --git a/src/lib/device/physical_dev.cpp b/src/lib/device/physical_dev.cpp index 163162efd..122f735d8 100644 --- a/src/lib/device/physical_dev.cpp +++ b/src/lib/device/physical_dev.cpp @@ -118,48 +118,53 @@ PhysicalDev::PhysicalDev(const dev_info& dinfo, int oflags, const pdev_info_head PhysicalDev::~PhysicalDev() { close_device(); } void PhysicalDev::write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { - m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, offset); + auto err_c = m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, offset); if (m_super_blk_in_footer) { auto t_offset = data_end_offset() + offset; - m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, t_offset); + err_c = m_drive_iface->sync_write(m_iodev.get(), c_charptr_cast(buf), sb_size, t_offset); } + + HS_REL_ASSERT(!err_c, "Super block write failed on dev={} at size={} offset={}, homestore will go down", m_devname, + sb_size, offset); } -void PhysicalDev::read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { - m_drive_iface->sync_read(m_iodev.get(), charptr_cast(buf), sb_size, offset); +std::error_code PhysicalDev::read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset) { + return m_drive_iface->sync_read(m_iodev.get(), charptr_cast(buf), sb_size, offset); } void PhysicalDev::close_device() { close_and_uncache_dev(m_devname, m_iodev); } -folly::Future< bool > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_write(const char* data, uint32_t size, uint64_t offset, 
+ bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_write(m_iodev.get(), data, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_writev(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_read(char* data, uint32_t size, uint64_t offset, + bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_read(m_iodev.get(), data, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch) { +folly::Future< std::error_code > PhysicalDev::async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch) { HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); return m_drive_iface->async_readv(m_iodev.get(), iov, iovcnt, size, offset, part_of_batch); } -folly::Future< bool > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { +folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { return m_drive_iface->async_write_zero(m_iodev.get(), size, offset); } #if 0 -folly::Future< bool > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { +folly::Future< std::error_code > PhysicalDev::async_write_zero(uint64_t size, uint64_t offset) { return m_drive_iface->async_write_zero(m_iodev.get(), size, offset).thenError([this](auto const& e) -> bool { LOGERROR("Error on async_write_zero: exception={}", e.what()); device_manager_mutable()->handle_error(this); @@ -168,62 +173,48 @@ folly::Future< bool > PhysicalDev::async_write_zero(uint64_t size, uint64_t offs } #endif -folly::Future< bool > PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); } +folly::Future< std::error_code > PhysicalDev::queue_fsync() { return m_drive_iface->queue_fsync(m_iodev.get()); } -void PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_write(m_iodev.get(), data, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +__attribute__((no_sanitize_address)) static auto get_current_time() { return Clock::now(); } + +std::error_code PhysicalDev::sync_write(const char* data, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + auto const start_time = get_current_time(); + auto const ret = m_drive_iface->sync_write(m_iodev.get(), data, size, offset); + HISTOGRAM_OBSERVE(m_metrics, 
drive_write_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +std::error_code PhysicalDev::sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, write_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_write_count, 1); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_writev(m_iodev.get(), iov, iovcnt, size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_write_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_read(m_iodev.get(), data, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +std::error_code PhysicalDev::sync_read(char* data, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_read(m_iodev.get(), data, size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { - try { - HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); - COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); - auto const start_time = Clock::now(); - m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset); - HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); - } catch (const std::system_error& e) { - // device_manager_mutable()->handle_error(this); - throw std::system_error(e.code(), fmt::format("dev_name: {}: {}", m_devname, e.what())); - } +std::error_code PhysicalDev::sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset) { + HISTOGRAM_OBSERVE(m_metrics, read_io_sizes, (((size - 1) / 1024) + 1)); + COUNTER_INCREMENT(m_metrics, drive_sync_read_count, 1); + auto const start_time = Clock::now(); + auto const ret = m_drive_iface->sync_readv(m_iodev.get(), iov, iovcnt, size, offset); + HISTOGRAM_OBSERVE(m_metrics, drive_read_latency, get_elapsed_time_us(start_time)); + return ret; } -void PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) { - m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); +std::error_code PhysicalDev::sync_write_zero(uint64_t size, uint64_t offset) { + return m_drive_iface->sync_write_zero(m_iodev.get(), size, offset); } void 
PhysicalDev::submit_batch() { m_drive_iface->submit_batch(); } diff --git a/src/lib/device/physical_dev.hpp b/src/lib/device/physical_dev.hpp index cb74d1ff0..951e61f34 100644 --- a/src/lib/device/physical_dev.hpp +++ b/src/lib/device/physical_dev.hpp @@ -147,7 +147,7 @@ class PhysicalDev { static first_block read_first_block(const std::string& devname, int oflags); static uint64_t get_dev_size(const std::string& devname); - void read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); + std::error_code read_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); void write_super_block(uint8_t* buf, uint32_t sb_size, uint64_t offset); void close_device(); @@ -194,20 +194,21 @@ class PhysicalDev { const std::string& get_devname() const { return m_devname; } /////////////////////////////////////// IO Methods ////////////////////////////////////////// - folly::Future< bool > async_write(const char* data, uint32_t size, uint64_t offset, bool part_of_batch = false); - folly::Future< bool > async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch = false); - folly::Future< bool > async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch = false); - folly::Future< bool > async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, - bool part_of_batch = false); - folly::Future< bool > async_write_zero(uint64_t size, uint64_t offset); - folly::Future< bool > queue_fsync(); - - void sync_write(const char* data, uint32_t size, uint64_t offset); - void sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset); - void sync_read(char* data, uint32_t size, uint64_t offset); - void sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset); - void sync_write_zero(uint64_t size, uint64_t offset); + folly::Future< std::error_code > async_write(const char* data, uint32_t size, uint64_t offset, + bool part_of_batch = false); + folly::Future< std::error_code > async_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch = false); + folly::Future< std::error_code > async_read(char* data, uint32_t size, uint64_t offset, bool part_of_batch = false); + folly::Future< std::error_code > async_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset, + bool part_of_batch = false); + folly::Future< std::error_code > async_write_zero(uint64_t size, uint64_t offset); + folly::Future< std::error_code > queue_fsync(); + + std::error_code sync_write(const char* data, uint32_t size, uint64_t offset); + std::error_code sync_writev(const iovec* iov, int iovcnt, uint32_t size, uint64_t offset); + std::error_code sync_read(char* data, uint32_t size, uint64_t offset); + std::error_code sync_readv(iovec* iov, int iovcnt, uint32_t size, uint64_t offset); + std::error_code sync_write_zero(uint64_t size, uint64_t offset); void submit_batch(); ///////////// Parameters Getters /////////////////////// diff --git a/src/lib/device/vchunk.cpp b/src/lib/device/vchunk.cpp index f47c372ee..e2430219c 100644 --- a/src/lib/device/vchunk.cpp +++ b/src/lib/device/vchunk.cpp @@ -23,7 +23,7 @@ void VChunk::set_user_private(const sisl::blob& data) { m_internal_chunk->set_us const uint8_t* VChunk::get_user_private() const { return m_internal_chunk->user_private(); }; -blk_cap_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } +blk_num_t VChunk::available_blks() const { return m_internal_chunk->blk_allocator()->available_blks(); } uint32_t VChunk::get_pdev_id() const 
{ return m_internal_chunk->physical_dev()->pdev_id(); } diff --git a/src/lib/device/virtual_dev.cpp b/src/lib/device/virtual_dev.cpp index 0d771d4ab..6bcc4aebc 100644 --- a/src/lib/device/virtual_dev.cpp +++ b/src/lib/device/virtual_dev.cpp @@ -81,7 +81,7 @@ static std::shared_ptr< BlkAllocator > create_blk_allocator(blk_allocator_type_t } } -VirtualDev::VirtualDev(DeviceManager& dmgr, const vdev_info& vinfo, vdev_event_cb_t event_cb, bool is_auto_recovery) : +VirtualDev::VirtualDev(DeviceManager& dmgr, vdev_info const& vinfo, vdev_event_cb_t event_cb, bool is_auto_recovery) : m_vdev_info{vinfo}, m_dmgr{dmgr}, m_name{vinfo.name}, @@ -124,8 +124,8 @@ void VirtualDev::add_chunk(cshared< Chunk >& chunk, bool is_fresh_chunk) { m_chunk_selector->add_chunk(chunk); } -folly::Future< bool > VirtualDev::async_format() { - static thread_local std::vector< folly::Future< bool > > s_futs; +folly::Future< std::error_code > VirtualDev::async_format() { + static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); for (auto& chunk : m_all_chunks) { @@ -134,36 +134,42 @@ folly::Future< bool > VirtualDev::async_format() { chunk->start_offset()); s_futs.emplace_back(pdev->async_write_zero(chunk->size(), chunk->start_offset())); } - return folly::collectAllUnsafe(s_futs).thenTry([](auto&&) { return folly::makeFuture< bool >(true); }); + return folly::collectAllUnsafe(s_futs).thenTry([](auto&& t) { + for (const auto& err_c : t.value()) { + if (sisl_unlikely(err_c.value())) { return folly::makeFuture< std::error_code >(err_c); } + } + return folly::makeFuture< std::error_code >(std::error_code{}); + }); } /*std::shared_ptr< blkalloc_cp > VirtualDev::attach_prepare_cp(const std::shared_ptr< blkalloc_cp >& cur_ba_cp) { return (Chunk::attach_prepare_cp(cur_ba_cp)); }*/ -bool VirtualDev::is_blk_alloced(const BlkId& blkid) const { - return m_dmgr.get_chunk(blkid.get_chunk_num())->blk_allocator()->is_blk_alloced(blkid); +bool VirtualDev::is_blk_alloced(BlkId const& blkid) const { + return m_dmgr.get_chunk(blkid.chunk_num())->blk_allocator()->is_blk_alloced(blkid); } -BlkAllocStatus VirtualDev::commit_blk(const BlkId& blkid) { - Chunk* chunk = m_dmgr.get_chunk_mutable(blkid.get_chunk_num()); +BlkAllocStatus VirtualDev::commit_blk(BlkId const& blkid) { + Chunk* chunk = m_dmgr.get_chunk_mutable(blkid.chunk_num()); HS_LOG(DEBUG, device, "commit_blk: bid {}", blkid.to_string()); return chunk->blk_allocator_mutable()->alloc_on_disk(blkid); } -BlkAllocStatus VirtualDev::alloc_contiguous_blk(blk_count_t nblks, const blk_alloc_hints& hints, BlkId* out_blkid) { +BlkAllocStatus VirtualDev::alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid) { BlkAllocStatus ret; try { - static thread_local std::vector< BlkId > blkid{}; - blkid.clear(); - HS_DBG_ASSERT_EQ(hints.is_contiguous, true); - ret = alloc_blk(nblks, hints, blkid); - if (ret == BlkAllocStatus::SUCCESS) { - HS_REL_ASSERT_EQ(blkid.size(), 1, "out blkid more than 1 entries({}) will lead to blk leak!", blkid.size()); - *out_blkid = std::move(blkid.front()); + MultiBlkId mbid; + if (!hints.is_contiguous) { + HS_DBG_ASSERT(false, "Expected alloc_contiguous_blk call to be with hints.is_contiguous=true"); + blk_alloc_hints adjusted_hints = hints; + adjusted_hints.is_contiguous = true; + ret = alloc_blks(nblks, adjusted_hints, mbid); } else { - HS_DBG_ASSERT_EQ(blkid.size(), 0); + ret = alloc_blks(nblks, hints, mbid); } + HS_REL_ASSERT_EQ(mbid.num_pieces(), 1, "out blkid more than 1 entries will lead to blk 
leak!");
+        out_blkid = mbid.to_single_blkid();
     } catch (const std::exception& e) {
         ret = BlkAllocStatus::FAILED;
         HS_DBG_ASSERT(0, "{}", e.what());
@@ -171,25 +177,7 @@ BlkAllocStatus VirtualDev::alloc_contiguous_blk(blk_count_t nblks, const blk_all
     return ret;
 }
 
-BlkAllocStatus VirtualDev::alloc_blk(uint32_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid) {
-    size_t start_idx = out_blkid.size();
-    while (nblks != 0) {
-        const blk_count_t nblks_op = std::min(BlkId::max_blks_in_op(), s_cast< blk_count_t >(nblks));
-        const auto ret = do_alloc_blk(nblks_op, hints, out_blkid);
-        if (ret != BlkAllocStatus::SUCCESS) {
-            for (auto i = start_idx; i < out_blkid.size(); ++i) {
-                free_blk(out_blkid[i]);
-                out_blkid.erase(out_blkid.begin() + start_idx, out_blkid.end());
-            }
-            return ret;
-        }
-        nblks -= nblks_op;
-    }
-    return BlkAllocStatus::SUCCESS;
-}
-
-BlkAllocStatus VirtualDev::do_alloc_blk(blk_count_t nblks, const blk_alloc_hints& hints,
-                                        std::vector< BlkId >& out_blkid) {
+BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid) {
     try {
         // First select a chunk to allocate it from
         BlkAllocStatus status;
@@ -198,13 +186,19 @@ BlkAllocStatus VirtualDev::do_alloc_blk(blk_count_t nblks, const blk_alloc_hints
         do {
             chunk = m_chunk_selector->select_chunk(nblks, hints).get();
 
-            if (chunk == nullptr) { status = BlkAllocStatus::SPACE_FULL; }
+            if (chunk == nullptr) {
+                status = BlkAllocStatus::SPACE_FULL;
+                break;
+            }
 
-            status = alloc_blk_from_chunk(nblks, hints, out_blkid, chunk);
-            if (status == BlkAllocStatus::SUCCESS || !hints.can_look_for_other_chunk) { break; }
+            status = alloc_blks_from_chunk(nblks, hints, out_blkid, chunk);
+            if ((status == BlkAllocStatus::SUCCESS) || !hints.can_look_for_other_chunk ||
+                (status == BlkAllocStatus::PARTIAL && hints.partial_alloc_ok)) {
+                break;
+            }
         } while (++attempt < m_all_chunks.size());
 
-        if (status != BlkAllocStatus::SUCCESS) {
+        if ((status != BlkAllocStatus::SUCCESS) && (status != BlkAllocStatus::PARTIAL)) {
             LOGERROR("nblks={} failed to alloc after trying to alloc on every chunks {} and devices {}.", nblks);
             COUNTER_INCREMENT(m_metrics, vdev_num_alloc_failure, 1);
         }
@@ -217,41 +211,66 @@ BlkAllocStatus VirtualDev::do_alloc_blk(blk_count_t nblks, const blk_alloc_hints
     }
 }
 
-BlkAllocStatus VirtualDev::alloc_blk_from_chunk(blk_count_t nblks, const blk_alloc_hints& hints,
-                                                std::vector< BlkId >& out_blkid, Chunk* chunk) {
+BlkAllocStatus VirtualDev::alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints,
+                                      std::vector< BlkId >& out_blkids) {
+    // Regular alloc_blks allocates into a single MultiBlkId, but there is an upper limit on how many pieces it
+    // can accommodate. If the caller is ok with receiving multiple MultiBlkIds, this method is used instead.
+    auto h = hints;
+    h.partial_alloc_ok = true;
+    h.is_contiguous = true;
+    blk_count_t nblks_remain = nblks;
+    BlkAllocStatus status;
+
+    do {
+        out_blkids.emplace_back(); // Put an empty MultiBlkId and use that for allocating them
+        BlkId& out_bid = out_blkids.back();
+        status = alloc_contiguous_blks(nblks_remain, h, out_bid);
+
+        auto nblks_this_iter = out_bid.blk_count();
+        nblks_remain = (nblks_remain < nblks_this_iter) ?
0 : (nblks_remain - nblks_this_iter); + } while (nblks_remain); + + return status; +} + +BlkAllocStatus VirtualDev::alloc_blks_from_chunk(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid, + Chunk* chunk) { #ifdef _PRERELEASE if (auto const fake_status = iomgr_flip::instance()->get_test_flip< uint32_t >("blk_allocation_flip", nblks, chunk->vdev_id())) { return static_cast< BlkAllocStatus >(fake_status.get()); } #endif - static thread_local std::vector< BlkId > chunk_blkid{}; - chunk_blkid.clear(); - auto status = chunk->blk_allocator_mutable()->alloc(nblks, hints, chunk_blkid); - if (status == BlkAllocStatus::PARTIAL) { + auto status = chunk->blk_allocator_mutable()->alloc(nblks, hints, out_blkid); + if ((status == BlkAllocStatus::PARTIAL) && (!hints.partial_alloc_ok)) { // free partial result - for (auto const b : chunk_blkid) { - auto const ret = chunk->blk_allocator_mutable()->free_on_realtime(b); + auto it = out_blkid.iterate(); + while (auto const b = it.next()) { + auto const ret = chunk->blk_allocator_mutable()->free_on_realtime(*b); HS_REL_ASSERT(ret, "failed to free on realtime"); } - chunk->blk_allocator_mutable()->free(chunk_blkid); + chunk->blk_allocator_mutable()->free(out_blkid); + out_blkid = MultiBlkId{}; status = BlkAllocStatus::FAILED; - } else if (status == BlkAllocStatus::SUCCESS) { - // append chunk blocks to out blocks - out_blkid.insert(std::end(out_blkid), std::make_move_iterator(std::begin(chunk_blkid)), - std::make_move_iterator(std::end(chunk_blkid))); } + return status; } -/*bool VirtualDev::free_on_realtime(const BlkId& b) { - Chunk* chunk = m_dmgr.get_chunk_mutable(b.get_chunk_num()); +/*bool VirtualDev::free_on_realtime(BlkId const& b) { + Chunk* chunk = m_dmgr.get_chunk_mutable(b.chunk_num()); return chunk->blk_allocator_mutable()->free_on_realtime(b); }*/ -void VirtualDev::free_blk(const BlkId& b) { - Chunk* chunk = m_dmgr.get_chunk_mutable(b.get_chunk_num()); - chunk->blk_allocator_mutable()->free(b); +void VirtualDev::free_blk(BlkId const& b) { + if (b.is_multi()) { + MultiBlkId const& mb = r_cast< MultiBlkId const& >(b); + Chunk* chunk = m_dmgr.get_chunk_mutable(mb.chunk_num()); + chunk->blk_allocator_mutable()->free(mb); + } else { + Chunk* chunk = m_dmgr.get_chunk_mutable(b.chunk_num()); + chunk->blk_allocator_mutable()->free(b); + } } void VirtualDev::recovery_done() { @@ -261,7 +280,7 @@ void VirtualDev::recovery_done() { } } -uint64_t VirtualDev::get_len(const iovec* iov, const int iovcnt) { +uint64_t VirtualDev::get_len(const iovec* iov, int iovcnt) { uint64_t len{0}; for (int i{0}; i < iovcnt; ++i) { len += iov[i].iov_len; @@ -270,7 +289,10 @@ uint64_t VirtualDev::get_len(const iovec* iov, const int iovcnt) { } ////////////////////////// async write section ////////////////////////////////// -folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, const BlkId& bid, bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32_t size, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_write needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto* pdev = chunk->physical_dev_mutable(); @@ -283,8 +305,8 @@ folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, co return pdev->async_write(buf, size, dev_offset, part_of_batch); } -folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, - 
uint64_t offset_in_chunk) { +folly::Future< std::error_code > VirtualDev::async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, + uint64_t offset_in_chunk) { auto const dev_offset = chunk->start_offset() + offset_in_chunk; auto* pdev = chunk->physical_dev_mutable(); @@ -296,8 +318,10 @@ folly::Future< bool > VirtualDev::async_write(const char* buf, uint32_t size, cs return pdev->async_write(buf, size, dev_offset, false /* part_of_batch */); } -folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcnt, const BlkId& bid, - bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_writev needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto const size = get_len(iov, iovcnt); @@ -311,8 +335,8 @@ folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcn return pdev->async_writev(iov, iovcnt, size, dev_offset, part_of_batch); } -folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, - uint64_t offset_in_chunk) { +folly::Future< std::error_code > VirtualDev::async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, + uint64_t offset_in_chunk) { auto const dev_offset = chunk->start_offset() + offset_in_chunk; auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); @@ -326,17 +350,22 @@ folly::Future< bool > VirtualDev::async_writev(const iovec* iov, const int iovcn } ////////////////////////// sync write section ////////////////////////////////// -void VirtualDev::sync_write(const char* buf, uint32_t size, const BlkId& bid) { +std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_write needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); - chunk->physical_dev_mutable()->sync_write(buf, size, dev_offset); + return chunk->physical_dev_mutable()->sync_write(buf, size, dev_offset); } -void VirtualDev::sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { - chunk->physical_dev_mutable()->sync_write(buf, size, chunk->start_offset() + offset_in_chunk); +std::error_code VirtualDev::sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, + uint64_t offset_in_chunk) { + return chunk->physical_dev_mutable()->sync_write(buf, size, chunk->start_offset() + offset_in_chunk); } -void VirtualDev::sync_writev(const iovec* iov, int iovcnt, const BlkId& bid) { +std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_writev needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto const size = get_len(iov, iovcnt); @@ -347,10 +376,11 @@ void VirtualDev::sync_writev(const iovec* iov, int iovcnt, const BlkId& bid) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_writev(iov, iovcnt, size, dev_offset); + return pdev->sync_writev(iov, iovcnt, size, dev_offset); } -void VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { +std::error_code VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, + 
uint64_t offset_in_chunk) { uint64_t const dev_offset = chunk->start_offset() + offset_in_chunk; auto const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); @@ -360,35 +390,44 @@ void VirtualDev::sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chu COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_writev(iov, iovcnt, size, dev_offset); + return pdev->sync_writev(iov, iovcnt, size, dev_offset); } ////////////////////////////////// async read section /////////////////////////////////////////////// -folly::Future< bool > VirtualDev::async_read(char* buf, uint64_t size, const BlkId& bid, bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_read(char* buf, uint64_t size, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_read needs individual pieces of blkid - not MultiBlkid"); + Chunk* pchunk; uint64_t const dev_offset = to_dev_offset(bid, &pchunk); return pchunk->physical_dev_mutable()->async_read(buf, size, dev_offset, part_of_batch); } -folly::Future< bool > VirtualDev::async_readv(iovec* iovs, int iovcnt, uint64_t size, const BlkId& bid, - bool part_of_batch) { +folly::Future< std::error_code > VirtualDev::async_readv(iovec* iovs, int iovcnt, uint64_t size, BlkId const& bid, + bool part_of_batch) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "async_readv needs individual pieces of blkid - not MultiBlkid"); + Chunk* pchunk; uint64_t const dev_offset = to_dev_offset(bid, &pchunk); return pchunk->physical_dev_mutable()->async_readv(iovs, iovcnt, size, dev_offset, part_of_batch); } ////////////////////////////////////////// sync read section //////////////////////////////////////////// -void VirtualDev::sync_read(char* buf, uint32_t size, const BlkId& bid) { +std::error_code VirtualDev::sync_read(char* buf, uint32_t size, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_read needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); - chunk->physical_dev_mutable()->sync_read(buf, size, dev_offset); + return chunk->physical_dev_mutable()->sync_read(buf, size, dev_offset); } -void VirtualDev::sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { - chunk->physical_dev_mutable()->sync_read(buf, size, chunk->start_offset() + offset_in_chunk); +std::error_code VirtualDev::sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { + return chunk->physical_dev_mutable()->sync_read(buf, size, chunk->start_offset() + offset_in_chunk); } -void VirtualDev::sync_readv(iovec* iov, int iovcnt, const BlkId& bid) { +std::error_code VirtualDev::sync_readv(iovec* iov, int iovcnt, BlkId const& bid) { + HS_DBG_ASSERT_EQ(bid.is_multi(), false, "sync_readv needs individual pieces of blkid - not MultiBlkid"); + Chunk* chunk; uint64_t const dev_offset = to_dev_offset(bid, &chunk); auto const size = get_len(iov, iovcnt); @@ -399,10 +438,10 @@ void VirtualDev::sync_readv(iovec* iov, int iovcnt, const BlkId& bid) { COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_readv(iov, iovcnt, size, dev_offset); + return pdev->sync_readv(iov, iovcnt, size, dev_offset); } -void VirtualDev::sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { +std::error_code VirtualDev::sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk) { uint64_t const dev_offset = chunk->start_offset() + offset_in_chunk; auto 
const size = get_len(iov, iovcnt); auto* pdev = chunk->physical_dev_mutable(); @@ -412,10 +451,10 @@ void VirtualDev::sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uin COUNTER_INCREMENT(m_metrics, unalign_writes, 1); } - pdev->sync_readv(iov, iovcnt, size, dev_offset); + return pdev->sync_readv(iov, iovcnt, size, dev_offset); } -folly::Future< bool > VirtualDev::queue_fsync_pdevs() { +folly::Future< std::error_code > VirtualDev::queue_fsync_pdevs() { HS_DBG_ASSERT_EQ(HS_DYNAMIC_CONFIG(device->direct_io_mode), false, "Not expect to do fsync in DIRECT_IO_MODE."); assert(m_pdevs.size() > 0); @@ -424,13 +463,18 @@ folly::Future< bool > VirtualDev::queue_fsync_pdevs() { HS_LOG(TRACE, device, "Flushing pdev {}", pdev->get_devname()); return pdev->queue_fsync(); } else { - static thread_local std::vector< folly::Future< bool > > s_futs; + static thread_local std::vector< folly::Future< std::error_code > > s_futs; s_futs.clear(); for (auto* pdev : m_pdevs) { HS_LOG(TRACE, device, "Flushing pdev {}", pdev->get_devname()); s_futs.emplace_back(pdev->queue_fsync()); } - return folly::collectAllUnsafe(s_futs).thenTry([](auto&&) { return folly::makeFuture< bool >(true); }); + return folly::collectAllUnsafe(s_futs).thenTry([](auto&& t) { + for (const auto& err_c : t.value()) { + if (sisl_unlikely(err_c.value())) { return folly::makeFuture< std::error_code >(err_c); } + } + return folly::makeFuture< std::error_code >(std::error_code{}); + }); } } @@ -543,9 +587,9 @@ void VirtualDev::cp_cleanup(CP*) { } ///////////////////////// VirtualDev Private Methods ///////////////////////////// -uint64_t VirtualDev::to_dev_offset(const BlkId& b, Chunk** chunk) const { - *chunk = m_dmgr.get_chunk_mutable(b.get_chunk_num()); - return uint64_cast(b.get_blk_num()) * block_size() + uint64_cast((*chunk)->start_offset()); +uint64_t VirtualDev::to_dev_offset(BlkId const& b, Chunk** chunk) const { + *chunk = m_dmgr.get_chunk_mutable(b.chunk_num()); + return uint64_cast(b.blk_num()) * block_size() + uint64_cast((*chunk)->start_offset()); } } // namespace homestore diff --git a/src/lib/device/virtual_dev.hpp b/src/lib/device/virtual_dev.hpp index fe0e61ac5..0fff20026 100644 --- a/src/lib/device/virtual_dev.hpp +++ b/src/lib/device/virtual_dev.hpp @@ -56,9 +56,9 @@ class VirtualDevMetrics : public sisl::MetricsGroupWrapper { register_me_to_farm(); } - VirtualDevMetrics(const VirtualDevMetrics&) = delete; + VirtualDevMetrics(VirtualDevMetrics const&) = delete; VirtualDevMetrics(VirtualDevMetrics&&) noexcept = delete; - VirtualDevMetrics& operator=(const VirtualDevMetrics&) = delete; + VirtualDevMetrics& operator=(VirtualDevMetrics const&) = delete; VirtualDevMetrics& operator=(VirtualDevMetrics&&) noexcept = delete; ~VirtualDevMetrics() { deregister_me_from_farm(); } @@ -99,8 +99,8 @@ class VirtualDev { public: VirtualDev(DeviceManager& dmgr, const vdev_info& vinfo, vdev_event_cb_t event_cb, bool is_auto_recovery); - VirtualDev(const VirtualDev& other) = delete; - VirtualDev& operator=(const VirtualDev& other) = delete; + VirtualDev(VirtualDev const& other) = delete; + VirtualDev& operator=(VirtualDev const& other) = delete; VirtualDev(VirtualDev&&) noexcept = delete; VirtualDev& operator=(VirtualDev&&) noexcept = delete; virtual ~VirtualDev() = default; @@ -114,28 +114,31 @@ class VirtualDev { /// @brief Formats the vdev asynchronously by zeroing the entire vdev. 
It will use underlying physical device /// capabilities to zero them if fast zero is possible, otherwise will zero block by block /// @param cb Callback after formatting is completed. - virtual folly::Future< bool > async_format(); + virtual folly::Future< std::error_code > async_format(); /////////////////////// Block Allocation related methods ///////////////////////////// /// @brief This method allocates contigous blocks in the vdev /// @param nblks : Number of blocks to allocate /// @param hints : Hints about block allocation, (specific device to allocate, stream etc) - /// @param out_blkid : Pointer to where allocated BlkId to be placed + /// @param out_blkid : Reference to where allocated BlkId to be placed /// @return BlkAllocStatus : Status about the allocation - virtual BlkAllocStatus alloc_contiguous_blk(blk_count_t nblks, const blk_alloc_hints& hints, BlkId* out_blkid); + virtual BlkAllocStatus alloc_contiguous_blks(blk_count_t nblks, blk_alloc_hints const& hints, BlkId& out_blkid); /// @brief This method allocates blocks in the vdev and it could be non-contiguous, hence multiple BlkIds are /// returned /// @param nblks : Number of blocks to allocate /// @param hints : Hints about block allocation, (specific device to allocate, stream etc) - /// @param out_blkid : Reference to the vector of blkids to be placed. It appends into the vector + /// @param out_blkid : Reference to the MultiBlkd which can hold multiple blkids. /// @return BlkAllocStatus : Status about the allocation - virtual BlkAllocStatus alloc_blk(uint32_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid); + virtual BlkAllocStatus alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid); + + virtual BlkAllocStatus alloc_blks(blk_count_t nblks, blk_alloc_hints const& hints, + std::vector< BlkId >& out_blkids); /// @brief Checks if a given block id is allocated in the in-memory version of the blk allocator /// @param blkid : BlkId to check for allocation /// @return true or false - virtual bool is_blk_alloced(const BlkId& blkid) const; + virtual bool is_blk_alloced(BlkId const& blkid) const; /// @brief Commits the blkid in on-disk version of the blk allocator. The blkid is assumed to be allocated using /// alloc_blk or alloc_contiguous_blk method earlier (either after reboot or prior to reboot). It is not required @@ -144,9 +147,9 @@ class VirtualDev { /// recover Please note that even calling this method is not guaranteed to persisted until checkpoint is taken. /// @param blkid BlkId to commit explicitly. /// @return Allocation Status - virtual BlkAllocStatus commit_blk(const BlkId& blkid); + virtual BlkAllocStatus commit_blk(BlkId const& blkid); - virtual void free_blk(const BlkId& b); + virtual void free_blk(BlkId const& b); /////////////////////// Write API related methods ///////////////////////////// /// @brief Asynchornously write the buffer to the device on a given blkid @@ -156,10 +159,11 @@ class VirtualDev { /// @param part_of_batch : Is this write part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this write request will not be queued. 
/// @return future< bool > Future result of success or failure - folly::Future< bool > async_write(const char* buf, uint32_t size, const BlkId& bid, bool part_of_batch = false); + folly::Future< std::error_code > async_write(const char* buf, uint32_t size, BlkId const& bid, + bool part_of_batch = false); - folly::Future< bool > async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, - uint64_t offset_in_chunk); + folly::Future< std::error_code > async_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, + uint64_t offset_in_chunk); /// @brief Asynchornously write the buffer to the device on a given blkid from vector of buffer /// @param iov : Vector of buffer to write data from @@ -168,31 +172,32 @@ class VirtualDev { /// @param part_of_batch : Is this write part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this write request will not be queued. /// @return future< bool > Future result of success or failure - folly::Future< bool > async_writev(const iovec* iov, int iovcnt, const BlkId& bid, bool part_of_batch = false); + folly::Future< std::error_code > async_writev(const iovec* iov, int iovcnt, BlkId const& bid, + bool part_of_batch = false); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - folly::Future< bool > async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, - uint64_t offset_in_chunk); + folly::Future< std::error_code > async_writev(const iovec* iov, const int iovcnt, cshared< Chunk >& chunk, + uint64_t offset_in_chunk); /// @brief Synchronously write the buffer to the blkid /// @param buf : Buffer to write data from /// @param size : Size of the buffer /// @param bid : BlkId which was previously allocated. It is expected that entire size was allocated previously. /// @return ssize_t: Size of the data actually written. - void sync_write(const char* buf, uint32_t size, const BlkId& bid); + std::error_code sync_write(const char* buf, uint32_t size, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_write(const char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /// @brief Synchronously write the vector of buffers to the blkid /// @param iov : Vector of buffer to write data from /// @param iovcnt : Count of buffer /// @param bid BlkId which was previously allocated. It is expected that entire size was allocated previously. /// @return ssize_t: Size of the data actually written. - void sync_writev(const iovec* iov, int iovcnt, const BlkId& bid); + std::error_code sync_writev(const iovec* iov, int iovcnt, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_writev(const iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /////////////////////// Read API related methods ///////////////////////////// @@ -203,7 +208,7 @@ class VirtualDev { /// @param part_of_batch : Is this read part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this read request will not be queued. 
/// @return future< bool > Future result of success or failure - folly::Future< bool > async_read(char* buf, uint64_t size, const BlkId& bid, bool part_of_batch = false); + folly::Future< std::error_code > async_read(char* buf, uint64_t size, BlkId const& bid, bool part_of_batch = false); /// @brief Asynchronously read the data for a given BlkId to the vector of buffers /// @param iov : Vector of buffer to write read to @@ -213,34 +218,34 @@ class VirtualDev { /// @param part_of_batch : Is this read part of batch io. If true, caller is expected to call submit_batch at /// the end of the batch, otherwise this read request will not be queued. /// @return future< bool > Future result of success or failure - folly::Future< bool > async_readv(iovec* iovs, int iovcnt, uint64_t size, const BlkId& bid, - bool part_of_batch = false); + folly::Future< std::error_code > async_readv(iovec* iovs, int iovcnt, uint64_t size, BlkId const& bid, + bool part_of_batch = false); /// @brief Synchronously read the data for a given BlkId. /// @param buf : Buffer to read data to /// @param size : Size of the buffer /// @param bid : BlkId from data needs to be read /// @return ssize_t: Size of the data actually read. - void sync_read(char* buf, uint32_t size, const BlkId& bid); + std::error_code sync_read(char* buf, uint32_t size, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_read(char* buf, uint32_t size, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /// @brief Synchronously read the data for a given BlkId to vector of buffers /// @param iov : Vector of buffer to write read to /// @param iovcnt : Count of buffer /// @param size : Size of the actual data, it is really to optimize the iovec from iterating again to get size /// @return ssize_t: Size of the data actually read. - void sync_readv(iovec* iov, int iovcnt, const BlkId& bid); + std::error_code sync_readv(iovec* iov, int iovcnt, BlkId const& bid); // TODO: This needs to be removed once Journal starting to use AppendBlkAllocator - void sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); + std::error_code sync_readv(iovec* iov, int iovcnt, cshared< Chunk >& chunk, uint64_t offset_in_chunk); /////////////////////// Other API related methods ///////////////////////////// /// @brief Fsync the underlying physical devices that vdev is sitting on asynchornously /// @return future< bool > Future result with bool to indicate when fsync is actually executed - folly::Future< bool > queue_fsync_pdevs(); + folly::Future< std::error_code > queue_fsync_pdevs(); /// @brief Submit the batch of IOs previously queued as part of async read/write APIs. 
void submit_batch(); @@ -274,7 +279,7 @@ class VirtualDev { uint32_t optimal_page_size() const; uint32_t atomic_page_size() const; - static uint64_t get_len(const iovec* iov, const int iovcnt); + static uint64_t get_len(const iovec* iov, int iovcnt); const std::set< PhysicalDev* >& get_pdevs() const { return m_pdevs; } std::vector< shared< Chunk > > get_chunks() const; shared< Chunk > get_next_chunk(cshared< Chunk >& chunk) const; @@ -283,10 +288,9 @@ class VirtualDev { void update_vdev_private(const sisl::blob& data); private: - BlkAllocStatus do_alloc_blk(blk_count_t nblks, const blk_alloc_hints& hints, std::vector< BlkId >& out_blkid); - uint64_t to_dev_offset(const BlkId& b, Chunk** chunk) const; - BlkAllocStatus alloc_blk_from_chunk(blk_count_t nblks, const blk_alloc_hints& hints, - std::vector< BlkId >& out_blkid, Chunk* chunk); + uint64_t to_dev_offset(BlkId const& b, Chunk** chunk) const; + BlkAllocStatus alloc_blks_from_chunk(blk_count_t nblks, blk_alloc_hints const& hints, MultiBlkId& out_blkid, + Chunk* chunk); }; // place holder for future needs in which components underlying virtualdev needs cp flush context; diff --git a/src/lib/homestore.cpp b/src/lib/homestore.cpp index 033c41611..35fb79e64 100644 --- a/src/lib/homestore.cpp +++ b/src/lib/homestore.cpp @@ -113,7 +113,7 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format m_dev_mgr->format_devices(); hs_utils::set_btree_mempool_size(m_dev_mgr->atomic_page_size({HSDevType::Fast})); - std::vector< folly::Future< bool > > futs; + std::vector< folly::Future< std::error_code > > futs; for (const auto& [svc_type, fparams] : format_opts) { if (fparams.size_pct == 0) { continue; } @@ -133,10 +133,13 @@ void HomeStore::format_and_start(std::map< uint32_t, hs_format_params >&& format } } - try { - if (!futs.empty()) { folly::collectAllUnsafe(futs).get(); } - } catch (const std::exception& e) { HS_REL_ASSERT(false, "IO error during format of vdev, error={}", e.what()); } - + if (!futs.empty()) { + auto tlist = folly::collectAllUnsafe(futs).get(); + for (auto const& t : tlist) { + auto const err = t.value(); + HS_REL_ASSERT(!err, "IO error during format of vdev, error={}", err.message()); + } + } do_start(); } diff --git a/src/lib/index/wb_cache.cpp b/src/lib/index/wb_cache.cpp index 5b117ca03..27899db61 100644 --- a/src/lib/index/wb_cache.cpp +++ b/src/lib/index/wb_cache.cpp @@ -81,11 +81,9 @@ void IndexWBCache::start_flush_threads() { BtreeNodePtr IndexWBCache::alloc_buf(node_initializer_t&& node_initializer) { // Alloc a block of data from underlying vdev - static thread_local std::vector< BlkId > t_blkids; - t_blkids.clear(); - auto ret = m_vdev->alloc_blk(1, blk_alloc_hints{}, t_blkids); + BlkId blkid; + auto ret = m_vdev->alloc_contiguous_blks(1, blk_alloc_hints{}, blkid); if (ret != BlkAllocStatus::SUCCESS) { return nullptr; } - BlkId blkid = t_blkids[0]; // Alloc buffer and initialize the node auto idx_buf = std::make_shared< IndexBuffer >(blkid, m_node_size, m_vdev->align_size()); diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp index ec3be9da9..02f223131 100644 --- a/src/lib/logstore/log_store_service.cpp +++ b/src/lib/logstore/log_store_service.cpp @@ -42,7 +42,7 @@ LogStoreService::LogStoreService() : m_logstore_families{std::make_unique< LogStoreFamily >(DATA_LOG_FAMILY_IDX), std::make_unique< LogStoreFamily >(CTRL_LOG_FAMILY_IDX)} {} -folly::Future< bool > LogStoreService::create_vdev(uint64_t size, logstore_family_id_t family) { +folly::Future< 
std::error_code > LogStoreService::create_vdev(uint64_t size, logstore_family_id_t family) { const auto atomic_page_size = hs()->device_mgr()->atomic_page_size(HSDevType::Fast); hs_vdev_context hs_ctx; diff --git a/src/lib/meta/meta_blk_service.cpp b/src/lib/meta/meta_blk_service.cpp index ac9268e5f..338eb9a80 100644 --- a/src/lib/meta/meta_blk_service.cpp +++ b/src/lib/meta/meta_blk_service.cpp @@ -136,7 +136,7 @@ void MetaBlkService::cache_clear() { void MetaBlkService::read(const BlkId& bid, uint8_t* dest, size_t sz) const { sz = sisl::round_up(sz, align_size()); - HS_DBG_ASSERT_LE(sz, bid.get_nblks() * block_size()); + HS_DBG_ASSERT_LE(sz, bid.blk_count() * block_size()); try { m_sb_vdev->sync_read(r_cast< char* >(dest), sz, bid); } catch (std::exception& e) { HS_REL_ASSERT(0, "Exception: {}", e.what()); } @@ -224,7 +224,7 @@ bool MetaBlkService::scan_and_load_meta_blks(meta_blk_map_t& meta_blks, ovf_hdr_ auto self_recover{false}; while (bid.is_valid()) { - last_mblk_id->set(bid); + *last_mblk_id = bid; // TODO: add a new API in blkstore read to by pass cache; // e.g. take caller's read buf to avoid this extra memory copy; @@ -462,7 +462,7 @@ void MetaBlkService::write_ovf_blk_to_disk(meta_blk_ovf_hdr* ovf_hdr, const uint cur_ptr = const_cast< uint8_t* >(write_context_data) + size_written; if (i < ovf_hdr->h.nbids - 1) { - cur_size = data_bid[i].get_nblks() * block_size(); + cur_size = data_bid[i].blk_count() * block_size(); size_written += cur_size; } else { const size_t remain_sz_to_write = uint64_cast(write_size - size_written); @@ -549,7 +549,7 @@ meta_blk* MetaBlkService::init_meta_blk(BlkId& bid, meta_sub_type type, const ui } // point last mblk to this mblk; - m_last_mblk_id->set(bid); + *m_last_mblk_id = bid; // add to cache; HS_DBG_ASSERT(m_meta_blks.find(bid.to_integer()) == m_meta_blks.end(), @@ -573,7 +573,7 @@ void MetaBlkService::write_meta_blk_ovf(BlkId& out_obid, const uint8_t* context_ // allocate data blocks static thread_local std::vector< BlkId > context_data_blkids{}; context_data_blkids.clear(); - alloc_meta_blk(sisl::round_up(sz, block_size()), context_data_blkids); + alloc_meta_blks(sisl::round_up(sz, block_size()), context_data_blkids); HS_LOG(DEBUG, metablk, "Start to allocate nblks(data): {}, mstore used size: {}", context_data_blkids.size(), m_sb_vdev->used_size()); @@ -603,7 +603,7 @@ void MetaBlkService::write_meta_blk_ovf(BlkId& out_obid, const uint8_t* context_ uint64_t data_size{0}; auto* data_bid = ovf_hdr->get_data_bid_mutable(); for (; (j < ovf_blk_max_num_data_blk()) && (data_blkid_indx < context_data_blkids.size()); ++j) { - data_size += context_data_blkids[data_blkid_indx].data_size(block_size()); + data_size += context_data_blkids[data_blkid_indx].blk_count() * block_size(); data_bid[j] = context_data_blkids[data_blkid_indx++]; } @@ -887,7 +887,7 @@ std::error_condition MetaBlkService::remove_sub_sb(void* cookie) { HS_LOG(DEBUG, metablk, "removing last mblk, change m_last_mblk to bid: {}, [type={}]", prev_bid.to_string(), m_meta_blks[prev_bid.to_integer()]->hdr.h.type); - m_last_mblk_id->set(prev_bid); + *m_last_mblk_id = prev_bid; } // remove the in-memory handle from meta blk map; @@ -925,7 +925,7 @@ void MetaBlkService::free_ovf_blk_chain(const BlkId& obid) { auto* data_bid = ovf_hdr->get_data_bid(); for (decltype(ovf_hdr->h.nbids) i{0}; i < ovf_hdr->h.nbids; ++i) { m_sb_vdev->free_blk(data_bid[i]); - total_nblks_freed += data_bid[i].get_nblks(); + total_nblks_freed += data_bid[i].blk_count(); HS_LOG(DEBUG, metablk, "after freeing data bid: 
{}, mstore used size: {}", data_bid[i].to_string(), m_sb_vdev->used_size()); @@ -933,7 +933,7 @@ void MetaBlkService::free_ovf_blk_chain(const BlkId& obid) { // free on-disk ovf header blk m_sb_vdev->free_blk(cur_obid); - total_nblks_freed += cur_obid.get_nblks(); + total_nblks_freed += cur_obid.blk_count(); HS_LOG(DEBUG, metablk, "after freeing ovf bidid: {}, mstore used size: {}", cur_obid.to_string(), m_sb_vdev->used_size()); @@ -973,15 +973,16 @@ void MetaBlkService::free_meta_blk(meta_blk* mblk) { hs_utils::iobuf_free(uintptr_cast(mblk), sisl::buftag::metablk); } -void MetaBlkService::alloc_meta_blk(uint64_t size, std::vector< BlkId >& bid) { +void MetaBlkService::alloc_meta_blks(uint64_t size, std::vector< BlkId >& bids) { auto const nblks = uint32_cast(size / m_sb_vdev->block_size()); + try { - const auto ret = m_sb_vdev->alloc_blk(nblks, blk_alloc_hints{}, bid); + const auto ret = m_sb_vdev->alloc_blks(nblks, blk_alloc_hints{}, bids); HS_REL_ASSERT_EQ(ret, BlkAllocStatus::SUCCESS); #ifndef NDEBUG uint64_t debug_size{0}; - for (size_t i{0}; i < bid.size(); ++i) { - debug_size += bid[i].data_size(m_sb_vdev->block_size()); + for (auto const& b : bids) { + debug_size += (b.blk_count() * m_sb_vdev->block_size()); } HS_DBG_ASSERT_EQ(debug_size, size); #endif @@ -997,7 +998,7 @@ void MetaBlkService::alloc_meta_blk(BlkId& bid) { hints.is_contiguous = true; try { - const auto ret = m_sb_vdev->alloc_contiguous_blk(1, hints, &bid); + const auto ret = m_sb_vdev->alloc_contiguous_blks(1, hints, bid); HS_REL_ASSERT_EQ(ret, BlkAllocStatus::SUCCESS); } catch (const std::exception& e) { HS_REL_ASSERT(0, "{}", e.what()); } } @@ -1038,7 +1039,7 @@ sisl::byte_array MetaBlkService::read_sub_sb_internal(const meta_blk* mblk) cons for (decltype(ovf_hdr->h.nbids) i{0}; i < ovf_hdr->h.nbids; ++i) { size_t read_sz_per_db{0}; if (i < ovf_hdr->h.nbids - 1) { - read_sz_per_db = data_bid[i].get_nblks() * block_size(); + read_sz_per_db = data_bid[i].blk_count() * block_size(); } else { // it is possible user context data doesn't occupy the whole block, so we need to remember the // size that was written to the last data blk; @@ -1197,7 +1198,7 @@ uint64_t MetaBlkService::meta_size(const void* cookie) const { ++nblks; // ovf header blk; const auto* data_bid = ovf_hdr->get_data_bid(); for (decltype(ovf_hdr->h.nbids) i{0}; i < ovf_hdr->h.nbids; ++i) { - nblks += data_bid[i].get_nblks(); // data blks; + nblks += data_bid[i].blk_count(); // data blks; } obid = ovf_hdr->h.next_bid; } diff --git a/src/lib/meta/meta_sb.hpp b/src/lib/meta/meta_sb.hpp index b61e4c5fc..9eda9b017 100644 --- a/src/lib/meta/meta_sb.hpp +++ b/src/lib/meta/meta_sb.hpp @@ -96,10 +96,10 @@ struct MetaSubRegInfo { // meta blk super block put as 1st block in the block chain; #pragma pack(1) struct meta_blk_sb { - uint32_t magic; // ssb magic + uint32_t magic; // ssb magic uint32_t version; - BlkId8_t next_bid; // next metablk - BlkId8_t bid; + BlkId next_bid; // next metablk + BlkId bid; uint8_t migrated; uint8_t pad[7]; std::string to_string() const { @@ -116,14 +116,14 @@ struct meta_blk_sb { // #pragma pack(1) struct meta_blk_hdr_s { - uint32_t magic; // magic + uint32_t magic; // magic uint32_t version; - uint32_t gen_cnt; // generation count, bump on every update + uint32_t gen_cnt; // generation count, bump on every update crc32_t crc; - BlkId8_t next_bid; // next metablk - BlkId8_t prev_bid; // previous metablk - BlkId8_t ovf_bid; // overflow blk id; - BlkId8_t bid; // current blk id; might not be needd; + BlkId next_bid; // next metablk + 
BlkId prev_bid; // previous metablk + BlkId ovf_bid; // overflow blk id; + BlkId bid; // current blk id; might not be needd; uint64_t context_sz; // total size of context data; if compressed is true, it is the round up of compressed size // that is written to disk; if compressed is false, it is the original size of context data; uint64_t compressed_sz; // compressed size before round up to align_size, used for decompress @@ -171,10 +171,10 @@ struct meta_blk { // single list overflow block chain #pragma pack(1) struct meta_blk_ovf_hdr_s { - uint32_t magic; // ovf magic - uint32_t nbids; // number of data blkids stored in data_bid; - BlkId8_t next_bid; // next ovf blk id; - BlkId8_t bid; // self blkid + uint32_t magic; // ovf magic + uint32_t nbids; // number of data blkids stored in data_bid; + BlkId next_bid; // next ovf blk id; + BlkId bid; // self blkid uint64_t context_sz; }; #pragma pack() diff --git a/src/lib/replication/repl_service.cpp b/src/lib/replication/repl_service.cpp new file mode 100644 index 000000000..52cdca413 --- /dev/null +++ b/src/lib/replication/repl_service.cpp @@ -0,0 +1,122 @@ +#include + +#include +#include +#include +#include + +#include +#include "service/repl_backend.h" +#include "service/home_repl_backend.h" + +namespace homestore { +ReplicationServiceImpl::ReplicationServiceImpl(std::unique_ptr< ReplServiceCallbacks > cbs) : + m_svc_cbs{std::move(cbs)} { + m_messaging = std::make_shared< nuraft_mesg::service >(); + + // FIXME: RAFT server parameters, should be a config and reviewed!!! + nuraft::raft_params r_params; + r_params.with_election_timeout_lower(900) + .with_election_timeout_upper(1400) + .with_hb_interval(250) + .with_max_append_size(10) + .with_rpc_failure_backoff(250) + .with_auto_forwarding(true) + .with_snapshot_enabled(1); + + meta_service().register_handler( + "replication", + [this](meta_blk* mblk, sisl::byte_view buf, size_t) { rd_super_blk_found(std::move(buf), voidptr_cast(mblk)); }, + nullptr); + + // This closure is where we initialize new ReplicaSet instances. When NuRaft Messging is asked to join a new group + // either through direct creation or gRPC request it will use this callback to initialize a new state_manager and + // state_machine for the raft_server it constructs. 
+    auto group_type_params = nuraft_mesg::consensus_component::register_params{
+        r_params, [this](int32_t const, std::string const& group_id) mutable {
+            return create_replica_dev(group_id, std::set< std::string, std::less<> >())
+                .via(&folly::QueuedImmediateExecutor::instance())
+                .get();
+            // RELEASE_ASSERT(std::holds_alternative< shared< ReplDev > >(v), "Could Not Create ReplicaSet!");
+            // return std::get< shared< ReplDev > >(v);
+        }};
+    // m_messaging->register_mgr_type("homestore", group_type_params);
+}
+
+void ReplicationServiceImpl::create_vdev(uint64_t size) {
+    auto const atomic_page_size = hs()->device_mgr()->atomic_page_size(HSDevType::Data);
+    hs_vdev_context vdev_ctx;
+    vdev_ctx.type = hs_vdev_type_t::REPL_DATA_VDEV;
+
+    hs()->device_mgr()->create_vdev(vdev_parameters{.vdev_name = "index",
+                                                    .vdev_size = size,
+                                                    .num_chunks = 1,
+                                                    .blk_size = atomic_page_size,
+                                                    .dev_type = HSDevType::Data,
+                                                    .multi_pdev_opts = vdev_multi_pdev_opts_t::ALL_PDEV_STRIPED,
+                                                    .context_data = vdev_ctx.to_blob()});
+}
+
+shared< VirtualDev > ReplicationServiceImpl::open_vdev(const vdev_info& vinfo, bool load_existing) {
+    m_vdev = std::make_shared< VirtualDev >(*(hs()->device_mgr()), vinfo, m_svc_cbs->blk_allocator_type(),
+                                            m_svc_cbs->chunk_selector(), nullptr, true /* auto_recovery */);
+    return m_vdev;
+}
+
+ReplAsyncResult< shared< ReplDev > >
+ReplicationServiceImpl::create_replica_dev(std::string const& group_id,
+                                           std::set< std::string, std::less<> >&& members) {
+    superblk< repl_dev_superblk > rd_sb;
+    rd_sb.create(sizeof(repl_dev_superblk));
+    rd_sb->gid = group_id;
+    return folly::makeSemiFuture< shared< ReplDev > >(open_replica_dev(rd_sb, false /* load_existing */));
+}
+
+folly::SemiFuture< ReplServiceError > ReplicationServiceImpl::replace_member(std::string const& group_id,
+                                                                             std::string const& member_out,
+                                                                             std::string const& member_in) const {
+    return folly::makeSemiFuture(ReplServiceError::CANCELLED);
+}
+
+ReplAsyncResult< shared< ReplDev > > ReplicationServiceImpl::get_replica_dev(std::string const& group_id) const {
+    std::unique_lock lg(m_rd_map_mtx);
+    if (auto it = m_rd_map.find(group_id); it != m_rd_map.end()) { return it->second; }
+    return ReplServiceError::SERVER_NOT_FOUND;
+}
+
+void ReplicationServiceImpl::iterate_replica_devs(std::function< void(cshared< ReplDev >&) > const& cb) {
+    std::unique_lock lg(m_rd_map_mtx);
+    for (const auto& [uuid, rd] : m_rd_map) {
+        cb(rd);
+    }
+}
+
+shared< ReplDev > ReplicationServiceImpl::open_replica_dev(superblk< repl_dev_superblk > const& rd_sb,
+                                                           bool load_existing) {
+    auto it = m_rd_map.end();
+    bool happened = false;
+
+    {
+        std::unique_lock lg(m_rd_map_mtx);
+        std::tie(it, happened) = m_rd_map.emplace(std::make_pair(rd_sb->gid, nullptr));
+    }
+    DEBUG_ASSERT(m_rd_map.end() != it, "Could not insert into map!");
+    if (!happened) { return it->second; }
+
+    auto repl_dev = std::make_shared< ReplDevImpl >(rd_sb, load_existing);
+    it->second = repl_dev;
+    repl_dev->attach_listener(std::move(m_svc_cbs->on_repl_dev_init(repl_dev)));
+
+    return repl_dev;
+}
+
+void ReplicationServiceImpl::rd_super_blk_found(sisl::byte_view const& buf, void* meta_cookie) {
+    superblk< repl_dev_superblk > rd_sb;
+    rd_sb.load(buf, meta_cookie);
+    DEBUG_ASSERT_EQ(rd_sb->get_magic(), home_rs_superblk::REPLICA_DEV_SB_MAGIC, "Invalid rdev metablk, magic mismatch");
+    DEBUG_ASSERT_EQ(rd_sb->get_version(), home_rs_superblk::REPLICA_DEV_SB_VERSION, "Invalid version of rdev metablk");
+
+    open_replica_dev(rd_sb, true /* load_existing */);
+}
+
+} // namespace homestore 
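The changes above switch the VirtualDev I/O paths from folly::Future< bool > to folly::Future< std::error_code > (an empty error_code means success) and make allocation fill a MultiBlkId whose individual pieces are what the read/write calls accept, since async_write/sync_write assert is_multi() == false. The sketch below shows one way a caller might stitch these pieces together; it is illustrative only, and the helper name, the explicit block_size parameter, the errc mapping for allocation failure, and the internal header path are assumptions, not part of this change.

// Illustrative sketch only (not part of this patch): allocate possibly non-contiguous blocks,
// then write each piece through the new std::error_code-returning sync path.
#include <cstdint>
#include <system_error>

#include <homestore/blk.h>          // BlkId / MultiBlkId
#include "device/virtual_dev.hpp"   // internal header; path assumed

namespace homestore {

std::error_code alloc_and_write(VirtualDev& vdev, const char* buf, blk_count_t nblks, uint32_t block_size,
                                MultiBlkId& out_bid) {
    // Allocation may return multiple pieces; out_bid collects them.
    if (vdev.alloc_blks(nblks, blk_alloc_hints{}, out_bid) != BlkAllocStatus::SUCCESS) {
        return std::make_error_code(std::errc::no_space_on_device); // mapping chosen for the sketch
    }

    uint64_t offset{0};
    auto it = out_bid.iterate();      // walks the base piece plus any added pieces
    while (auto piece = it.next()) {  // each piece is a plain (single) BlkId
        uint32_t const len = piece->blk_count() * block_size;
        if (auto const ec = vdev.sync_write(buf + offset, len, *piece); ec) { return ec; }
        offset += len;
    }
    return std::error_code{}; // empty error_code == success
}

} // namespace homestore

The async variants follow the same convention: async_write/async_read resolve their folly::Future< std::error_code > with an empty code on success, so callers can chain .thenValue and propagate the first non-empty code, much like queue_fsync_pdevs does above.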
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index fdaa3fa0d..021138bd9 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -37,11 +37,6 @@ if (${build_nonio_tests}) target_link_libraries(test_mem_btree ${COMMON_TEST_DEPS} GTest::gtest) add_test(NAME MemBtree COMMAND test_mem_btree) - set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp) - add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES}) - target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) - add_test(NAME IndexBtree COMMAND test_index_btree) - add_executable(test_blk_read_tracker) target_sources(test_blk_read_tracker PRIVATE test_blk_read_tracker.cpp ../lib/blkdata_svc/blk_read_tracker.cpp ../lib/blkalloc/blk.cpp) target_link_libraries(test_blk_read_tracker ${COMMON_TEST_DEPS} GTest::gtest) @@ -61,10 +56,22 @@ if (${build_nonio_tests}) target_sources(test_append_blk_allocator PRIVATE test_append_blkalloc.cpp) target_link_libraries(test_append_blk_allocator homestore ${COMMON_TEST_DEPS} GTest::gmock) add_test(NAME AppendBlkAlloc COMMAND test_append_blk_allocator) + set_property(TEST AppendBlkAlloc PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") + + set(TEST_BLKID_SOURCES test_blkid.cpp ../lib/blkalloc/blk.cpp) + add_executable(test_blkid ${TEST_BLKID_SOURCES}) + target_link_libraries(test_blkid ${COMMON_TEST_DEPS} GTest::gtest) + add_test(NAME TestBlkid COMMAND test_blkid) + endif() can_build_io_tests(io_tests) if (${io_tests}) + set(TEST_INDEXBTREE_SOURCE_FILES test_index_btree.cpp) + add_executable(test_index_btree ${TEST_INDEXBTREE_SOURCE_FILES}) + target_link_libraries(test_index_btree homestore ${COMMON_TEST_DEPS} GTest::gtest) + add_test(NAME IndexBtree COMMAND test_index_btree) + set_property(TEST IndexBtree PROPERTY ENVIRONMENT "ASAN_OPTIONS=detect_stack_use_after_return=true") add_executable(test_data_service) target_sources(test_data_service PRIVATE test_data_service.cpp) diff --git a/src/tests/test_append_blkalloc.cpp b/src/tests/test_append_blkalloc.cpp index 27125f810..4c0c01e4a 100644 --- a/src/tests/test_append_blkalloc.cpp +++ b/src/tests/test_append_blkalloc.cpp @@ -104,28 +104,23 @@ class AppendBlkAllocatorTest : public testing::Test { auto sg_read_ptr = std::make_shared< sisl::sg_list >(); write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, sg_read_ptr, this](const std::vector< BlkId >& out_bids) mutable { + .thenValue([sg_write_ptr, sg_read_ptr, this](auto&& written_bid_ptr) mutable { // this will be called in write io completion cb; LOGINFO("after_write_cb: Write completed;"); - HS_DBG_ASSERT_EQ(out_bids.size(), 1); + iovec iov; + iov.iov_len = written_bid_ptr->blk_count() * inst().get_blk_size(); + iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); + sg_read_ptr->iovs.push_back(iov); + sg_read_ptr->size = iov.iov_len; - const auto num_iovs = out_bids.size(); - - for (auto i = 0ul; i < num_iovs; ++i) { - struct iovec iov; - iov.iov_len = out_bids[i].get_nblks() * inst().get_page_size(); - iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); - sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; - } - - LOGINFO("Step 2: async read on blkid: {}", out_bids[0].to_string()); - return inst().async_read(out_bids[0], *sg_read_ptr, sg_read_ptr->size); + LOGINFO("Step 2: async read on blkid: {}", written_bid_ptr->to_string()); + return inst().async_read(*written_bid_ptr, *sg_read_ptr, sg_read_ptr->size); }) - .thenValue([this, sg_write_ptr, sg_read_ptr](auto) mutable 
{ + .thenValue([this, sg_write_ptr, sg_read_ptr](auto err) mutable { + RELEASE_ASSERT(!err, "read failured"); const auto equal = test_common::HSTestHelper::compare(*sg_read_ptr, *sg_write_ptr); - assert(equal); + RELEASE_ASSERT(equal, "read/write mismatch"); LOGINFO("Read completed;"); free(*sg_write_ptr); @@ -138,24 +133,19 @@ class AppendBlkAllocatorTest : public testing::Test { void write_io_free_blk(const uint64_t io_size) { std::shared_ptr< sisl::sg_list > sg_write_ptr = std::make_shared< sisl::sg_list >(); - auto futs = write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, this](const std::vector< BlkId >& out_bids) { - LOGINFO("after_write_cb: Write completed;"); - free(*sg_write_ptr); - - std::vector< folly::Future< bool > > futs; - for (const auto& free_bid : out_bids) { - LOGINFO("Step 2: started async_free_blk: {}", free_bid.to_string()); - auto f = inst().async_free_blk(free_bid); - futs.emplace_back(std::move(f)); - } - return futs; - }); - - folly::collectAllUnsafe(futs).then([this](auto) { - LOGINFO("completed async_free_blks"); - this->finish_and_notify(); - }); + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) + .thenValue([sg_write_ptr, this](auto&& written_bid_ptr) { + LOGINFO("after_write_cb: Write completed;"); + free(*sg_write_ptr); + + LOGINFO("Step 2: started async_free_blk: {}", written_bid_ptr->to_string()); + return inst().async_free_blk(*written_bid_ptr); + }) + .thenValue([this](auto&& err) { + RELEASE_ASSERT(!err, "Failed to free blks"); + LOGINFO("completed async_free_blks"); + this->finish_and_notify(); + }); } private: @@ -166,7 +156,7 @@ class AppendBlkAllocatorTest : public testing::Test { // caller should be responsible to call free(sg) to free the iobuf allocated in iovs, // normally it should be freed in after_write_cb; // - folly::Future< std::vector< BlkId > > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs) { + folly::Future< shared< BlkId > > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs) { // TODO: What if iov_len is not multiple of 4Ki? 
HS_DBG_ASSERT_EQ(io_size % (4 * Ki * num_iovs), 0, "Expecting iov_len : {} to be multiple of {}.", io_size / num_iovs, 4 * Ki); @@ -180,15 +170,12 @@ class AppendBlkAllocatorTest : public testing::Test { sg->size += iov_len; } - auto out_bids_ptr = std::make_shared< std::vector< BlkId > >(); + MultiBlkId blkid; return inst() - .async_alloc_write(*(sg.get()), blk_alloc_hints{}, *out_bids_ptr, false /* part_of_batch*/) - .thenValue([sg, this, out_bids_ptr](bool success) { - assert(success); - for (const auto& bid : *out_bids_ptr) { - LOGINFO("bid: {}", bid.to_string()); - } - return folly::makeFuture< std::vector< BlkId > >(std::move(*out_bids_ptr)); + .async_alloc_write(*(sg.get()), blk_alloc_hints{}, blkid, false /* part_of_batch*/) + .thenValue([sg, this, blkid](auto err) { + RELEASE_ASSERT(!err, "Write failure"); + return folly::makeFuture< shared< MultiBlkId > >(std::make_shared< MultiBlkId >(blkid)); }); } diff --git a/src/tests/test_blk_cache_queue.cpp b/src/tests/test_blk_cache_queue.cpp index c91aa4e19..840c921af 100644 --- a/src/tests/test_blk_cache_queue.cpp +++ b/src/tests/test_blk_cache_queue.cpp @@ -77,7 +77,7 @@ struct BlkCacheQueueTest : public ::testing::Test { if (!(fill_session->slab_requirements.empty())) { uint32_t blk_id{0}; for (const auto& slab_cfg : m_cfg.m_per_slab_cfg) { - for (blk_cap_t i{0}; i < slab_cfg.max_entries; ++i) { + for (blk_num_t i{0}; i < slab_cfg.max_entries; ++i) { blk_cache_fill_req fill_req; fill_req.start_blk_num = blk_id; fill_req.nblks = slab_cfg.slab_size; diff --git a/src/tests/test_blkalloc.cpp b/src/tests/test_blkalloc.cpp index e0328c2a2..2fdf2f90c 100644 --- a/src/tests/test_blkalloc.cpp +++ b/src/tests/test_blkalloc.cpp @@ -114,8 +114,8 @@ struct BlkAllocatorTest { for (size_t slab_index{0}; slab_index < slab_distribution.size(); ++slab_index) { cum_pct += slab_distribution[slab_index]; const blk_count_t slab_size{static_cast< blk_count_t >(static_cast< blk_count_t >(1) << slab_index)}; - const blk_cap_t slab_count{ - static_cast< blk_cap_t >((m_total_count / slab_size) * (slab_distribution[slab_index] / 100.0))}; + const blk_num_t slab_count{ + static_cast< blk_num_t >((m_total_count / slab_size) * (slab_distribution[slab_index] / 100.0))}; if (slab_index == 0) { m_slab_alloced_blks[0].m_max_quota = slab_count; } else { @@ -137,7 +137,7 @@ struct BlkAllocatorTest { } [[nodiscard]] bool alloced(const BlkId& bid, const bool track_block_group) { - uint32_t blk_num{static_cast< uint32_t >(bid.get_blk_num())}; + uint32_t blk_num = bid.blk_num(); if (blk_num >= m_total_count) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -145,12 +145,12 @@ struct BlkAllocatorTest { } return false; } - m_alloced_count.fetch_add(bid.get_nblks(), std::memory_order_acq_rel); + m_alloced_count.fetch_add(bid.blk_count(), std::memory_order_acq_rel); - const slab_idx_t slab_idx{m_track_slabs ? nblks_to_idx(bid.get_nblks()) : static_cast< slab_idx_t >(0)}; + const slab_idx_t slab_idx{m_track_slabs ? 
nblks_to_idx(bid.blk_count()) : static_cast< slab_idx_t >(0)}; if (track_block_group) { // add blocks as group to each slab - if (!blk_map(slab_idx).insert(blk_num, bid.get_nblks()).second) { + if (!blk_map(slab_idx).insert(blk_num, bid.blk_count()).second) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; std::cout << "Duplicate alloc of blk=" << blk_num << std::endl; @@ -163,7 +163,7 @@ struct BlkAllocatorTest { } else { // add blocks individually to each slab - for (blk_count_t i{0}; i < bid.get_nblks(); ++i) { + for (blk_count_t i{0}; i < bid.blk_count(); ++i) { if (!blk_list(slab_idx).add(blk_num)) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -176,7 +176,7 @@ struct BlkAllocatorTest { } LOGTRACEMOD(blkalloc, "After Alloced nblks={} blk_range=[{}-{}] skip_list_size={} alloced_count={}", - bid.get_nblks(), blk_num, blk_num + bid.get_nblks() - 1, blk_list(slab_idx).size(), + bid.blk_count(), blk_num, blk_num + bid.blk_count() - 1, blk_list(slab_idx).size(), m_alloced_count.load(std::memory_order_relaxed)); return true; } @@ -381,8 +381,8 @@ struct FixedBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { virtual void SetUp() override{}; virtual void TearDown() override{}; - [[nodiscard]] bool alloc_blk(const BlkAllocStatus exp_status, BlkId& bid, const bool track_block_group) { - const auto ret{m_allocator->alloc(bid)}; + bool alloc_blk(const BlkAllocStatus exp_status, BlkId& bid, const bool track_block_group) { + const auto ret = m_allocator->alloc_contiguous(bid); if (ret != exp_status) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -442,7 +442,7 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { static thread_local std::vector< BlkId > bids; bids.clear(); - const auto ret{m_allocator->alloc(reqd_size, hints, bids)}; + const auto ret = m_allocator->alloc(reqd_size, hints, bids); if (ret != exp_status) { { std::scoped_lock< std::mutex > lock{s_print_mutex}; @@ -465,7 +465,7 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { blk_count_t sz{0}; for (auto& bid : bids) { if (!alloced(bid, track_block_group)) { return false; } - sz += bid.get_nblks(); + sz += bid.blk_count(); } if (sz != reqd_size) { { @@ -546,8 +546,8 @@ struct VarsizeBlkAllocatorTest : public ::testing::Test, BlkAllocatorTest { while (freed_size < rand_size) { const auto bid{ free_random_alloced_sized_blk(rand_size - freed_size, round_nblks, track_block_group)}; - freed_nblks += bid.get_nblks(); - freed_size += bid.get_nblks(); + freed_nblks += bid.blk_count(); + freed_size += bid.blk_count(); } } } diff --git a/src/tests/test_blkid.cpp b/src/tests/test_blkid.cpp new file mode 100644 index 000000000..435e41784 --- /dev/null +++ b/src/tests/test_blkid.cpp @@ -0,0 +1,178 @@ +#include +#include + +#include +#include +#include + +#include + +SISL_LOGGING_INIT(test_blkid, iomgr, flip, io_wd) +SISL_OPTIONS_ENABLE(logging, test_blkid) + +SISL_OPTION_GROUP(test_blkid, + (num_iterations, "", "num_iterations", "number of iterations", + ::cxxopts::value< uint32_t >()->default_value("1"), "number")); + +using namespace homestore; +TEST(BlkIdTest, SingleBlkIdBasic) { + BlkId b1; + ASSERT_EQ(b1.is_valid(), false); + ASSERT_EQ(b1.to_integer(), 0ULL); + ASSERT_EQ(b1.to_string(), "Invalid_Blkid"); + + BlkId b2{10, 5, 1}; + ASSERT_EQ(b2.is_valid(), true); + ASSERT_EQ(b2.blk_num(), 10); + ASSERT_EQ(b2.blk_count(), 5); + ASSERT_EQ(b2.chunk_num(), 1); + ASSERT_EQ(b2.is_multi(), false); + + sisl::blob buf = b2.serialize(); + ASSERT_EQ(buf.size, 
sizeof(uint64_t)); + + BlkId b3; + b3.deserialize(buf, true); + ASSERT_EQ(b3.is_valid(), true); + ASSERT_EQ(b3, b2); + + BlkId b4{10, 6, 1}; + BlkId b5{9, 6, 1}; + BlkId b6{10, 5, 2}; + BlkId b7{10, 5, 1}; + ASSERT_LT(BlkId::compare(b2, b4), 0); + ASSERT_GT(BlkId::compare(b2, b5), 0); + ASSERT_LT(BlkId::compare(b2, b6), 0); + ASSERT_EQ(BlkId::compare(b2, b7), 0); +} + +TEST(BlkIdTest, SingleBlkIdInMap) { + std::map< int, BlkId > m1; + BlkId b1{30, 4, 2}; + m1.emplace(std::pair(84, BlkId{30, 4, 2})); + ASSERT_EQ(m1.at(84), b1); + + std::map< BlkId, int > m2; + m2.insert(std::pair(BlkId{30, 4, 2}, 94)); + m2.insert(std::pair(BlkId{30, 4, 1}, 96)); + + auto const it1 = m2.find(BlkId{30, 4, 2}); + ASSERT_EQ(it1->second, 94); + auto const it2 = m2.find(BlkId{30, 4, 3}); + ASSERT_EQ(it2, m2.cend()); +} + +TEST(BlkIdTest, MultiBlkIdTest) { + MultiBlkId mb1; + ASSERT_EQ(mb1.is_valid(), false); + ASSERT_EQ(mb1.to_string(), "MultiBlks: {}"); + ASSERT_EQ(mb1.is_multi(), true); + ASSERT_EQ(mb1.num_pieces(), 0); + + mb1.add(10, 5, 1); + ASSERT_EQ(mb1.is_valid(), true); + ASSERT_EQ(mb1.blk_num(), 10); + ASSERT_EQ(mb1.blk_count(), 5); + ASSERT_EQ(mb1.chunk_num(), 1); + ASSERT_EQ(mb1.is_multi(), true); + + std::array< BlkId, 5 > abs{BlkId{20, 8, 1}, BlkId{30, 1, 1}, BlkId{60, 9, 1}, BlkId{80, 5, 1}, BlkId{90, 2, 1}}; + for (auto const& b : abs) { + mb1.add(b); + } + ASSERT_EQ(mb1.num_pieces(), 6); + + auto it = mb1.iterate(); + uint32_t i = 0; + while (auto b = it.next()) { + if (i == 0) { + ASSERT_EQ(b->blk_num(), 10); + ASSERT_EQ(b->blk_count(), 5); + } else { + ASSERT_EQ(*b, abs[i - 1]); + } + ++i; + } + ASSERT_EQ(i, 6); + + auto bl = mb1.serialize(); + MultiBlkId mb2; + mb2.add(5, 6, 2); + mb2.add(11, 10, 2); + mb2.deserialize(bl, true); // Overwrite + ASSERT_EQ(mb1, mb2); +} + +TEST(BlkIdTest, MultiBlkIdInMap) { + std::map< MultiBlkId, int > m1; + std::unordered_map< MultiBlkId, int > m2; + + MultiBlkId mb1{30, 4, 2}; + mb1.add(90, 4, 2); + mb1.add(80, 4, 2); + mb1.add(20, 4, 2); + mb1.add(10, 4, 2); + ASSERT_EQ(mb1.num_pieces(), 5); + + m1.insert(std::pair(mb1, 92)); + m2.insert(std::pair(mb1, 92)); + + MultiBlkId mb2{30, 4, 1}; + mb2.add(90, 4, 1); + mb2.add(30, 4, 1); + mb2.add(20, 4, 1); + mb2.add(10, 4, 1); + m1.insert(std::pair(mb2, 89)); + m2.insert(std::pair(mb2, 89)); // Insert exactly same except chunk_id different + + MultiBlkId mb3{30, 4, 1}; + mb3.add(90, 4, 1); + mb3.add(30, 4, 1); + mb3.add(20, 4, 1); + mb3.add(10, 4, 1); + m1.insert_or_assign(mb3, 90); + m2.insert_or_assign(mb3, 90); // Update the value to validate == works correctly + + MultiBlkId mb4{30, 4, 2}; + mb4.add(80, 4, 2); + ASSERT_EQ(mb4.num_pieces(), 2); + m1.insert(std::pair(mb4, 93)); + m2.insert(std::pair(mb4, 93)); + + MultiBlkId mb5{30, 4, 2}; + mb5.add(10, 3, 2); + m1.insert(std::pair(mb5, 91)); + m2.insert(std::pair(mb5, 91)); + + // Validate get on both the maps + ASSERT_EQ(m1[mb1], 92); + ASSERT_EQ(m2[mb1], 92); + ASSERT_EQ(m1[mb3], 90); + ASSERT_EQ(m2[mb3], 90); + ASSERT_EQ(m1[mb4], 93); + ASSERT_EQ(m2[mb4], 93); + ASSERT_EQ(m1[mb5], 91); + ASSERT_EQ(m2[mb5], 91); + auto const it1 = m1.find(MultiBlkId{1, 1, 1}); + ASSERT_EQ(it1, m1.cend()); + auto const it2 = m2.find(MultiBlkId{100, 1, 2}); + ASSERT_EQ(it2, m2.cend()); + + // Validate sorting order of std::map + int prev_v{0}; + for (auto const [k, v] : m1) { + ASSERT_GT(v, prev_v); + prev_v = v; + } + ASSERT_EQ(m1.size(), 4u); +} + +int main(int argc, char* argv[]) { + int parsed_argc = argc; + ::testing::InitGoogleTest(&parsed_argc, argv); + 
SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_blkid); + sisl::logging::SetLogger("test_blkid"); + spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); + + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/src/tests/test_data_service.cpp b/src/tests/test_data_service.cpp index e9e91c8a7..baf1f3d48 100644 --- a/src/tests/test_data_service.cpp +++ b/src/tests/test_data_service.cpp @@ -71,15 +71,7 @@ typedef std::function< void(std::error_condition err, std::shared_ptr< std::vect class BlkDataServiceTest : public testing::Test { public: - BlkDataService& inst() { // return hs()->data_service(); - return homestore::data_service(); - } - - void print_bids(const std::vector< BlkId >& out_bids) { - for (auto i = 0ul; i < out_bids.size(); ++i) { - LOGINFO("bid[{}]: {}", i, out_bids[i].to_string()); - } - } + BlkDataService& inst() { return homestore::data_service(); } void free(sisl::sg_list& sg) { test_common::HSTestHelper::free(sg); } @@ -87,37 +79,35 @@ class BlkDataServiceTest : public testing::Test { void write_read_free_blk(uint64_t io_size) { auto sg_write_ptr = std::make_shared< sisl::sg_list >(); auto sg_read_ptr = std::make_shared< sisl::sg_list >(); - auto test_blkid_ptr = std::make_shared< BlkId >(); + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); - write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([this, sg_write_ptr, test_blkid_ptr](const std::vector< BlkId >& out_bids) { + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([this, sg_write_ptr, sg_read_ptr, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); LOGINFO("after_write_cb: Write completed;"); // sg_write buffer is no longer needed; free(*sg_write_ptr); - LOGINFO("Write blk ids: "); - print_bids(out_bids); + LOGINFO("Write blk ids: {}", test_blkid_ptr->to_string()); + HS_REL_ASSERT_GE(test_blkid_ptr->num_pieces(), 1); - HS_DBG_ASSERT_GE(out_bids.size(), 1); - *test_blkid_ptr = out_bids[0]; - }) - .thenValue([this, sg_read_ptr, test_blkid_ptr](auto) { struct iovec iov; - iov.iov_len = test_blkid_ptr->get_nblks() * inst().get_page_size(); + iov.iov_len = test_blkid_ptr->blk_count() * inst().get_blk_size(); iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; + sg_read_ptr->size = iov.iov_len; LOGINFO("Step 2: async read on blkid: {}", test_blkid_ptr->to_string()); - add_read_delay(); return inst().async_read(*test_blkid_ptr, *sg_read_ptr, sg_read_ptr->size); }) - .thenValue([this, sg_read_ptr, test_blkid_ptr](auto) { + .thenValue([this, sg_read_ptr, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Read error"); LOGINFO("read completed;"); free(*sg_read_ptr); return inst().async_free_blk(*test_blkid_ptr); }) - .thenValue([this, test_blkid_ptr](auto) { + .thenValue([this, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "free_blk error"); LOGINFO("completed async_free_blk: {}", test_blkid_ptr->to_string()); this->finish_and_notify(); }); @@ -127,39 +117,35 @@ class BlkDataServiceTest : public testing::Test { void write_free_blk_before_read_comp(const uint64_t io_size) { auto sg_write_ptr = std::make_shared< sisl::sg_list >(); auto sg_read_ptr = std::make_shared< sisl::sg_list >(); - auto test_blkid_ptr = std::make_shared< BlkId >(); - - write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([this, sg_write_ptr, test_blkid_ptr](const std::vector< BlkId >& out_bids) { - // write completed, now we trigger read on a blkid and in read completion routine, 
we do - // a free blk; + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([this, sg_write_ptr, sg_read_ptr, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); LOGINFO("after_write_cb: Write completed;"); free(*sg_write_ptr); // sg_write buffer is no longer needed; - LOGINFO("Write blk ids: "); - print_bids(out_bids); + LOGINFO("Write blk ids: {}", test_blkid_ptr->to_string()); + HS_REL_ASSERT_GE(test_blkid_ptr->num_pieces(), 1); - HS_DBG_ASSERT_GE(out_bids.size(), 1); - *test_blkid_ptr = out_bids[0]; - }) - .thenValue([this, sg_read_ptr, test_blkid_ptr](auto) mutable { struct iovec iov; - iov.iov_len = test_blkid_ptr->get_nblks() * inst().get_page_size(); + iov.iov_len = test_blkid_ptr->blk_count() * inst().get_blk_size(); iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; + sg_read_ptr->size = iov.iov_len; - LOGINFO("Step 2a: inject read delay on blkid: {}", test_blkid_ptr->to_string()); - LOGINFO("Step 2b: async read on blkid: {}", test_blkid_ptr->to_string()); + LOGINFO("Step 2a: inject read delay and read on blkid: {}", test_blkid_ptr->to_string()); + add_read_delay(); inst() .async_read(*test_blkid_ptr, *sg_read_ptr, sg_read_ptr->size) - .thenValue([sg_read_ptr, this](auto) { + .thenValue([sg_read_ptr, this](auto&& err) { + RELEASE_ASSERT(!err, "Read error"); + // if we are here, free_blk callback must have been called already, because data service layer // trigger the free_blk cb firstly then send read complete cb back to caller; m_read_blk_done = true; LOGINFO("read completed;"); - HS_DBG_ASSERT_EQ(m_free_blk_done.load(), true, + HS_REL_ASSERT_EQ(m_free_blk_done.load(), true, "free blk callback should not be called before read blk completes"); free(*sg_read_ptr); @@ -167,9 +153,10 @@ class BlkDataServiceTest : public testing::Test { }); LOGINFO("Step 3: started async_free_blk: {}", test_blkid_ptr->to_string()); - inst().async_free_blk(*test_blkid_ptr).thenValue([this](auto) { + inst().async_free_blk(*test_blkid_ptr).thenValue([this](auto&& err) { + RELEASE_ASSERT(!err, "free_blk error"); LOGINFO("completed async_free_blk"); - HS_DBG_ASSERT_EQ(m_free_blk_done.load(), false, "Duplicate free blk completion"); + HS_REL_ASSERT_EQ(m_free_blk_done.load(), false, "Duplicate free blk completion"); m_free_blk_done = true; }); }); @@ -177,55 +164,52 @@ class BlkDataServiceTest : public testing::Test { void write_io_free_blk(const uint64_t io_size) { std::shared_ptr< sisl::sg_list > sg_write_ptr = std::make_shared< sisl::sg_list >(); + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); - auto futs = write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, this](const std::vector< BlkId >& out_bids) { - LOGINFO("after_write_cb: Write completed;"); - free(*sg_write_ptr); - - std::vector< folly::Future< bool > > futs; - for (const auto& free_bid : out_bids) { - LOGINFO("Step 2: started async_free_blk: {}", free_bid.to_string()); - auto f = inst().async_free_blk(free_bid); - futs.emplace_back(std::move(f)); - } - return futs; - }); - - folly::collectAllUnsafe(futs).then([this](auto) { - LOGINFO("completed async_free_blks"); - this->finish_and_notify(); - }); + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([sg_write_ptr, this, test_blkid_ptr](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); + LOGINFO("after_write_cb: Write completed;"); + 
free(*sg_write_ptr); + + LOGINFO("Step 2: started async_free_blk: {}", test_blkid_ptr->to_string()); + inst().async_free_blk(*test_blkid_ptr).thenValue([this](auto&& err) { + RELEASE_ASSERT(!err, "Free error"); + LOGINFO("completed async_free_blks"); + this->finish_and_notify(); + }); + }); } void write_io_verify(const uint64_t io_size) { auto sg_write_ptr = std::make_shared< sisl::sg_list >(); auto sg_read_ptr = std::make_shared< sisl::sg_list >(); + auto test_blkid_ptr = std::make_shared< MultiBlkId >(); + + write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */, *test_blkid_ptr) + .thenValue([sg_write_ptr, sg_read_ptr, test_blkid_ptr, this](auto&& err) { + RELEASE_ASSERT(!err, "Write error"); - write_sgs(io_size, sg_write_ptr, 1 /* num_iovs */) - .thenValue([sg_write_ptr, sg_read_ptr, this](const std::vector< BlkId >& out_bids) mutable { // this will be called in write io completion cb; LOGINFO("after_write_cb: Write completed;"); // TODO: verify multiple read blks; - HS_DBG_ASSERT_EQ(out_bids.size(), 1); - - const auto num_iovs = out_bids.size(); + HS_DBG_ASSERT_EQ(test_blkid_ptr->num_pieces(), 1); - for (auto i = 0ul; i < num_iovs; ++i) { - struct iovec iov; - iov.iov_len = out_bids[i].get_nblks() * inst().get_page_size(); - iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); - sg_read_ptr->iovs.push_back(iov); - sg_read_ptr->size += iov.iov_len; - } + struct iovec iov; + iov.iov_len = test_blkid_ptr->blk_count() * inst().get_blk_size(); + iov.iov_base = iomanager.iobuf_alloc(512, iov.iov_len); + sg_read_ptr->iovs.push_back(iov); + sg_read_ptr->size = iov.iov_len; - LOGINFO("Step 2: async read on blkid: {}", out_bids[0].to_string()); - return inst().async_read(out_bids[0], *sg_read_ptr, sg_read_ptr->size); + LOGINFO("Step 2: async read on blkid: {}", test_blkid_ptr->to_string()); + return inst().async_read(*test_blkid_ptr, *sg_read_ptr, sg_read_ptr->size); }) - .thenValue([this, sg_write_ptr, sg_read_ptr](auto) mutable { + .thenValue([this, sg_write_ptr, sg_read_ptr](auto&& err) mutable { + RELEASE_ASSERT(!err, "Read error"); + const auto equal = test_common::HSTestHelper::compare(*sg_read_ptr, *sg_write_ptr); - assert(equal); + RELEASE_ASSERT(equal, "Read after write data mismatch"); LOGINFO("Read completed;"); free(*sg_write_ptr); @@ -240,7 +224,8 @@ class BlkDataServiceTest : public testing::Test { // void write_io(uint64_t io_size, uint32_t num_iovs = 1) { auto sg = std::make_shared< sisl::sg_list >(); - write_sgs(io_size, sg, num_iovs).thenValue([this, sg](auto) { + MultiBlkId blkid; + write_sgs(io_size, sg, num_iovs, blkid).thenValue([this, sg](auto) { free(*sg); finish_and_notify(); }); @@ -268,7 +253,8 @@ class BlkDataServiceTest : public testing::Test { // caller should be responsible to call free(sg) to free the iobuf allocated in iovs, // normally it should be freed in after_write_cb; // - folly::Future< std::vector< BlkId > > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs) { + folly::Future< std::error_code > write_sgs(uint64_t io_size, cshared< sisl::sg_list >& sg, uint32_t num_iovs, + MultiBlkId& out_bids) { // TODO: What if iov_len is not multiple of 4Ki? 
HS_DBG_ASSERT_EQ(io_size % (4 * Ki * num_iovs), 0, "Expecting iov_len : {} to be multiple of {}.", io_size / num_iovs, 4 * Ki); @@ -282,16 +268,7 @@ class BlkDataServiceTest : public testing::Test { sg->size += iov_len; } - auto out_bids_ptr = std::make_shared< std::vector< BlkId > >(); - return inst() - .async_alloc_write(*(sg.get()), blk_alloc_hints{}, *out_bids_ptr, false /* part_of_batch*/) - .thenValue([sg, this, out_bids_ptr](bool success) { - assert(success); - for (const auto& bid : *out_bids_ptr) { - LOGINFO("bid: {}", bid.to_string()); - } - return folly::makeFuture< std::vector< BlkId > >(std::move(*out_bids_ptr)); - }); + return inst().async_alloc_write(*(sg.get()), blk_alloc_hints{}, out_bids, false /* part_of_batch*/); } void add_read_delay() {