diff --git a/conanfile.py b/conanfile.py index 85adaae85..edaa61d41 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,8 +9,7 @@ class HomestoreConan(ConanFile): name = "homestore" - version = "6.4.40" - + version = "6.4.41" homepage = "https://github.com/eBay/Homestore" description = "HomeStore Storage Engine" topics = ("ebay", "nublox") diff --git a/src/include/homestore/logstore/log_store.hpp b/src/include/homestore/logstore/log_store.hpp index eec7eac50..8a570757c 100644 --- a/src/include/homestore/logstore/log_store.hpp +++ b/src/include/homestore/logstore/log_store.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -81,18 +82,6 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { */ log_replay_done_cb_t get_log_replay_done_cb() const { return m_replay_done_cb; } - /** - * @brief Write the blob at the user specified seq number in sync manner. Under the covers it will call async - * write and then wait for its completion. As such this is much lesser performing than async version since it - * involves mutex/cv combination - * - * @param seq_num : Sequence number to insert data - * @param b : Data blob to write to log - * - * @return is write completed successfully. - */ - bool write_sync(logstore_seq_num_t seq_num, const sisl::io_blob& b); - /** * @brief Write the blob at the user specified seq number - prepared as a request in async fashion. * @@ -110,17 +99,7 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * @param cookie : Any cookie or context which will passed back in the callback * @param cb Callback upon completion which is called with the status, seq_num and cookie that was passed. */ - void write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& cb, - bool flush_wait = false); - - /** - * @brief This method appends the blob into the log and it returns the generated seq number - * - * @param b Blob of data to append - * @return logstore_seq_num_t Returns the seqnum generated by the log - */ - // This method is not implemented yet - logstore_seq_num_t append_sync(const sisl::io_blob& b); + void write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& cb); /** * @brief This method appends the blob into the log and makes a callback at the end of the append. @@ -132,6 +111,14 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { */ logstore_seq_num_t append_async(const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& completion_cb); + /** + * @brief Write the blob at the user specified seq number and flush, just like write_sync + * + * @param seq_num: Seq number to write to + * @param b : Blob of data + */ + void write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b); + /** * @brief Read the log provided the sequence number synchronously. This is not the most efficient way to read * as reader will be blocked until read is completed. In addition, it is built on-top of async system by doing @@ -144,25 +131,6 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { */ log_buffer read_sync(logstore_seq_num_t seq_num); - /** - * @brief Read the log based on the logstore_req prepared. In case callback is supplied, it uses the callback - * to provide the data it has read. If not overridden, use default callback registered during initialization. 
- * - * @param req Request containing seq_num - * @param cb [OPTIONAL] Callback to get the data back, if it needs to be different from the default registered - * one. - */ - void read_async(logstore_req* req, const log_found_cb_t& cb = nullptr); - - /** - * @brief Read the log for the seq_num and make the callback with the data - * - * @param seq_num Seqnumber to read the log from - * @param cookie Any cookie or context which will passed back in the callback - * @param cb Callback which contains seq_num, cookie and - */ - void read_async(logstore_seq_num_t seq_num, void* cookie, const log_found_cb_t& cb); - /** * @brief Truncate the logs for this log store upto the seq_num provided (inclusive). Once truncated, the reads * on seq_num <= upto_seq_num will return an error. The truncation in general is a 2 step process, where first @@ -175,7 +143,6 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * expensive and grouping them together yields better results. * * Note: this flag currently is not used, meaning all truncate is in memory only; - * @return number of records to truncate */ void truncate(logstore_seq_num_t upto_seq_num, bool in_memory_truncate_only = true); @@ -188,30 +155,17 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { */ void fill_gap(logstore_seq_num_t seq_num); - /** - * @brief Get the safe truncation log dev key from this log store perspective. Please note that the safe idx is - * not globally safe, but it is safe from this log store perspective only. To get global safe id, one should - * access all log stores and get the minimum of them before truncating. - * - * It could return invalid logdev_key which indicates that this log store does not have any valid logdev key - * to truncate. This could happen when there were no ios on this logstore since last truncation or at least no - * ios are flushed yet. The caller should simply ignore this return value. - * - * @return truncation_entry_t: Which contains the logdev key and its corresponding seq_num to truncate and also - * is that entry represents the entire log store. - */ - // truncation_entry_t get_safe_truncation_boundary() const; - /** * @brief Get the last truncated seqnum upto which we have truncated. If called after recovery, it returns the * first seq_num it has seen-1. * - * @return logstore_seq_num_t + * @return the last truncated seqnum upto which we have truncated */ - logstore_seq_num_t truncated_upto() const { - const auto ts{m_safe_truncation_boundary.seq_num.load(std::memory_order_acquire)}; - return (ts == std::numeric_limits< logstore_seq_num_t >::max()) ? -1 : ts; - } + logstore_seq_num_t truncated_upto() const { return m_start_lsn.load(std::memory_order_acquire) - 1; } + + logdev_key get_trunc_ld_key() const { return m_trunc_ld_key; } + + std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > truncate_info() const; sisl::StreamTracker< logstore_record >& log_records() { return m_records; } @@ -256,60 +210,31 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { * * @param seq_num Sequence number upto which logs are to be flushed. If not provided, will wait to flush all seq * numbers issued prior. 
- * @return True on success */ - void flush_sync(logstore_seq_num_t upto_seq_num = invalid_lsn()); + void flush(logstore_seq_num_t upto_seq_num = invalid_lsn()); /** * @brief Rollback the given instance to the given sequence number * - * @param seq_num Sequence number back which logs are to be rollbacked + * @param to_lsn Sequence number back which logs are to be rollbacked * @return True on success */ - uint64_t rollback_async(logstore_seq_num_t to_lsn, on_rollback_cb_t cb); + bool rollback(logstore_seq_num_t to_lsn); - auto seq_num() const { return m_seq_num.load(std::memory_order_acquire); } - - std::shared_ptr< LogDev > get_logdev() { return m_logdev; } + auto start_lsn() const { return m_start_lsn.load(std::memory_order_acquire); } nlohmann::json dump_log_store(const log_dump_req& dump_req = log_dump_req()); nlohmann::json get_status(int verbosity) const; - /** - * Retrieves the truncation information before device truncation. - * - * @return A constant reference to the truncation_info object representing the truncation information. - */ - const truncation_info& pre_device_truncation(); - - /** - * \brief post device truncation processing. - * - * This function is used to update safe truncation boundary to the specified `trunc_upto_key`. - * - * \param trunc_upto_key The key indicating the log entry up to which truncation has been performed. - */ - void post_device_truncation(const logdev_key& trunc_upto_key); - /** * Handles the completion of a write operation in the log store. * * @param req The logstore_req object representing the completed write operation. * @param ld_key The logdev_key associated with the completed write operation. + * @param flush_ld_key when we truncate to req, which position we should truncate in the logdev. */ - void on_write_completion(logstore_req* req, const logdev_key& ld_key); - - /** - * \brief Handles the completion of a read operation in the log store. - * - * This function is called when a read operation in the log store has completed. - * It takes a pointer to a logstore_req object and a logdev_key object as parameters. - * - * \param req The pointer to the logstore_req object representing the read request. - * \param ld_key The logdev_key object representing the key used for the read operation. - */ - void on_read_completion(logstore_req* req, const logdev_key& ld_key); + void on_write_completion(logstore_req* req, const logdev_key& ld_key, const logdev_key& flush_ld_key); /** * @brief Handles the event when a log is found. @@ -327,27 +252,10 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { */ void on_log_found(logstore_seq_num_t seq_num, const logdev_key& ld_key, const logdev_key& flush_ld_key, log_buffer buf); - /** - * @brief Handles the completion of a batch flush operation to update internal state. - * - * This function is called when a batch flush operation is completed. - * It takes a `logdev_key` parameter that represents the key of the flushed batch. - * - * This function is also called during log store recovery; - * - * @param flush_batch_ld_key The key of the flushed batch. - */ - void on_batch_completion(const logdev_key& flush_batch_ld_key); -private: - /** - * Truncates the log store up to the specified sequence number. - * - * @param upto_seq_num The sequence number up to which the log store should be truncated. 
- */ - void do_truncate(logstore_seq_num_t upto_seq_num); + std::shared_ptr< LogDev > get_logdev() { return m_logdev; } - int search_max_le(logstore_seq_num_t input_sn); + auto get_comp_cb() const { return m_comp_cb; } private: logstore_id_t m_store_id; @@ -357,21 +265,15 @@ class HomeLogStore : public std::enable_shared_from_this< HomeLogStore > { log_req_comp_cb_t m_comp_cb; log_found_cb_t m_found_cb; log_replay_done_cb_t m_replay_done_cb; - std::atomic< logstore_seq_num_t > m_seq_num; + // the first seq_num that is in the log store + std::atomic< logstore_seq_num_t > m_start_lsn; + // the next seq_num that will be put into the log store + std::atomic< logstore_seq_num_t > m_next_lsn; + // the last seq_num that is in the log store + std::atomic< logstore_seq_num_t > m_tail_lsn; std::string m_fq_name; LogStoreServiceMetrics& m_metrics; - // seq_ld_key_pair m_flush_batch_max = {-1, {0, 0}}; // The maximum seqnum we have seen in the prev flushed - // batch - logstore_seq_num_t m_flush_batch_max_lsn{std::numeric_limits< logstore_seq_num_t >::min()}; - - // Sync flush sections - std::atomic< logstore_seq_num_t > m_sync_flush_waiter_lsn{invalid_lsn()}; - std::mutex m_sync_flush_mtx; - std::mutex m_single_sync_flush_mtx; - std::condition_variable m_sync_flush_cv; - - std::vector< seq_ld_key_pair > m_truncation_barriers; // List of truncation barriers - truncation_info m_safe_truncation_boundary; + logdev_key m_trunc_ld_key{0, 0}; }; } // namespace homestore diff --git a/src/include/homestore/logstore/log_store_internal.hpp b/src/include/homestore/logstore/log_store_internal.hpp index 125e35daa..551f15ea8 100644 --- a/src/include/homestore/logstore/log_store_internal.hpp +++ b/src/include/homestore/logstore/log_store_internal.hpp @@ -32,7 +32,6 @@ #include namespace homestore { - ///////////////////// All typedefs /////////////////////////////// class logstore_req; class HomeLogStore; @@ -50,7 +49,6 @@ typedef std::function< void(logstore_seq_num_t, sisl::io_blob&, logdev_key, void typedef std::function< void(logstore_seq_num_t, log_buffer, void*) > log_found_cb_t; typedef std::function< void(std::shared_ptr< HomeLogStore >) > log_store_opened_cb_t; typedef std::function< void(std::shared_ptr< HomeLogStore >, logstore_seq_num_t) > log_replay_done_cb_t; -typedef std::function< void(const std::unordered_map< logdev_id_t, logdev_key >&) > device_truncate_cb_t; typedef int64_t logid_t; @@ -58,8 +56,7 @@ struct logdev_key { logid_t idx; off_t dev_offset; - constexpr logdev_key(const logid_t idx = std::numeric_limits< logid_t >::min(), - const off_t dev_offset = std::numeric_limits< uint64_t >::min()) : + constexpr logdev_key(const logid_t idx = -1, const off_t dev_offset = std::numeric_limits< uint64_t >::min()) : idx{idx}, dev_offset{dev_offset} {} logdev_key(const logdev_key&) = default; logdev_key& operator=(const logdev_key&) = default; @@ -72,11 +69,11 @@ struct logdev_key { operator bool() const { return is_valid(); } bool is_valid() const { return !is_lowest() && !is_highest(); } - bool is_lowest() const { return (idx == std::numeric_limits< logid_t >::min()); } + bool is_lowest() const { return (idx == -1); } bool is_highest() const { return (idx == std::numeric_limits< logid_t >::max()); } void set_lowest() { - idx = std::numeric_limits< logid_t >::min(); + idx = -1; dev_offset = std::numeric_limits< uint64_t >::min(); } @@ -109,9 +106,11 @@ struct log_dump_req { struct logstore_record { logdev_key m_dev_key; + // indicates the safe truncation point of the log store + logdev_key m_trunc_key; 
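For reviewers mapping old call sites onto the reshaped HomeLogStore API above (write_sync, append_sync, read_async and flush_sync are gone; write_and_flush, flush, truncate_info and the trunc_key plumbing are added), a minimal caller-side sketch may help. The seq numbers, payload and callback body are illustrative only, and the callback signature assumes the log_write_comp_cb_t typedef from log_store_internal.hpp; none of this code is part of the patch.

// Illustrative usage of the new write/flush surface, not part of this diff.
void example_writer(std::shared_ptr< homestore::HomeLogStore > lstore, sisl::io_blob blob) {
    // Replaces write_sync(lsn, blob): write at a caller-chosen lsn and flush in one call.
    lstore->write_and_flush(100 /* seq_num, illustrative */, blob);

    // Replaces write_async(..., flush_wait = true): append, then flush explicitly.
    auto lsn = lstore->append_async(
        blob, nullptr /* cookie */,
        [](homestore::logstore_seq_num_t, sisl::io_blob&, homestore::logdev_key, void*) {
            // completion; ordering across lsns is not guaranteed by the new flush path
        });
    lstore->flush(lsn); // replaces flush_sync()

    // Truncation bookkeeping is now derived from m_start_lsn / m_trunc_ld_key.
    lstore->truncate(lsn);
    auto last_truncated = lstore->truncated_upto();
    (void)last_truncated;
}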
logstore_record() = default; - logstore_record(const logdev_key& key) : m_dev_key{key} {} + logstore_record(const logdev_key& key, const logdev_key& trunc_key) : m_dev_key{key}, m_trunc_key{trunc_key} {} }; class HomeLogStore; @@ -121,7 +120,6 @@ struct logstore_req { logstore_seq_num_t seq_num; // Log store specific seq_num (which could be monotonically increaseing with logstore) sisl::io_blob data; // Data blob containing data void* cookie; // User generated cookie (considered as opaque) - bool is_write; // Directon of IO bool is_internal_req; // If the req is created internally by HomeLogStore itself log_req_comp_cb_t cb; // Callback upon completion of write (overridden than default) Clock::time_point start_time; @@ -138,13 +136,11 @@ struct logstore_req { // TODO: Implement this method return 0; } - static logstore_req* make(HomeLogStore* store, logstore_seq_num_t seq_num, const sisl::io_blob& data, - bool is_write_req = true) { + static logstore_req* make(HomeLogStore* store, logstore_seq_num_t seq_num, const sisl::io_blob& data) { logstore_req* req = new logstore_req(); req->log_store = store; req->seq_num = seq_num; req->data = data; - req->is_write = is_write_req; req->is_internal_req = true; req->cb = nullptr; @@ -158,25 +154,6 @@ struct logstore_req { logstore_req() = default; }; -struct seq_ld_key_pair { - logstore_seq_num_t seq_num{-1}; - logdev_key ld_key; -}; - -struct truncation_info { - // Safe log dev location upto which it is truncatable - logdev_key ld_key{std::numeric_limits< logid_t >::min(), 0}; - - // LSN of this log store upto which it is truncated - std::atomic< logstore_seq_num_t > seq_num{-1}; - - // Is there any entry which is already store truncated but waiting for device truncation - bool pending_dev_truncation{false}; - - // Any truncation entries/barriers which are not part of this truncation - bool active_writes_not_part_of_truncation{false}; -}; - #pragma pack(1) struct logstore_superblk { logstore_superblk(const logstore_seq_num_t seq_num = 0) : m_first_seq_num{seq_num} {} @@ -194,4 +171,4 @@ struct logstore_superblk { logstore_seq_num_t m_first_seq_num{0}; }; #pragma pack() -} // namespace homestore +} // namespace homestore \ No newline at end of file diff --git a/src/include/homestore/logstore_service.hpp b/src/include/homestore/logstore_service.hpp index bd5347dd6..44ba1ab53 100644 --- a/src/include/homestore/logstore_service.hpp +++ b/src/include/homestore/logstore_service.hpp @@ -155,13 +155,8 @@ class LogStoreService { /** * @brief Schedule a truncate all the log stores physically on the device. - * - * @param cb [OPTIONAL] Callback once truncation is completed, if provided (Default no callback) - * @param wait_till_done [OPTIONAL] Wait for the truncation to complete before returning from this method. - * Default to false - * @param dry_run: If the truncate is a real one or just dry run to simulate the truncation */ - void device_truncate(const device_truncate_cb_t& cb = nullptr, bool wait_till_done = false, bool dry_run = false); + void device_truncate(); folly::Future< std::error_code > create_vdev(uint64_t size, HSDevType devType, uint32_t chunk_size); std::shared_ptr< VirtualDev > open_vdev(const vdev_info& vinfo, bool load_existing); @@ -178,13 +173,6 @@ class LogStoreService { uint32_t total_size() const; iomgr::io_fiber_t flush_thread() { return m_flush_fiber; } - /** - * This is used when the actual LogDev truncate is triggered; - * - * @return The IO fiber associated with the truncate thread. 
- */ - iomgr::io_fiber_t truncate_thread() { return m_truncate_fiber; } - void delete_unopened_logdevs(); private: @@ -194,14 +182,13 @@ class LogStoreService { void logdev_super_blk_found(const sisl::byte_view& buf, void* meta_cookie); void rollback_super_blk_found(const sisl::byte_view& buf, void* meta_cookie); void start_threads(); - void flush_if_needed(); + void flush(); private: std::unordered_map< logdev_id_t, std::shared_ptr< LogDev > > m_id_logdev_map; folly::SharedMutexWritePriority m_logdev_map_mtx; std::shared_ptr< JournalVirtualDev > m_logdev_vdev; - iomgr::io_fiber_t m_truncate_fiber; iomgr::io_fiber_t m_flush_fiber; LogStoreServiceMetrics m_metrics; std::unordered_set< logdev_id_t > m_unopened_logdev; diff --git a/src/lib/common/homestore_config.fbs b/src/lib/common/homestore_config.fbs index c283bc9ba..b17927d26 100644 --- a/src/lib/common/homestore_config.fbs +++ b/src/lib/common/homestore_config.fbs @@ -112,6 +112,11 @@ table LogStore { // Logdev will flush the logs only in a dedicated thread. Turn this on, if flush IO doesn't want to // intervene with data IO path. flush_only_in_dedicated_thread: bool = true; + + //we support 3 flush mode , 1(inline), 2 (timer) and 4(explicitly), mixed flush mode is also supportted + //for example, if we want inline and explicitly, we just set the flush mode to 1+4 = 5 + //for nuobject case, we only support explicitly mode + flush_mode: uint32 = 4; } table Generic { diff --git a/src/lib/device/journal_vdev.cpp b/src/lib/device/journal_vdev.cpp index d526103fc..bcddae40a 100644 --- a/src/lib/device/journal_vdev.cpp +++ b/src/lib/device/journal_vdev.cpp @@ -46,15 +46,15 @@ JournalVirtualDev::JournalVirtualDev(DeviceManager& dmgr, const vdev_info& vinfo m_init_private_data = std::make_shared< JournalChunkPrivate >(); m_chunk_pool = std::make_unique< ChunkPool >( dmgr, - ChunkPool::Params{HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity), - [this]() { - m_init_private_data->created_at = get_time_since_epoch_ms(); - m_init_private_data->end_of_chunk = m_vdev_info.chunk_size; - sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), - sizeof(JournalChunkPrivate)}; - return private_blob; - }, - m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size}); + ChunkPool::Params{ + HS_DYNAMIC_CONFIG(generic.journal_chunk_pool_capacity), + [this]() { + m_init_private_data->created_at = get_time_since_epoch_ms(); + m_init_private_data->end_of_chunk = m_vdev_info.chunk_size; + sisl::blob private_blob{r_cast< uint8_t* >(m_init_private_data.get()), sizeof(JournalChunkPrivate)}; + return private_blob; + }, + m_vdev_info.hs_dev_type, m_vdev_info.vdev_id, m_vdev_info.chunk_size}); resource_mgr().register_journal_vdev_exceed_cb([this]([[maybe_unused]] int64_t dirty_buf_count, bool critical) { // either it is critical or non-critical, call cp_flush; @@ -370,16 +370,16 @@ folly::Future< std::error_code > JournalVirtualDev::Descriptor::async_pwritev(co return m_vdev.async_writev(iov, iovcnt, chunk, offset_in_chunk); } -void JournalVirtualDev::Descriptor::sync_pwrite(const uint8_t* buf, size_t size, off_t offset) { +std::error_code JournalVirtualDev::Descriptor::sync_pwrite(const uint8_t* buf, size_t size, off_t offset) { HS_REL_ASSERT_LE(size, m_reserved_sz, "Write size: larger then reserved size is not allowed!"); m_reserved_sz -= size; // update reserved size auto const [chunk, index, offset_in_chunk] = process_pwrite_offset(size, offset); - m_vdev.sync_write(r_cast< const char* >(buf), size, chunk, offset_in_chunk); + return 
m_vdev.sync_write(r_cast< const char* >(buf), size, chunk, offset_in_chunk); } -void JournalVirtualDev::Descriptor::sync_pwritev(const iovec* iov, int iovcnt, off_t offset) { +std::error_code JournalVirtualDev::Descriptor::sync_pwritev(const iovec* iov, int iovcnt, off_t offset) { auto const size = VirtualDev::get_len(iov, iovcnt); // if size is smaller than reserved size, it means write will never be overlapping start offset; @@ -388,7 +388,7 @@ void JournalVirtualDev::Descriptor::sync_pwritev(const iovec* iov, int iovcnt, o m_reserved_sz -= size; auto const [chunk, _, offset_in_chunk] = process_pwrite_offset(size, offset); - m_vdev.sync_writev(iov, iovcnt, chunk, offset_in_chunk); + return m_vdev.sync_writev(iov, iovcnt, chunk, offset_in_chunk); } /////////////////////////////// Read Section ////////////////////////////////// diff --git a/src/lib/device/journal_vdev.hpp b/src/lib/device/journal_vdev.hpp index c9bb1dc23..460db4012 100644 --- a/src/lib/device/journal_vdev.hpp +++ b/src/lib/device/journal_vdev.hpp @@ -144,10 +144,19 @@ class JournalVirtualDev : public VirtualDev { /// @param buf : buffer pointing to the data being written /// @param size : size of buffer to be written /// @param offset : offset to be written - /// @return : On success, the number of bytes written is returned, or -1 on error. - void sync_pwrite(const uint8_t* buf, size_t size, off_t offset); + /// @return : error_code + std::error_code sync_pwrite(const uint8_t* buf, size_t size, off_t offset); - void sync_pwritev(const iovec* iov, int iovcnt, off_t offset); + /// @brief writes up to count bytes from the buffer starting at buf at offset offset. The cursor is not + /// changed. pwrite always use offset returned from alloc_next_append_blk to do the write;pwrite should not + /// across chunk boundaries because alloc_next_append_blk guarantees offset returned always doesn't across chunk + /// boundary; + /// + /// @param iov : the iovec that holds vector of data buffers + /// @param offset : offset to be written + /// @param offset : offset to be written + /// @return : error_code + std::error_code sync_pwritev(const iovec* iov, int iovcnt, off_t offset); /** * @brief : read up to count bytes into the buffer starting at buf. diff --git a/src/lib/logstore/log_dev.cpp b/src/lib/logstore/log_dev.cpp index f35429e3c..0e22c9825 100644 --- a/src/lib/logstore/log_dev.cpp +++ b/src/lib/logstore/log_dev.cpp @@ -43,16 +43,18 @@ SISL_LOGGING_DECL(logstore) static bool has_data_service() { return HomeStore::instance()->has_data_service(); } // static BlkDataService& data_service() { return HomeStore::instance()->data_service(); } -LogDev::LogDev(const logdev_id_t id, JournalVirtualDev* vdev) : m_logdev_id{id}, m_vdev(vdev) { +LogDev::LogDev(logdev_id_t id, flush_mode_t flush_mode) : m_logdev_id{id}, m_flush_mode{flush_mode} { m_flush_size_multiple = HS_DYNAMIC_CONFIG(logstore->flush_size_multiple_logdev); - // Each logdev has one journal descriptor. - m_vdev_jd = m_vdev->open(m_logdev_id); - RELEASE_ASSERT(m_vdev_jd, "Journal descriptor is null"); } LogDev::~LogDev() = default; -void LogDev::start(bool format) { +void LogDev::start(bool format, std::shared_ptr< JournalVirtualDev > vdev) { + // Each logdev has one journal descriptor. 
+ m_vdev = vdev; + m_vdev_jd = m_vdev->open(m_logdev_id); + RELEASE_ASSERT(m_vdev_jd, "Journal descriptor is null"); + if (m_flush_size_multiple == 0) { m_flush_size_multiple = m_vdev->optimal_page_size(); } THIS_LOGDEV_LOG(INFO, "Initializing logdev with flush size multiple={}", m_flush_size_multiple); @@ -86,7 +88,7 @@ void LogDev::start(bool format) { m_last_flush_idx = m_log_idx - 1; } - start_timer(); + if (allow_timer_flush()) start_timer(); handle_unopened_log_stores(format); { @@ -96,7 +98,7 @@ void LogDev::start(bool format) { for (auto& p : m_id_logstore_map) { auto& lstore{p.second.log_store}; if (lstore && lstore->get_log_replay_done_cb()) { - lstore->get_log_replay_done_cb()(lstore, lstore->seq_num() - 1); + lstore->get_log_replay_done_cb()(lstore, lstore->start_lsn() - 1); lstore->truncate(lstore->truncated_upto()); } } @@ -105,26 +107,17 @@ void LogDev::start(bool format) { } void LogDev::stop() { - THIS_LOGDEV_LOG(INFO, "Logdev stopping log_dev={}", m_logdev_id); - HS_LOG_ASSERT((m_pending_flush_size == 0), "LogDev stop attempted while writes to logdev are pending completion"); - const bool locked_now = run_under_flush_lock([this]() { - { - std::unique_lock< std::mutex > lk{m_block_flush_q_mutex}; - m_stopped = true; - } - m_block_flush_q_cv.notify_one(); - return true; - }); - - if (!locked_now) { THIS_LOGDEV_LOG(INFO, "LogDev stop is queued because of pending flush or truncation ongoing"); } - + THIS_LOGDEV_LOG(INFO, "Logdev stopping id {}", m_logdev_id); + HS_LOG_ASSERT((m_pending_flush_size.load() == 0), + "LogDev stop attempted while writes to logdev are pending completion"); { - // Wait for the stopped to be completed - std::unique_lock< std::mutex > lk{m_block_flush_q_mutex}; - m_block_flush_q_cv.wait(lk, [&] { return m_stopped; }); + std::unique_lock lg = flush_guard(); + m_stopped = true; } + // after we call stop, we need to do any pending device truncations + truncate(); - stop_timer(); + if (allow_timer_flush()) stop_timer(); { folly::SharedMutexWritePriority::WriteHolder holder(m_store_map_mtx); @@ -135,13 +128,10 @@ void LogDev::stop() { m_logdev_meta.reset(); m_log_idx.store(0); m_pending_flush_size.store(0); - m_is_flushing.store(false); m_last_flush_idx = -1; m_last_truncate_idx = -1; m_last_crc = INVALID_CRC32_VALUE; - if (m_block_flush_q != nullptr) { - sisl::VectorPool< flush_blocked_callback >::free(m_block_flush_q, false /* no_cache */); - } + for (size_t i{0}; i < max_log_group; ++i) { m_log_group_pool[i].stop(); } @@ -150,6 +140,11 @@ void LogDev::stop() { m_hs.reset(); } +bool LogDev::is_stopped() { + std::unique_lock lg = flush_guard(); + return m_stopped; +} + void LogDev::destroy() { THIS_LOGDEV_LOG(INFO, "Logdev destroy metablks log_dev={}", m_logdev_id); m_logdev_meta.destroy(); @@ -157,13 +152,12 @@ void LogDev::destroy() { void LogDev::start_timer() { // Currently only tests set it to 0. 
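The flush_mode knob added to homestore_config.fbs earlier in this diff is a bit mask: 1 enables inline flush, 2 the timer flush set up here, 4 explicit flush, and modes combine by addition (1 + 4 = 5 means inline plus explicit). The allow_timer_flush()/allow_inline_flush() gates used in this file presumably test those bits; the enumerator and helper names below are illustrative assumptions, only the numeric values come from the config comment.

// Sketch of the assumed bit-mask reading of logstore.flush_mode. The real flush_mode_t
// taken by the LogDev constructor may be defined differently.
enum class example_flush_mode : uint32_t {
    INLINE = 1,   // flush from the append path when thresholds are crossed
    TIMER = 2,    // flush from the recurring flush timer
    EXPLICIT = 4, // flush only when flush()/write_and_flush() is called
};

inline bool has_flush_mode(uint32_t cfg, example_flush_mode m) {
    return (cfg & static_cast< uint32_t >(m)) != 0;
}

// cfg = 5 enables INLINE and EXPLICIT but not TIMER; the nuobject case called out in the
// config comment runs with cfg = 4, i.e. EXPLICIT only.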
- if (HS_DYNAMIC_CONFIG(logstore.flush_timer_frequency_us) == 0) { return; } - - iomanager.run_on_wait(logstore_service().flush_thread(), [this]() { - m_flush_timer_hdl = iomanager.schedule_thread_timer(HS_DYNAMIC_CONFIG(logstore.flush_timer_frequency_us) * 1000, - true /* recurring */, nullptr /* cookie */, - [this](void*) { flush_if_needed(); }); - }); + if (HS_DYNAMIC_CONFIG(logstore.flush_timer_frequency_us)) + iomanager.run_on_wait(logstore_service().flush_thread(), [this]() { + m_flush_timer_hdl = iomanager.schedule_thread_timer( + HS_DYNAMIC_CONFIG(logstore.flush_timer_frequency_us) * 1000, true /* recurring */, nullptr /* cookie */, + [this](void*) { flush_if_necessary(); }); + }); } void LogDev::stop_timer() { @@ -174,12 +168,12 @@ void LogDev::stop_timer() { } } -void LogDev::do_load(const off_t device_cursor) { +void LogDev::do_load(off_t device_cursor) { log_stream_reader lstream{device_cursor, m_vdev, m_vdev_jd, m_flush_size_multiple}; logid_t loaded_from{-1}; off_t group_dev_offset = 0; - THIS_LOGDEV_LOG(TRACE, "LogDev::do_load start log_dev={} ", m_logdev_id); + THIS_LOGDEV_LOG(TRACE, "LogDev::do_load start log_dev={} offset = {} ", m_logdev_id, device_cursor); do { const auto buf = lstream.next_group(&group_dev_offset); @@ -192,7 +186,8 @@ void LogDev::do_load(const off_t device_cursor) { if (loaded_from == -1 && header->start_idx() < m_log_idx) { // log dev is truncated completely assert_next_pages(lstream); - THIS_LOGDEV_LOG(INFO, "LogDev loaded log_idx in range of [{} - {}]", loaded_from, m_log_idx - 1); + THIS_LOGDEV_LOG(INFO, "LogDev {} loaded log_idx in range of [{} - {}], header {}", m_logdev_id, loaded_from, + m_log_idx - 1, *header); break; } @@ -204,11 +199,10 @@ void LogDev::do_load(const off_t device_cursor) { decltype(header->nrecords()) i{0}; HS_REL_ASSERT_GT(header->nrecords(), 0, "nrecords greater then zero"); const auto flush_ld_key = - logdev_key{header->start_idx() + header->nrecords() - 1, group_dev_offset + header->total_size()}; + logdev_key{header->start_idx() + header->nrecords(), group_dev_offset + header->total_size()}; while (i < header->nrecords()) { const auto* rec = header->nth_record(i); const uint32_t data_offset = (rec->offset + (rec->get_inlined() ? 
0 : header->oob_data_offset)); - // Do a callback on the found log entry sisl::byte_view b = buf; b.move_forward(data_offset); @@ -225,11 +219,10 @@ void LogDev::do_load(const off_t device_cursor) { on_logfound(rec->store_id, rec->store_seq_num, {header->start_idx() + i, group_dev_offset}, flush_ld_key, b, (header->nrecords() - (i + 1))); } - ++i; } - m_log_idx = header->start_idx() + i; + m_log_idx.store(header->start_idx() + i, std::memory_order_release); m_last_crc = header->cur_grp_crc; } while (true); @@ -255,54 +248,25 @@ void LogDev::assert_next_pages(log_stream_reader& lstream) { } } -int64_t LogDev::append_async(const logstore_id_t store_id, const logstore_seq_num_t seq_num, const sisl::io_blob& data, - void* cb_context, bool flush_wait) { - auto prev_size = m_pending_flush_size.fetch_add(data.size(), std::memory_order_relaxed); +int64_t LogDev::append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, const sisl::io_blob& data, + void* cb_context) { const auto idx = m_log_idx.fetch_add(1, std::memory_order_acq_rel); - auto threshold_size = LogDev::flush_data_threshold_size(); + m_pending_flush_size.fetch_add(data.size(), std::memory_order_relaxed); m_log_records->create(idx, store_id, seq_num, data, cb_context); - - if (HS_DYNAMIC_CONFIG(logstore.flush_threshold_size) == 0) { - // This is set in tests to disable implicit flush. This will be removed in future. - return idx; - } - - if (flush_wait || - ((prev_size < threshold_size && ((prev_size + data.size()) >= threshold_size) && - !m_is_flushing.load(std::memory_order_relaxed)))) { - flush_if_needed(flush_wait ? 1 : -1); - } + if (allow_inline_flush()) flush_if_necessary(); return idx; } -log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_record_header) { +log_buffer LogDev::read(const logdev_key& key) { auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); if (ec) { - LOGERROR("Failed to read from ournal vdev log_dev={} {} {}", m_logdev_id, ec.value(), ec.message()); + LOGERROR("Failed to read from Journal vdev log_dev={} {} {}", m_logdev_id, ec.value(), ec.message()); return {}; } auto* header = r_cast< const log_group_header* >(buf->cbytes()); - // THIS_LOGDEV_LOG(TRACE, "Logdev read log group header {}", *header); - HS_REL_ASSERT_EQ(header->magic_word(), LOG_GROUP_HDR_MAGIC, "Log header corrupted with magic mismatch! {} {}", - m_logdev_id, *header); - HS_REL_ASSERT_EQ(header->get_version(), log_group_header::header_version, "Log header version mismatch! {} {}", - m_logdev_id, *header); - HS_REL_ASSERT_LE(header->start_idx(), key.idx, "log key offset does not match with log_idx {} }{}", m_logdev_id, - *header); - HS_REL_ASSERT_GT((header->start_idx() + header->nrecords()), key.idx, - "log key offset does not match with log_idx {} {}", m_logdev_id, *header); - HS_LOG_ASSERT_GE(header->total_size(), header->_inline_data_offset(), "Inconsistent size data in log group {} {}", - m_logdev_id, *header); - - // We can only do crc match in read if we have read all the blocks. We don't want to aggressively read more data - // than we need to just to compare CRC for read operation. It can be done during recovery. 
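Callers of the old LogDev::read(key, serialized_log_record&) overload now make two calls, one for the payload and one for the record header, both funneling through the verify_log_group_header() checks introduced below. A minimal migration sketch, with the logdev reference and key assumed to be in scope and error handling elided; not part of the patch.

// Illustrative only.
void read_one_entry(homestore::LogDev& logdev, const homestore::logdev_key& key) {
    homestore::log_buffer payload = logdev.read(key);   // payload only
    homestore::serialized_log_record rec_hdr;
    logdev.read_record_header(key, rec_hdr);             // header fields for the same key
    (void)payload;
}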
- if (header->total_size() <= initial_read_size) { - crc32_t const crc = crc32_ieee(init_crc32, (buf->cbytes() + sizeof(log_group_header)), - header->total_size() - sizeof(log_group_header)); - HS_REL_ASSERT_EQ(header->this_group_crc(), crc, "CRC mismatch on read data"); - } + verify_log_group_header(key.idx, header); auto record_header = header->nth_record(key.idx - header->start_log_idx); uint32_t const data_offset = (record_header->offset + (record_header->get_inlined() ? 0 : header->oob_data_offset)); @@ -318,11 +282,42 @@ log_buffer LogDev::read(const logdev_key& key, serialized_log_record& return_rec ret_view = sisl::byte_view{new_buf, s_cast< uint32_t >(data_offset - rounded_data_offset), record_header->size}; } + return ret_view; +} + +void LogDev::read_record_header(const logdev_key& key, serialized_log_record& return_record_header) { + auto buf = sisl::make_byte_array(initial_read_size, m_flush_size_multiple, sisl::buftag::logread); + auto ec = m_vdev_jd->sync_pread(buf->bytes(), initial_read_size, key.dev_offset); + if (ec) LOGERROR("Failed to read from Journal vdev log_dev={} {} {}", m_logdev_id, ec.value(), ec.message()); + + auto* header = r_cast< const log_group_header* >(buf->cbytes()); + verify_log_group_header(key.idx, header); + + auto record_header = header->nth_record(key.idx - header->start_log_idx); return_record_header = serialized_log_record(record_header->size, record_header->offset, record_header->get_inlined(), record_header->store_seq_num, record_header->store_id); +} - return ret_view; +void LogDev::verify_log_group_header(const logid_t idx, const log_group_header* header) { + HS_REL_ASSERT_EQ(header->magic_word(), LOG_GROUP_HDR_MAGIC, "Log header corrupted with magic mismatch! {} {}", + m_logdev_id, *header); + HS_REL_ASSERT_EQ(header->get_version(), log_group_header::header_version, "Log header version mismatch! {} {}", + m_logdev_id, *header); + HS_REL_ASSERT_LE(header->start_idx(), idx, "log key offset does not match with log_idx {} }{}", m_logdev_id, + *header); + HS_REL_ASSERT_GT((header->start_idx() + header->nrecords()), idx, + "log key offset does not match with log_idx {} {}", m_logdev_id, *header); + HS_LOG_ASSERT_GE(header->total_size(), header->_inline_data_offset(), "Inconsistent size data in log group {} {}", + m_logdev_id, *header); + + // We can only do crc match in read if we have read all the blocks. We don't want to aggressively read more data + // than we need to just to compare CRC for read operation. It can be done during recovery. + if (header->total_size() <= initial_read_size) { + crc32_t const crc = crc32_ieee(init_crc32, (r_cast< const uint8_t* >(header) + sizeof(log_group_header)), + header->total_size() - sizeof(log_group_header)); + HS_REL_ASSERT_EQ(header->this_group_crc(), crc, "CRC mismatch on read data"); + } } logstore_id_t LogDev::reserve_store_id() { @@ -330,7 +325,7 @@ logstore_id_t LogDev::reserve_store_id() { return m_logdev_meta.reserve_store(true /* persist_now */); } -void LogDev::unreserve_store_id(const logstore_id_t store_id) { +void LogDev::unreserve_store_id(logstore_id_t store_id) { std::unique_lock lg{m_meta_mutex}; /* Get the current log_idx as marker and insert into garbage store id. 
Upon device truncation, these ids will @@ -354,7 +349,7 @@ void LogDev::get_registered_store_ids(std::vector< logstore_id_t >& registered, /* * This method prepares the log records to be flushed and returns the log_group which is fully prepared */ -LogGroup* LogDev::prepare_flush(const int32_t estimated_records) { +LogGroup* LogDev::prepare_flush(int32_t estimated_records) { int64_t flushing_upto_idx{-1}; assert(estimated_records > 0); @@ -369,14 +364,13 @@ LogGroup* LogDev::prepare_flush(const int32_t estimated_records) { } }); - lg->finish(m_logdev_id, get_prev_crc()); + lg->finish(m_logdev_id, m_last_crc); if (sisl_unlikely(flushing_upto_idx == -1)) { return nullptr; } lg->m_flush_log_idx_from = m_last_flush_idx + 1; lg->m_flush_log_idx_upto = flushing_upto_idx; HS_DBG_ASSERT_GE(lg->m_flush_log_idx_upto, lg->m_flush_log_idx_from, "log indx upto is smaller then log indx from"); HS_DBG_ASSERT_GT(lg->header()->oob_data_offset, 0); - THIS_LOGDEV_LOG(DEBUG, "Flushing upto log_idx={}", flushing_upto_idx); return lg; } @@ -386,13 +380,11 @@ bool LogDev::can_flush_in_this_thread() { return (!HS_DYNAMIC_CONFIG(logstore.flush_only_in_dedicated_thread) && iomanager.am_i_worker_reactor()); } -// This method checks if in case we were to add a record of size provided, do we enter into a state which exceeds -// our threshold. If so, it first flushes whats accumulated so far and then add the pending flush size counter with -// the new record size -bool LogDev::flush_if_needed(int64_t threshold_size) { - { - std::unique_lock< std::mutex > lk{m_block_flush_q_mutex}; - if (m_stopped) { return false; } +bool LogDev::flush_if_necessary(int64_t threshold_size) { + if (!can_flush_in_this_thread()) { + iomanager.run_on_forget(logstore_service().flush_thread(), + [this, threshold_size]() { flush_if_necessary(threshold_size); }); + return false; } // If after adding the record size, if we have enough to flush or if its been too much time before we actually @@ -404,242 +396,174 @@ bool LogDev::flush_if_needed(int64_t threshold_size) { bool const flush_by_size = (pending_sz >= threshold_size); bool const flush_by_time = !flush_by_size && pending_sz && (elapsed_time > HS_DYNAMIC_CONFIG(logstore.max_time_between_flush_us)); - if (flush_by_size || flush_by_time) { - // First off, check if we can flush in this thread itself, if not, schedule it into different thread - if (!can_flush_in_this_thread()) { - iomanager.run_on_forget(logstore_service().flush_thread(), - [this, threshold_size]() { flush_if_needed(threshold_size); }); - return false; - } - - bool expected_flushing{false}; - if (!m_is_flushing.compare_exchange_strong(expected_flushing, true, std::memory_order_acq_rel)) { - return false; - } - THIS_LOGDEV_LOG(TRACE, - "Flushing now because either pending_size={} is greater than data_threshold={} or " - "elapsed time since last flush={} us is greater than max_time_between_flush={} us", - pending_sz, threshold_size, elapsed_time, - HS_DYNAMIC_CONFIG(logstore.max_time_between_flush_us)); - - m_last_flush_time = Clock::now(); - // We were able to win the flushing competition and now we gather all the flush data and reserve a slot. 
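The block being removed here, together with the new flush_if_necessary()/flush_under_guard()/flush() trio, replaces the old m_is_flushing CAS and blocked-callback queue with a plain flush mutex. A summary of the assumed trigger mapping follows; only the LogDev side is visible in this diff, so the HomeLogStore half is an assumption.

// Assumed mapping of flush triggers onto the new entry points:
//   inline   : LogDev::append_async()  -> flush_if_necessary()        (std::try_to_lock on m_flush_mtx, may skip)
//   timer    : flush timer callback    -> flush_if_necessary()        (same try-lock path)
//   explicit : HomeLogStore::flush() / write_and_flush()
//                                      -> LogDev::flush_under_guard() (blocks on flush_guard())
// flush() itself is now synchronous: it prepares a LogGroup, writes it with sync_pwritev(),
// and on_flush_completion() hands each request's callback to a worker fiber, so completion
// order across lsns is not guaranteed.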
- auto new_idx = m_log_idx.load(std::memory_order_relaxed) - 1; - if (m_last_flush_idx >= new_idx) { - THIS_LOGDEV_LOG(TRACE, "Log idx {} is just flushed", new_idx); - unlock_flush(false); - return false; - } - - auto* lg = prepare_flush(new_idx - m_last_flush_idx + 4); // Estimate 4 more extra in case of parallel writes - if (sisl_unlikely(!lg)) { - THIS_LOGDEV_LOG(TRACE, "Log idx {} last_flush_idx {} prepare flush failed", new_idx, m_last_flush_idx); - unlock_flush(false); - return false; + std::unique_lock lck(m_flush_mtx, std::try_to_lock); + if (lck.owns_lock()) { + if (m_stopped) return false; + return flush(); } - auto sz = m_pending_flush_size.fetch_sub(lg->actual_data_size(), std::memory_order_relaxed); - HS_REL_ASSERT_GE((sz - lg->actual_data_size()), 0, "size {} lg size{}", sz, lg->actual_data_size()); - - off_t offset = m_vdev_jd->alloc_next_append_blk(lg->header()->total_size()); - lg->m_log_dev_offset = offset; - HS_REL_ASSERT_NE(lg->m_log_dev_offset, INVALID_OFFSET, "log dev is full"); - THIS_LOGDEV_LOG(TRACE, "Flushing log group data size={} at offset=0x{} log_group={}", lg->actual_data_size(), - to_hex(offset), *lg); - // THIS_LOGDEV_LOG(DEBUG, "Log Group: {}", *lg); - do_flush(lg); - return true; - } else { - return false; } + return false; } -void LogDev::do_flush(LogGroup* lg) { +bool LogDev::flush_under_guard() { + std::unique_lock lg = flush_guard(); + #ifdef _PRERELEASE if (iomgr_flip::instance()->delay_flip< int >( - "simulate_log_flush_delay", [this, lg]() { do_flush_write(lg); }, m_logdev_id)) { + "simulate_log_flush_delay", [this]() { return flush(); }, m_logdev_id)) { THIS_LOGDEV_LOG(INFO, "Delaying flush by rescheduling the async write"); - return; + return true; } #endif - // if (has_data_service() && data_service().is_fsync_needed()) { - // data_service().fsync([this, lg]() { do_flush_write(lg); }) - // } else { - // do_flush_write(lg); - // } - do_flush_write(lg); + return flush(); } -void LogDev::do_flush_write(LogGroup* lg) { +bool LogDev::flush() { + m_last_flush_time = Clock::now(); + // We were able to win the flushing competition and now we gather all the flush data and reserve a slot. 
+ auto new_idx = m_log_idx.load(std::memory_order_acquire) - 1; + if (m_last_flush_idx >= new_idx) { + THIS_LOGDEV_LOG(TRACE, "Log idx {} is just flushed", new_idx); + return false; + } + + LogGroup* lg = prepare_flush(new_idx - m_last_flush_idx + 4); // Estimate 4 more extra in case of parallel writes + if (sisl_unlikely(!lg)) { + THIS_LOGDEV_LOG(TRACE, "Log idx {} last_flush_idx {} prepare flush failed", new_idx, m_last_flush_idx); + return false; + } + auto sz = m_pending_flush_size.fetch_sub(lg->actual_data_size(), std::memory_order_relaxed); + HS_REL_ASSERT_GE((sz - lg->actual_data_size()), 0, "size {} lg size {}", sz, lg->actual_data_size()); + off_t offset = m_vdev_jd->alloc_next_append_blk(lg->header()->total_size()); + lg->m_log_dev_offset = offset; + + HS_REL_ASSERT_NE(lg->m_log_dev_offset, INVALID_OFFSET, "log dev is full"); + THIS_LOGDEV_LOG(TRACE, "Flushing log group data size={} at offset={} log_group={}", lg->actual_data_size(), offset, + *lg); + HISTOGRAM_OBSERVE(logstore_service().m_metrics, logdev_flush_records_distribution, lg->nrecords()); HISTOGRAM_OBSERVE(logstore_service().m_metrics, logdev_flush_size_distribution, lg->actual_data_size()); - THIS_LOGDEV_LOG(TRACE, "vdev offset={} log group total size={}", lg->m_log_dev_offset, lg->header()->total_size()); - // write log - m_vdev_jd->async_pwritev(lg->iovecs().data(), int_cast(lg->iovecs().size()), lg->m_log_dev_offset) - .thenValue([this, lg](auto) { on_flush_completion(lg); }); + // FIXME:: add logic to handle this error in upper layer + auto error = m_vdev_jd->sync_pwritev(lg->iovecs().data(), int_cast(lg->iovecs().size()), lg->m_log_dev_offset); + if (error) { + THIS_LOGDEV_LOG(ERROR, "Fail to sync write to journal vde , error code {} : {}", error.value(), + error.message()); + return false; + } + + on_flush_completion(lg); + return true; } void LogDev::on_flush_completion(LogGroup* lg) { - lg->m_flush_finish_time = Clock::now(); - lg->m_post_flush_msg_rcvd_time = Clock::now(); + auto done_time = Clock::now(); THIS_LOGDEV_LOG(TRACE, "Flush completed for logid[{} - {}]", lg->m_flush_log_idx_from, lg->m_flush_log_idx_upto); m_log_records->complete(lg->m_flush_log_idx_from, lg->m_flush_log_idx_upto); - m_last_flush_idx = lg->m_flush_log_idx_upto; - const auto flush_ld_key = logdev_key{m_last_flush_idx, lg->m_log_dev_offset + lg->header()->total_size()}; m_last_crc = lg->header()->cur_grp_crc; + std::unordered_map< logid_t, logstore_req* > req_map; auto from_indx = lg->m_flush_log_idx_from; auto upto_indx = lg->m_flush_log_idx_upto; auto dev_offset = lg->m_log_dev_offset; for (auto idx = from_indx; idx <= upto_indx; ++idx) { auto& record = m_log_records->at(idx); - on_io_completion(record.store_id, logdev_key{idx, dev_offset}, flush_ld_key, upto_indx - idx, record.context); - } - lg->m_post_flush_process_done_time = Clock::now(); - - HISTOGRAM_OBSERVE(logstore_service().m_metrics, logdev_flush_done_msg_time_ns, - get_elapsed_time_us(lg->m_flush_finish_time, lg->m_post_flush_msg_rcvd_time)); + logstore_req* req = s_cast< logstore_req* >(record.context); + HomeLogStore* log_store = req->log_store; + HS_LOG_ASSERT_EQ(log_store->get_store_id(), record.store_id, + "Expecting store id in log store and flush completion to match"); + HISTOGRAM_OBSERVE(logstore_service().m_metrics, logstore_append_latency, get_elapsed_time_us(req->start_time)); + log_store->on_write_completion(req, logdev_key{idx, dev_offset}, logdev_key{from_indx, dev_offset}); + req_map[idx] = req; + } + HISTOGRAM_OBSERVE(logstore_service().m_metrics, 
logdev_flush_time_us, + get_elapsed_time_us(m_last_flush_time, done_time)); HISTOGRAM_OBSERVE(logstore_service().m_metrics, logdev_post_flush_processing_latency, - get_elapsed_time_us(lg->m_post_flush_msg_rcvd_time, lg->m_post_flush_process_done_time)); + get_elapsed_time_us(done_time)); free_log_group(lg); - unlock_flush(); -} + m_log_records->truncate(upto_indx); + m_last_flush_idx = upto_indx; + + // since we support out-of-order lsn write, so no need to guarantee the order of logstore write completion + // TODO:: add some logic to guarantee all the callback is done when stop. + for (auto const& [idx, req] : req_map) + iomanager.run_on_forget(iomgr::reactor_regex::random_worker, iomgr::fiber_regex::syncio_only, + [this, dev_offset, idx, req]() { + auto ld_key = logdev_key{idx, dev_offset}; + auto comp_cb = req->log_store->get_comp_cb(); + (req->cb) ? req->cb(req, ld_key) : comp_cb(req, ld_key); + }); +} + +uint64_t LogDev::truncate() { + // Order of this lock has to be preserved. We take externally visible lock which is flush lock first. This + // prevents any further update to tail_lsn and also flushes conurrently with truncation. Then we take the store + // map lock, which is contained in this class and then meta_mutex. Reason for this is, we take meta_mutex under + // other store_map lock, so reversing could cause deadlock + std::unique_lock fg = flush_guard(); + folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); + std::unique_lock mg{m_meta_mutex}; -bool LogDev::run_under_flush_lock(const flush_blocked_callback& cb) { - { - std::unique_lock lk{m_block_flush_q_mutex}; - if (m_stopped) { - THIS_LOGDEV_LOG(WARN, "Trying to lock a flush on a stopped logdev, not locking the flush"); - return false; + logdev_key min_safe_ld_key = logdev_key::out_of_bound_ld_key(); + // Walk through all the stores and find the least logdev_key that we can truncate + for (auto& [store_id, store] : m_id_logstore_map) { + auto lstore = store.log_store; + if (lstore == nullptr) { continue; } + auto const [trunc_lsn, trunc_ld_key, tail_lsn] = lstore->truncate_info(); + if (trunc_lsn == tail_lsn) { + THIS_LOGDEV_LOG(DEBUG, "Store_id={} didn't have any writes since last truncation, skipping ", store_id); + m_logdev_meta.remove_all_rollback_records(store_id, m_stopped /* persist_now */); + continue; } + HS_DBG_ASSERT_GE(trunc_ld_key.idx, m_last_truncate_idx, "Trying to truncate logid which is already truncated"); + m_logdev_meta.update_store_superblk(store_id, logstore_superblk(trunc_lsn + 1), m_stopped /* persist_now */); - bool expected_flushing{false}; - if (!m_is_flushing.compare_exchange_strong(expected_flushing, true, std::memory_order_acq_rel)) { - // Flushing is blocked already, add it to the callback q - if (m_block_flush_q == nullptr) { m_block_flush_q = sisl::VectorPool< flush_blocked_callback >::alloc(); } - m_block_flush_q->emplace_back(cb); - return false; - } + // We found a new minimum logdev_key that we can truncate to + if (trunc_ld_key.idx > 0 && trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; } } - // the contract here is if cb return falses, it means it will unlock_flush by itself (in another thread); - if (cb()) { unlock_flush(); } - return true; -} + // There are no writes or no truncation called for any of the store, so we can't truncate anything + if (min_safe_ld_key == logdev_key::out_of_bound_ld_key() || min_safe_ld_key.idx <= m_last_truncate_idx) return 0; -void LogDev::unlock_flush(bool do_flush) { - std::vector< flush_blocked_callback >* 
flush_q{nullptr}; - { - std::unique_lock lk{m_block_flush_q_mutex}; - if (m_block_flush_q != nullptr) { - flush_q = m_block_flush_q; - m_block_flush_q = nullptr; - } - } + uint64_t const num_records_to_truncate = uint64_cast(min_safe_ld_key.idx - m_last_truncate_idx); - if (flush_q) { - for (auto it = flush_q->begin(); it != flush_q->end(); ++it) { - auto& cb = *it; - if (m_stopped) { - THIS_LOGDEV_LOG(INFO, "Logdev is stopped and thus not processing outstanding flush_lock_q"); - return; - } - if (!cb()) { - // NOTE: Under this if condition DO NOT ASSUME flush lock is still being held. This is because - // callee is saying, I will unlock the flush lock on my own and before returning from cb to here, - // the callee could have schedule a job in other thread and unlock the flush. - std::unique_lock lk{m_block_flush_q_mutex}; - THIS_LOGDEV_LOG(DEBUG, - "flush cb wanted to hold onto the flush lock, so putting the {} remaining entries back " - "to the q at head position which already has {} entries", - std::distance(flush_q->begin(), it), m_block_flush_q ? m_block_flush_q->size() : 0); - - flush_q->erase(flush_q->begin(), it + 1); - if (m_block_flush_q == nullptr) { - m_block_flush_q = flush_q; - } else { - m_block_flush_q->insert(m_block_flush_q->begin(), flush_q->begin(), flush_q->end()); - } + // Truncate them in vdev + m_vdev_jd->truncate(min_safe_ld_key.dev_offset); - return; - } - } - sisl::VectorPool< flush_blocked_callback >::free(flush_q); - } - m_is_flushing.store(false, std::memory_order_release); + // Update the start offset to be read upon restart + m_last_truncate_idx = min_safe_ld_key.idx; + m_logdev_meta.set_start_dev_offset(min_safe_ld_key.dev_offset, min_safe_ld_key.idx, m_stopped /* persist_now */); - // Try to do chain flush if its really needed. - THIS_LOGDEV_LOG(TRACE, "Unlocked the flush, try doing chain flushing if needed"); - // send a message to see if a new flush can be triggered - if (do_flush) { flush_if_needed(); } -} + // When a logstore is removed, it unregisteres the store and keeps the store id in garbage list. We can capture + // these store_ids upto the log_idx which is truncated and then unreserve those. 
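The per-store loop above is the core of the new device truncation: each store reports (trunc_lsn, trunc_ld_key, tail_lsn) via truncate_info() and the logdev truncates at the minimum safe log index across stores. A standalone restatement of that fold, for comparing against the removed do_device_truncate() path; not part of the patch.

// Illustrative restatement of the minimum computation in LogDev::truncate() above.
void fold_store_into_min(const std::shared_ptr< homestore::HomeLogStore >& lstore,
                         homestore::logdev_key& min_safe_ld_key) {
    auto const [trunc_lsn, trunc_ld_key, tail_lsn] = lstore->truncate_info();
    if (trunc_lsn == tail_lsn) { return; } // no writes since the last truncation, no constraint
    if (trunc_ld_key.idx > 0 && trunc_ld_key.idx < min_safe_ld_key.idx) { min_safe_ld_key = trunc_ld_key; }
}
// min_safe_ld_key is seeded with logdev_key::out_of_bound_ld_key(); if it is still out of bound
// after the loop, or does not advance past m_last_truncate_idx, nothing is truncated.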
Now on we can re-use the + // store_id on new store creation + for (auto it{std::cbegin(m_garbage_store_ids)}; it != std::cend(m_garbage_store_ids);) { + if (it->first > min_safe_ld_key.idx) { break; } -uint64_t LogDev::truncate(const logdev_key& key) { - HS_DBG_ASSERT_GE(key.idx, m_last_truncate_idx); - uint64_t const num_records_to_truncate = static_cast< uint64_t >(key.idx - m_last_truncate_idx); - THIS_LOGDEV_LOG(DEBUG, "LogDev::truncate num {} idx {}", num_records_to_truncate, key.idx); - if (num_records_to_truncate > 0) { - HS_PERIODIC_LOG(INFO, logstore, - "Truncating log device upto log_dev={} log_id={} vdev_offset={} truncated {} log records", - m_logdev_id, key.idx, key.dev_offset, num_records_to_truncate); - m_log_records->truncate(key.idx); - off_t new_offset = m_vdev_jd->truncate(key.dev_offset); - THIS_LOGDEV_LOG(DEBUG, "LogDev::truncate done {} offset old {} new {}", key.idx, key.dev_offset, new_offset); - m_last_truncate_idx = key.idx; - - { - std::unique_lock< std::mutex > lk{m_meta_mutex}; - - // Update the start offset to be read upon restart - m_logdev_meta.set_start_dev_offset(new_offset, key.idx + 1, false /* persist_now */); - - // Now that store is truncated, we can reclaim the store ids which are garbaged (if any) earlier -#ifdef _PRERELEASE - bool garbage_collect = false; -#endif - for (auto it{std::cbegin(m_garbage_store_ids)}; it != std::cend(m_garbage_store_ids);) { - if (it->first > key.idx) break; + HS_PERIODIC_LOG(DEBUG, logstore, "Garbage collecting log_store={} in log_dev={} log_idx={}", it->second, + m_logdev_id, it->first); + m_logdev_meta.unreserve_store(it->second, m_stopped /* persist_now */); + it = m_garbage_store_ids.erase(it); + } - HS_PERIODIC_LOG(DEBUG, logstore, "Garbage collecting the log_dev={} log_store={} log_idx={}", - m_logdev_id, it->second, it->first); - m_logdev_meta.unreserve_store(it->second, false /* persist_now */); - it = m_garbage_store_ids.erase(it); -#ifdef _PRERELEASE - garbage_collect = true; -#endif - } + // We can remove the rollback records of those upto which logid is getting truncated + m_logdev_meta.remove_rollback_record_upto(min_safe_ld_key.idx, m_stopped /* persist_now */); + THIS_LOGDEV_LOG(DEBUG, "LogDev::truncate remove rollback {}", min_safe_ld_key.idx); - // We can remove the rollback records of those upto which logid is getting truncated - m_logdev_meta.remove_rollback_record_upto(key.idx, false /* persist_now */); - THIS_LOGDEV_LOG(DEBUG, "LogDev::truncate remove rollback {}", key.idx); - m_logdev_meta.persist(); -#ifdef _PRERELEASE - if (garbage_collect && iomgr_flip::instance()->test_flip("logdev_abort_after_garbage")) { - THIS_LOGDEV_LOG(INFO, "logdev aborting after unreserving garbage ids"); - hs()->crash_simulator().crash(); - return num_records_to_truncate; - } -#endif - } - } + // All logdev meta information is updated in-memory, persist now + m_logdev_meta.persist(); return num_records_to_truncate; } -void LogDev::update_store_superblk(logstore_id_t store_id, const logstore_superblk& lsb, bool persist_now) { - std::unique_lock lg{m_meta_mutex}; - m_logdev_meta.update_store_superblk(store_id, lsb, persist_now); -} - void LogDev::rollback(logstore_id_t store_id, logid_range_t id_range) { std::unique_lock lg{m_meta_mutex}; m_logdev_meta.add_rollback_record(store_id, id_range, true); } /////////////////////////////// LogStore Section /////////////////////////////////////// - void LogDev::handle_unopened_log_stores(bool format) { for (auto it{std::begin(m_unopened_store_io)}; it != 
std::end(m_unopened_store_io); ++it) { LOGINFO("skip log entries for store id {}-{}, ios {}", m_logdev_id, it->first, it->second); @@ -674,7 +598,7 @@ std::shared_ptr< HomeLogStore > LogDev::create_new_log_store(bool append_mode) { HS_REL_ASSERT((it == m_id_logstore_map.end()), "store_id {}-{} already exists", m_logdev_id, store_id); m_id_logstore_map.insert(std::pair(store_id, logstore_info{.log_store = lstore, .append_mode = append_mode})); } - HS_LOG(INFO, logstore, "Created log store log_dev={} log_store={}", m_logdev_id, store_id); + HS_LOG(DEBUG, logstore, "Created log store log_dev={} log_store={}", m_logdev_id, store_id); return lstore; } @@ -703,29 +627,6 @@ void LogDev::remove_log_store(logstore_id_t store_id) { unreserve_store_id(store_id); } -void LogDev::device_truncate_under_lock(const std::shared_ptr< truncate_req > treq) { - run_under_flush_lock([this, treq]() { - iomanager.run_on_forget(logstore_service().truncate_thread(), [this, treq]() { - const logdev_key trunc_upto = do_device_truncate(treq->dry_run); - bool done{false}; - if (treq->cb || treq->wait_till_done) { - { - std::lock_guard< std::mutex > lk{treq->mtx}; - done = (--treq->trunc_outstanding == 0); - treq->m_trunc_upto_result[m_logdev_id] = trunc_upto; - } - } - if (done) { - if (treq->cb) { treq->cb(treq->m_trunc_upto_result); } - if (treq->wait_till_done) { treq->cv.notify_one(); } - } - - unlock_flush(); - }); - return false; // Do not release the flush lock yet, the scheduler will unlock it. - }); -} - void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& sb) { folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); auto it = m_id_logstore_map.find(store_id); @@ -737,137 +638,30 @@ void LogDev::on_log_store_found(logstore_id_t store_id, const logstore_superblk& return; } - LOGINFO("Found a logstore log_dev={} log_store={} with start seq_num={}, Creating a new HomeLogStore instance", - m_logdev_id, store_id, sb.m_first_seq_num); + LOGDEBUG("Found a logstore log_dev={} log_store={} with start lsn={}, Creating a new HomeLogStore instance", + m_logdev_id, store_id, sb.m_first_seq_num); logstore_info& info = it->second; info.log_store = std::make_shared< HomeLogStore >(shared_from_this(), store_id, info.append_mode, sb.m_first_seq_num); info.promise.setValue(info.log_store); } -static thread_local std::vector< std::shared_ptr< HomeLogStore > > s_cur_flush_batch_stores; - -void LogDev::on_io_completion(logstore_id_t id, logdev_key ld_key, logdev_key flush_ld_key, - uint32_t nremaining_in_batch, void* ctx) { - auto* req = s_cast< logstore_req* >(ctx); - HomeLogStore* log_store = req->log_store; - - if (req->is_write) { - HS_LOG_ASSERT_EQ(log_store->get_store_id(), id, "Expecting store id in log store and io completion to match"); - log_store->on_write_completion(req, ld_key); - on_batch_completion(log_store, nremaining_in_batch, flush_ld_key); - } else { - log_store->on_read_completion(req, ld_key); - } -} - -void LogDev::on_logfound(logstore_id_t id, logstore_seq_num_t seq_num, logdev_key ld_key, logdev_key flush_ld_key, +void LogDev::on_logfound(logstore_id_t id, logstore_seq_num_t lsn, logdev_key ld_key, logdev_key flush_ld_key, log_buffer buf, uint32_t nremaining_in_batch) { HomeLogStore* log_store{nullptr}; - { folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx); auto const it = m_id_logstore_map.find(id); if (it == m_id_logstore_map.end()) { auto [unopened_it, inserted] = m_unopened_store_io.insert(std::make_pair<>(id, 0)); - if (inserted) { - // 
-            if (inserted) {
-                // HS_REL_ASSERT(0, "log id {}-{} not found", m_logdev_id, id);
-            }
             ++unopened_it->second;
             return;
         }
         log_store = it->second.log_store.get();
     }
-    if (!log_store) { return; }
-    log_store->on_log_found(seq_num, ld_key, flush_ld_key, buf);
-    on_batch_completion(log_store, nremaining_in_batch, flush_ld_key);
-}
-
-void LogDev::on_batch_completion(HomeLogStore* log_store, uint32_t nremaining_in_batch, logdev_key flush_ld_key) {
-    /* check if it is a first update on this log store */
-    auto id = log_store->get_store_id();
-    const auto it = m_last_flush_info.find(id);
-    if ((it == std::end(m_last_flush_info)) || (it->second != flush_ld_key.idx)) {
-        // first time completion in this batch for a given store_id
-        m_last_flush_info.insert_or_assign(id, flush_ld_key.idx);
-        if (it == std::end(m_last_flush_info)) { s_cur_flush_batch_stores.push_back(log_store->shared_from_this()); }
-    }
-    if (nremaining_in_batch == 0) {
-        // This batch is completed, call all log stores participated in this batch about the end of batch
-        HS_LOG_ASSERT_GT(s_cur_flush_batch_stores.size(), 0U, "Expecting one store to be flushed in batch");
-
-        for (auto& l : s_cur_flush_batch_stores) {
-            l->on_batch_completion(flush_ld_key);
-        }
-        s_cur_flush_batch_stores.clear();
-        m_last_flush_info.clear();
-    }
-}
-
-logdev_key LogDev::do_device_truncate(bool dry_run) {
-    static thread_local std::vector< std::shared_ptr< HomeLogStore > > m_min_trunc_stores;
-    static thread_local std::vector< std::shared_ptr< HomeLogStore > > m_non_participating_stores;
-
-    m_min_trunc_stores.clear();
-    m_non_participating_stores.clear();
-    logdev_key min_safe_ld_key = logdev_key::out_of_bound_ld_key();
-
-    std::string dbg_str{"Format [store_id:trunc_lsn:logidx:dev_trunc_pending?:active_writes_in_trucate?] "};
-
-    {
-        folly::SharedMutexWritePriority::ReadHolder holder(m_store_map_mtx);
-        for (auto& id_logstore : m_id_logstore_map) {
-            auto& store_ptr = id_logstore.second.log_store;
-            const auto& trunc_info = store_ptr->pre_device_truncation();
-
-            if (!trunc_info.pending_dev_truncation && !trunc_info.active_writes_not_part_of_truncation) {
-                // This log store neither has any pending device truncation nor active logstore io going on for now.
-                // Ignore this log store for min safe boundary calculation.
-                fmt::format_to(std::back_inserter(dbg_str), "[{}:None] ", store_ptr->get_store_id());
-                m_non_participating_stores.push_back(store_ptr);
-                continue;
-            }
-
-            fmt::format_to(std::back_inserter(dbg_str), "[{}:{}:{}:{}:{}] ", store_ptr->get_store_id(),
-                           trunc_info.seq_num.load(), trunc_info.ld_key.idx, trunc_info.pending_dev_truncation,
-                           trunc_info.active_writes_not_part_of_truncation);
-            if (trunc_info.ld_key.idx > min_safe_ld_key.idx) { continue; }
-
-            if (trunc_info.ld_key.idx < min_safe_ld_key.idx) {
-                // New minimum safe l entry
-                min_safe_ld_key = trunc_info.ld_key;
-                m_min_trunc_stores.clear();
-            }
-            m_min_trunc_stores.push_back(store_ptr);
-        }
-    }
-
-    if ((min_safe_ld_key == logdev_key::out_of_bound_ld_key()) || (min_safe_ld_key.idx < 0)) {
-        HS_PERIODIC_LOG(DEBUG, logstore,
-                        "[log_dev={}] No log store append on any log stores, skipping device truncation, "
-                        "all_logstore_info:<{}>",
-                        m_logdev_id, dbg_str);
-        return min_safe_ld_key;
-    }
-
-    // Got the safest log id to truncate and actually truncate upto the safe log idx to the log device
-    if (!dry_run) { truncate(min_safe_ld_key); }
-    HS_PERIODIC_LOG(INFO, logstore,
-                    "[log_dev={}] LogDevice truncate, all_logstore_info:<{}> safe log dev key to truncate={}",
-                    m_logdev_id, dbg_str, min_safe_ld_key);
-
-    // We call post device truncation only to the log stores whose prepared truncation points are fully
-    // truncated or to stores which didn't particpate in this device truncation.
-    for (auto& store_ptr : m_min_trunc_stores) {
-        store_ptr->post_device_truncation(min_safe_ld_key);
-    }
-    for (auto& store_ptr : m_non_participating_stores) {
-        store_ptr->post_device_truncation(min_safe_ld_key);
-    }
-    m_min_trunc_stores.clear(); // Not clearing here, would cause a shared_ptr ref holding.
-    m_non_participating_stores.clear();
+    if (!log_store) return;
 
-    return min_safe_ld_key;
+    log_store->on_log_found(lsn, ld_key, flush_ld_key, buf);
 }
 
 nlohmann::json LogDev::dump_log_store(const log_dump_req& dump_req) {
@@ -905,7 +699,6 @@ nlohmann::json LogDev::get_status(int verbosity) const {
     js["time_since_last_log_flush_ns"] = get_elapsed_time_ns(m_last_flush_time);
     if (verbosity == 2) {
         js["logdev_stopped?"] = m_stopped;
-        js["is_log_flushing_now?"] = m_is_flushing.load(std::memory_order_relaxed);
         js["logdev_sb_start_offset"] = m_logdev_meta.get_start_dev_offset();
         js["logdev_sb_num_stores_reserved"] = m_logdev_meta.num_stores_reserved();
     }
@@ -939,20 +732,13 @@ logdev_superblk* LogDevMetadata::create(logdev_id_t id) {
     return sb;
 }
 
-void LogDevMetadata::destroy() {
-    m_rollback_sb.destroy();
-    m_sb.destroy();
-}
-
 void LogDevMetadata::reset() {
     m_id_reserver.reset();
     m_store_info.clear();
 }
 
 void LogDevMetadata::logdev_super_blk_found(const sisl::byte_view& buf, void* meta_cookie) {
-    m_sb.load(buf, meta_cookie);
-    LOGINFO("Logdev superblk found log_dev={}", m_sb->logdev_id);
     HS_REL_ASSERT_EQ(m_sb->get_magic(), logdev_superblk::LOGDEV_SB_MAGIC, "Invalid logdev metablk, magic mismatch");
     HS_REL_ASSERT_EQ(m_sb->get_version(), logdev_superblk::LOGDEV_SB_VERSION, "Invalid version of logdev metablk");
 }
@@ -1027,7 +813,7 @@ void LogDevMetadata::unreserve_store(logstore_id_t store_id, bool persist_now) {
     remove_all_rollback_records(store_id, persist_now);
 
     resize_logdev_sb_if_needed();
-    if (store_id < *m_store_info.rbegin()) {
+    if (!m_store_info.empty() && store_id < *m_store_info.rbegin()) {
         HS_LOG(DEBUG, logstore, "logdev meta not shrunk log_idx={} highest indx {}", store_id, *m_store_info.rbegin(),
                m_sb->num_stores);
         // We have not shrunk the store info, so we need to explicitly clear the store meta in on-disk meta
@@ -1173,4 +959,9 @@ bool LogDevMetadata::resize_rollback_sb_if_needed() {
         return false;
     }
 }
+
+void LogDevMetadata::destroy() {
+    m_rollback_sb.destroy();
+    m_sb.destroy();
+}
 } // namespace homestore
diff --git a/src/lib/logstore/log_dev.hpp b/src/lib/logstore/log_dev.hpp
index 2614ba25c..d5a545453 100644
--- a/src/lib/logstore/log_dev.hpp
+++ b/src/lib/logstore/log_dev.hpp
@@ -261,7 +261,6 @@ class LogGroup {
     log_group_header* header() { return reinterpret_cast< log_group_header* >(m_cur_log_buf); }
     const log_group_header* header() const { return reinterpret_cast< const log_group_header* >(m_cur_log_buf); }
     iovec_array const& iovecs() const { return m_iovecs; }
-    // uint32_t data_size() const { return header()->group_size - sizeof(log_group_header); }
     uint32_t actual_data_size() const { return m_actual_data_size; }
     uint32_t nrecords() const { return m_nrecords; }
 
@@ -293,9 +292,6 @@ class LogGroup {
     off_t m_log_dev_offset;
     uint64_t m_flush_multiple_size{0};
 
-    Clock::time_point m_flush_finish_time;            // Time at which flush is completed
-    Clock::time_point m_post_flush_msg_rcvd_time;     // Time at which flush done message delivered
-    Clock::time_point m_post_flush_process_done_time; // Time at which entire log group cb is called
 
 private:
     log_group_footer* add_and_get_footer();
@@ -386,21 +382,6 @@ std::basic_ostream< charT, traits >& operator<<(std::basic_ostream< charT, trait
 }
 } // namespace homestore
 
-namespace fmt {
-template <>
-struct formatter< homestore::logdev_key > {
-    template < typename ParseContext >
-    constexpr auto parse(ParseContext& ctx) {
-        return ctx.begin();
-    }
-
-    template < typename FormatContext >
-    auto format(homestore::logdev_key const& k, FormatContext& ctx) {
-        return format_to(ctx.out(), "[idx={} dev_offset={}]", k.idx, k.dev_offset);
-    }
-};
-} // namespace fmt
-
 namespace homestore {
 using log_buffer = sisl::byte_view;
 
@@ -501,7 +482,6 @@ class LogDevMetadata {
     ~LogDevMetadata() = default;
 
     logdev_superblk* create(logdev_id_t id);
-    void destroy();
     void reset();
     std::vector< std::pair< logstore_id_t, logstore_superblk > > load();
     void persist();
@@ -530,6 +510,7 @@ class LogDevMetadata {
 
     void logdev_super_blk_found(const sisl::byte_view& buf, void* meta_cookie);
     void rollback_super_blk_found(const sisl::byte_view& buf, void* meta_cookie);
+    void destroy();
 
 private:
     bool resize_logdev_sb_if_needed();
@@ -540,7 +521,6 @@ class LogDevMetadata {
     }
 
     uint32_t store_capacity() const;
-    void remove_all_rollback_records(logstore_id_t id);
 
 private:
     superblk< logdev_superblk > m_sb;
@@ -557,8 +537,8 @@ class JournalVirtualDev;
 
 class log_stream_reader {
 public:
-    log_stream_reader(off_t device_cursor, JournalVirtualDev* vdev, shared< JournalVirtualDev::Descriptor > vdev_jd,
-                      uint64_t min_read_size);
+    log_stream_reader(off_t device_cursor, std::shared_ptr< JournalVirtualDev > vdev,
+                      shared< JournalVirtualDev::Descriptor > vdev_jd, uint64_t min_read_size);
     log_stream_reader(const log_stream_reader&) = delete;
     log_stream_reader& operator=(const log_stream_reader&) = delete;
     log_stream_reader(log_stream_reader&&) noexcept = delete;
@@ -572,7 +552,7 @@ class log_stream_reader {
     sisl::byte_view read_next_bytes(uint64_t nbytes, bool& end_of_stream);
 
private:
-    JournalVirtualDev* m_vdev;
+    std::shared_ptr< JournalVirtualDev > m_vdev;
     shared< JournalVirtualDev::Descriptor > m_vdev_jd; // Journal descriptor.
     sisl::byte_view m_cur_log_buf;
     off_t m_first_group_cursor;
@@ -586,36 +566,26 @@ struct logstore_info {
     bool append_mode;
     folly::SharedPromise< std::shared_ptr< HomeLogStore > > promise{};
 };
-struct truncate_req {
-    std::mutex mtx;
-    std::condition_variable cv;
-    bool wait_till_done{false};
-    bool dry_run{false};
-    device_truncate_cb_t cb;
-    std::unordered_map< logdev_id_t, logdev_key > m_trunc_upto_result;
-    int trunc_outstanding{0};
-};
 
 static std::string const logdev_sb_meta_name{"Logdev_sb"};
 static std::string const logdev_rollback_sb_meta_name{"Logdev_rollback_sb"};
 
+VENUM(flush_mode_t, uint32_t, // Various flush modes (can be or'ed together)
+      INLINE = 1 << 0,        // Allow flush inline with the append
+      TIMER = 1 << 1,         // Allow timer based automatic flush
+      EXPLICIT = 1 << 2,      // Allow the user to explicitly call flush
+);
+
 class LogDev : public std::enable_shared_from_this< LogDev > {
     friend class HomeLogStore;
 
public:
-    // NOTE: Possibly change these in future to include constant correctness
-    typedef std::function< void(logstore_id_t, logdev_key, logdev_key, uint32_t nremaining_in_batch, void*) >
-        log_append_comp_callback;
-    typedef std::function< void(logstore_id_t, logstore_seq_num_t, logdev_key, logdev_key, log_buffer, uint32_t) >
-        log_found_callback;
-    typedef std::function< void(logstore_id_t, const logstore_superblk&) > store_found_callback;
-    typedef std::function< bool(void) > flush_blocked_callback;
-
     static inline int64_t flush_data_threshold_size() {
         return HS_DYNAMIC_CONFIG(logstore.flush_threshold_size) - sizeof(log_group_header);
     }
 
-    LogDev(logdev_id_t logdev_id, JournalVirtualDev* vdev);
+    LogDev(logdev_id_t logdev_id,
+           flush_mode_t flush_mode = static_cast< flush_mode_t >(HS_DYNAMIC_CONFIG(logstore.flush_mode)));
     LogDev(const LogDev&) = delete;
     LogDev& operator=(const LogDev&) = delete;
     LogDev(LogDev&&) noexcept = delete;
@@ -627,8 +597,9 @@ class LogDev : public std::enable_shared_from_this< LogDev > {
     * to the recovery. It is expected that all callbacks are registered before calling the start.
     *
     * @param format: Do we need to format the logdev or not.
+     * @param blk_store: The blk_store associated to this logdev
     */
-    void start(bool format);
+    void start(bool format, std::shared_ptr< JournalVirtualDev > vdev);
 
    /**
     * @brief Stop the logdev. It resets all the parameters it is using and thus can be started later
@@ -637,22 +608,16 @@
     void stop();
 
    /**
-     * @brief Destroy the logdev metablks.
+     * @brief return whether the logdev is stopped or not
     *
     */
-    void destroy();
+    bool is_stopped();
 
    /**
-     * @brief Start the flush timer.
-     *
-     */
-    void start_timer();
-
-    /**
-     * @brief Stop the flush timer.
+     * @brief Destroy the logdev metablks.
     *
     */
-    void stop_timer();
+    void destroy();
 
    /**
     * @brief Append the data to the log device asynchronously. The buffer that is passed is expected to be valid, till
@@ -668,60 +633,50 @@
     * @return logid_t : log_idx of the log of the data.
     */
    logid_t append_async(logstore_id_t store_id, logstore_seq_num_t seq_num, const sisl::io_blob& data,
-                         void* cb_context, bool flush_wait = false);
+                         void* cb_context);
 
    /**
     * @brief Read the log id from the device offset
     *
-     * @param idx log_id to read
-     * @param dev_offset device offset of the log id which was provided upon append. This is needed to locate the log
-     * idx within the device. A log data can be looked up only by pair of log_id and dev_offset.
-     *
-     * @param record_header Pass the pointer to the header of the read record
+     * @param logdev_key : log_id and dev_offset pair to read
     *
     * @return log_buffer : Opaque structure which contains the data blob and its size. It is safe buffer and hence it
     * need not be freed and can be cheaply passed it around.
     */
-    log_buffer read(const logdev_key& key, serialized_log_record& record_header);
 
    /**
-     * @brief Load the data from the blkstore starting with offset. This method loads data in bulk and then call
-     * the registered logfound_cb with key and buffer. NOTE: This method is not thread safe. It is expected to be called
-     * during recovery
-     *
-     * @param offset Log blkstore device offset.
-     */
-    void load(uint64_t offset);
-
-    // callback from blkstore, registered at vdev creation;
-    // void process_logdev_completions(const boost::intrusive_ptr< virtualdev_req >& vd_req);
-
-    /**
-     * @brief Reserve logstore id and persist if needed. It persists the entire map about the logstore id inside the
+     * @brief Read the log id from the device offset
     *
-     * @return uint32_t : Return the reserved id
-     */
-    logstore_id_t reserve_store_id();
-
-    /**
-     * @brief Unreserve the logstore id. It does not immediately unregisters and persist the unregistered map, but it
-     * will add to the waiting list (garbage list) and then during truncation, it actually unreserves and persits map.
+     * @param logdev_key : log_id and dev_offset pair to read
     *
-     * @param store_id
+     * @param record_header Pass the pointer to the header of the read record
     */
-    void unreserve_store_id(logstore_id_t store_id);
+    void read_record_header(const logdev_key& key, serialized_log_record& record_header);
+
+    /// @brief Flush the log device in case the pending data size is at least the threshold size. This is a blocking
+    /// call and hence it is required to run on a thread/fiber which can run blocking io. If not run on such a thread,
+    /// it will redirect the flush to a flush thread and run there.
+    ///
+    /// @param threshold_size [Optional]: Size in bytes after which it will flush; if set to -1, will use default size
+    ///
+    /// @return bool : True if it has flushed the data, false otherwise
+    bool flush_if_necessary(int64_t threshold_size = -1);
+
+    /// @brief : Look at all logstores, find out the safest point upto which they can be truncated, and truncate them.
+    ///
+    /// @return number of log records it has truncated
+    uint64_t truncate();
 
    /**
-     * @brief Is the given store id already reserved.
+     * @brief Rollback the logid range specific to the given store id. This method persists the information
+     * synchronously to the underlying storage. Once rolled back, those logids in this range are ignored (only for
+     * this logstore) during load.
     *
-     *
-     * @return true or false
-     */
-    bool is_reserved_store_id(logstore_id_t id);
-
-    /**
-     * @brief This method persist the store ids reserved/unreserved inside the vdev super block
+     * @param store_id : Store id whose logids are to be rolled back or invalidated
+     * @param id_range : Log id range to rollback/invalidate
     */
-    void persist_store_ids();
+    void rollback(logstore_id_t store_id, logid_range_t id_range);
 
    /**
     * @brief This method get all the store ids that are registered already and out of them which are being garbaged
@@ -732,103 +687,74 @@
     */
    void get_registered_store_ids(std::vector< logstore_id_t >& registered, std::vector< logstore_id_t >& garbage);
 
-    crc32_t get_prev_crc() const { return m_last_crc; }
-
-    /**
-     * @brief This method attempts to block the log flush and then make a callback cb. If it is already blocked,
-     * then after previous flush is completed, it will make the callback (while log flush is still under blocked state)
-     *
-     * @param cb Callback
-     * @return true or false based on if it is able to block the flush right away.
-     */
-    bool run_under_flush_lock(const flush_blocked_callback& cb);
-
-    /**
-     * @brief Unblock the flush. While unblocking if there are other requests to block or any flush pending it first
-     * executes them before unblocking
-     */
-    void unlock_flush(bool do_flush = true);
-
-    /**
-     * @brief Rollback the logid range specific to the given store id. This method persists the information
-     * synchronously to the underlying storage. Once rolledback those logids in this range are ignored (only for
-     * this logstore) during load.
-     *
-     * @param store_id : Store id whose logids are to be rolled back or invalidated
-     * @param id_range : Log id range to rollback/invalidate
-     */
-    void rollback(logstore_id_t store_id, logid_range_t id_range);
-
-    void update_store_superblk(logstore_id_t idx, const logstore_superblk& meta, bool persist_now);
-
     nlohmann::json dump_log_store(const log_dump_req& dum_req);
     nlohmann::json get_status(int verbosity) const;
 
-    bool flush_if_needed(int64_t threshold_size = -1);
+    //////////////////// Logstore management ///////////////////////
+    /// @brief Create a new log store under this log device
+    /// @param append_mode Is this log store in append mode or not. If append mode, write_async call fails and only
+    /// append_async calls succeed.
+    ///
+    /// @return shared< HomeLogStore > : The newly created log store
+    shared< HomeLogStore > create_new_log_store(bool append_mode = false);
+
+    /// @brief Open the log store which was created under this log device. It expects that the log store id is already
+    /// created. Behavior of opening a log store which was never created is unknown. One can create a log store in
+    /// non-append mode, but upon restart, it can be opened in append_mode. The log store is not usable until the
+    /// future is armed with the logstore. It is expected that the caller calls this method before LogDev::start() is
+    /// called, otherwise unopened log devs and log stores are removed.
+    ///
+    /// @param store_id Store id to open the log store
+    /// @param append_mode Is this log store in append mode or not. If append mode, write_async call fails and only
+    /// append_async calls succeed.
+    /// @return future< shared< HomeLogStore > > : Future which will be set with the log store once it is opened
+    folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode);
 
-    bool is_aligned_buf_needed(size_t size) const {
-        return (log_record::is_size_inlineable(size, m_flush_size_multiple) == false);
-    }
+    /// @brief Remove the log store and its associated resources
+    /// @param store_id Store id that was created/opened
+    void remove_log_store(logstore_id_t store_id);
 
-    uint64_t get_flush_size_multiple() const { return m_flush_size_multiple; }
-    logdev_key get_last_flush_ld_key() const { return logdev_key{m_last_flush_idx, m_last_flush_dev_offset}; }
+    /// @return externally visible lock used to avoid concurrent flushes
+    auto flush_guard() { return std::unique_lock(m_flush_mtx); }
 
-    LogDevMetadata& log_dev_meta() { return m_logdev_meta; }
-    static bool can_flush_in_this_thread();
+    /// @brief Do a flush under the protection of the flush guard
+    bool flush_under_guard();
 
-    // Logstore management.
-    std::shared_ptr< HomeLogStore > create_new_log_store(bool append_mode = false);
-    folly::Future< shared< HomeLogStore > > open_log_store(logstore_id_t store_id, bool append_mode);
-    bool close_log_store(logstore_id_t store_id) {
-        // TODO: Implement this method
-        return true;
-    }
-    void remove_log_store(logstore_id_t store_id);
-    void on_io_completion(logstore_id_t id, logdev_key ld_key, logdev_key flush_idx, uint32_t nremaining_in_batch,
-                          void* ctx);
-    void on_log_store_found(logstore_id_t store_id, const logstore_superblk& sb);
-    void on_logfound(logstore_id_t id, logstore_seq_num_t seq_num, logdev_key ld_key, logdev_key flush_ld_key,
-                     log_buffer buf, uint32_t nremaining_in_batch);
-    void on_batch_completion(HomeLogStore* log_store, uint32_t nremaining_in_batch, logdev_key flush_ld_key);
+    ///////////////// Getters ///////////////////////
+    LogDevMetadata& log_dev_meta() { return m_logdev_meta; }
+    logdev_id_t get_id() const { return m_logdev_id; }
+    uint64_t get_flush_size_multiple() const { return m_flush_size_multiple; }
 
-    /**
-     * Truncates the device under lock.
-     *
-     * This function is responsible for truncating the device based on the provided truncate request.
-     * The truncation operation is performed under a lock to ensure thread safety.
-     *
-     * @param treq The truncate request to be processed.
-     */
-    void device_truncate_under_lock(const std::shared_ptr< truncate_req > treq);
+private:
+    void start_timer();
+    void stop_timer();
 
-    void handle_unopened_log_stores(bool format);
-    logdev_id_t get_id() { return m_logdev_id; }
-    shared< JournalVirtualDev::Descriptor > get_journal_descriptor() const { return m_vdev_jd; }
-    bool is_stopped() { return m_stopped; }
+    bool allow_inline_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::INLINE); }
+    bool allow_timer_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::TIMER); }
+    bool allow_explicit_flush() const { return uint32_cast(m_flush_mode) & uint32_cast(flush_mode_t::EXPLICIT); }
 
-    // bool ready_for_truncate() const { return m_vdev_jd->ready_for_truncate(); }
+    void verify_log_group_header(const logid_t idx, const log_group_header* header);
 
-private:
    /**
-     * @brief : truncate up to input log id;
+     * @brief Reserve logstore id and persist if needed. It persists the entire map about the logstore id inside the
     *
-     * @param key : the key containing log id that needs to be truncate up to;
-     * @return number of records to truncate
+     * @return uint32_t : Return the reserved id
     */
-    uint64_t truncate(const logdev_key& key);
+    logstore_id_t reserve_store_id();
 
    /**
-     * Truncates the device.
-     *
-     * This function truncates the device and returns the corresponding logdev_key.
-     *
-     * @param dry_run If set to true, the function performs a dry run without actually truncating the device, it only
-     * updates the corresponding truncation barriers, pretending the truncation happened without actually discarding the
-     * log entries on device.
+     * @brief Unreserve the logstore id. It does not immediately unregister and persist the unregistered map, but it
+     * will add to the waiting list (garbage list) and then during truncation, it actually unreserves and persists the map.
     *
-     * @return The logdev_key representing the truncated device.
+     * @param store_id
     */
-    logdev_key do_device_truncate(bool dry_run = false);
+    void unreserve_store_id(logstore_id_t store_id);
+
+    void on_flush_completion(LogGroup* lg);
+    void on_log_store_found(logstore_id_t store_id, const logstore_superblk& sb);
+    void handle_unopened_log_stores(bool format);
+    void on_logfound(logstore_id_t id, logstore_seq_num_t seq_num, logdev_key ld_key, logdev_key flush_ld_key,
+                     log_buffer buf, uint32_t nremaining_in_batch);
 
    LogGroup* make_log_group(uint32_t estimated_records) {
        m_log_group_pool[m_log_group_idx].reset(estimated_records);
@@ -838,73 +764,52 @@ class LogDev : public std::enable_shared_from_this< LogDev > {
    void free_log_group(LogGroup* lg) { m_log_group_idx = !m_log_group_idx; }
 
    LogGroup* prepare_flush(int32_t estimated_record);
-
-    void do_flush(LogGroup* lg);
-    void do_flush_write(LogGroup* lg);
-    void flush_by_size(uint32_t min_threshold, uint32_t new_record_size = 0, logid_t new_idx = -1);
-    void on_flush_completion(LogGroup* lg);
    void do_load(off_t offset);
+    void assert_next_pages(log_stream_reader& lstream);
 
-#if 0
-    log_group_header* read_validate_header(uint8_t* buf, uint32_t size, bool* read_more);
-    sisl::byte_array read_next_header(uint32_t max_buf_reads);
-#endif
+    /// @brief Force a flush of the log device
+    /// @return whether a real flush was done
+    bool flush();
 
-    void _persist_info_block();
-    void assert_next_pages(log_stream_reader& lstream);
-    void set_flush_status(bool flush_status);
-    bool get_flush_status();
+    bool can_flush_in_this_thread();
 
private:
-    std::unique_ptr< sisl::StreamTracker< log_record > >
-        m_log_records;                              // The container which stores all in-memory log records
-    std::atomic< logid_t > m_log_idx{0};            // Generator of log idx
-    std::atomic< int64_t > m_pending_flush_size{0}; // How much flushable logs are pending
-    std::atomic< bool > m_is_flushing{false}; // Is LogDev currently flushing (so far supports one flusher at a time)
-    bool m_stopped{true}; // Is Logdev stopped. We don't need lock here, because it is updated under flush lock
+    std::unique_ptr< sisl::StreamTracker< log_record > > m_log_records; // Container stores all in-memory log records
+    std::atomic< logid_t > m_log_idx{0};            // Generator of log idx
+    std::atomic< int64_t > m_pending_flush_size{0}; // How much flushable logs are pending
+    bool m_stopped{false}; // Is Logdev stopped. We don't need lock here, because it is updated under flush lock
    logdev_id_t m_logdev_id;
-    JournalVirtualDev* m_vdev{nullptr};
+    std::shared_ptr< JournalVirtualDev > m_vdev;
    shared< JournalVirtualDev::Descriptor > m_vdev_jd; // Journal descriptor.
    HomeStoreSafePtr m_hs;                             // Back pointer to homestore
+    flush_mode_t m_flush_mode;
 
    folly::SharedMutexWritePriority m_store_map_mtx;
    std::unordered_map< logstore_id_t, logstore_info > m_id_logstore_map;
    std::unordered_map< logstore_id_t, uint64_t > m_unopened_store_io;
    std::unordered_set< logstore_id_t > m_unopened_store_id;
-    std::unordered_map< logstore_id_t, logid_t > m_last_flush_info;
-
    std::multimap< logid_t, logstore_id_t > m_garbage_store_ids;
    Clock::time_point m_last_flush_time;
 
    logid_t m_last_flush_idx{-1}; // Track last flushed, last device offset and truncated log idx
-    off_t m_last_flush_dev_offset{0};
-    logid_t m_last_truncate_idx{-1};
-
+    logid_t m_last_truncate_idx{std::numeric_limits< logid_t >::min()}; // logdev truncate up to this idx
    crc32_t m_last_crc{INVALID_CRC32_VALUE};
-    log_append_comp_callback m_append_comp_cb{nullptr};
-    log_found_callback m_logfound_cb{nullptr};
-    store_found_callback m_store_found_cb{nullptr};
 
    // LogDev Info block related fields
    std::mutex m_meta_mutex;
    LogDevMetadata m_logdev_meta;
-
-    // Block flush Q request Q
-    std::mutex m_block_flush_q_mutex;
-    std::condition_variable m_block_flush_q_cv;
-    std::mutex m_comp_mutex;
-    std::vector< flush_blocked_callback >* m_block_flush_q{nullptr};
-
-    void* m_sb_cookie{nullptr};
    uint64_t m_flush_size_multiple{0};
 
    // Pool for creating log group
    LogGroup m_log_group_pool[max_log_group];
    uint32_t m_log_group_idx{0};
-    std::atomic< bool > m_flush_status = false;
 
    // Timer handle
    iomgr::timer_handle_t m_flush_timer_hdl{iomgr::null_timer_handle};
+
+    // If we support inline flush mode, we might schedule a flush operation in the same thread (for example, in the
+    // callback of append_async we schedule another flush), so we need a lock that can be taken multiple times in the
+    // same thread.
+    iomgr::FiberManagerLib::mutex m_flush_mtx;
}; // LogDev
} // namespace homestore
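Because flush_mode_t values are or'able bit flags, a LogDev can be configured with any combination and tests each bit with the allow_*_flush() helpers above. A small self-contained sketch of the same bit arithmetic (the standalone enum and names are illustrative only, not part of the patch):

    #include <cstdint>

    enum class mode : uint32_t { INLINE = 1u << 0, TIMER = 1u << 1, EXPLICIT = 1u << 2 };

    // Equivalent of allow_*_flush(): a mode is enabled when its bit is set in the configured mask.
    constexpr bool has_mode(uint32_t mask, mode m) { return (mask & static_cast< uint32_t >(m)) != 0; }

    int main() {
        // e.g. logstore.flush_mode = INLINE | TIMER: appends and the periodic timer may flush,
        // but an explicit HomeLogStore::flush() call would be rejected by allow_explicit_flush().
        uint32_t mask = static_cast< uint32_t >(mode::INLINE) | static_cast< uint32_t >(mode::TIMER);
        return has_mode(mask, mode::EXPLICIT) ? 1 : 0;
    }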
diff --git a/src/lib/logstore/log_store.cpp b/src/lib/logstore/log_store.cpp
index 672f05558..7c2fbe82f 100644
--- a/src/lib/logstore/log_store.cpp
+++ b/src/lib/logstore/log_store.cpp
@@ -38,77 +38,34 @@ HomeLogStore::HomeLogStore(std::shared_ptr< LogDev > logdev, logstore_id_t id, b
        m_logdev{logdev},
        m_records{"HomeLogStoreRecords", start_lsn - 1},
        m_append_mode{append_mode},
-        m_seq_num{start_lsn},
+        m_start_lsn{start_lsn},
+        m_next_lsn{start_lsn},
+        m_tail_lsn{start_lsn - 1},
        m_fq_name{fmt::format("{} log_dev={}", id, logdev->get_id())},
-        m_metrics{logstore_service().metrics()} {
-    m_truncation_barriers.reserve(10000);
-    m_safe_truncation_boundary.ld_key = m_logdev->get_last_flush_ld_key();
-    m_safe_truncation_boundary.seq_num.store(start_lsn - 1, std::memory_order_release);
-    THIS_LOGSTORE_LOG(TRACE, "m_safe_truncation_boundary.ld_key={}", m_safe_truncation_boundary.ld_key);
-}
-
-bool HomeLogStore::write_sync(logstore_seq_num_t seq_num, const sisl::io_blob& b) {
-    HS_LOG_ASSERT((!iomanager.am_i_worker_reactor()), "Sync can not be done in worker reactor thread");
-
-    // these should be static so that they stay in scope in the lambda in case function ends before lambda completes
-    struct Context {
-        std::mutex write_mutex;
-        std::condition_variable write_cv;
-        bool write_done{false};
-        bool ret{false};
-    };
-    auto ctx = std::make_shared< Context >();
-    this->write_async(
-        seq_num, b, nullptr,
-        [seq_num, this, ctx](homestore::logstore_seq_num_t seq_num_cb, [[maybe_unused]] const sisl::io_blob& b,
-                             homestore::logdev_key ld_key, [[maybe_unused]] void* cb_ctx) {
-            HS_DBG_ASSERT((ld_key && seq_num == seq_num_cb), "Write_Async failed or corrupted");
-            {
-                std::unique_lock< std::mutex > lk{ctx->write_mutex};
-                ctx->write_done = true;
-                ctx->ret = true;
-            }
-            ctx->write_cv.notify_one();
-        },
-        true /* flush_wait */);
-
-    {
-        std::unique_lock< std::mutex > lk{ctx->write_mutex};
-        ctx->write_cv.wait(lk, [&ctx] { return ctx->write_done; });
-    }
-
-    return ctx->ret;
-}
+        m_metrics{logstore_service().metrics()} {}
 
 void HomeLogStore::write_async(logstore_req* req, const log_req_comp_cb_t& cb) {
    HS_LOG_ASSERT((cb || m_comp_cb), "Expected either cb is not null or default cb registered");
    req->cb = (cb ? cb : m_comp_cb);
    req->start_time = Clock::now();
-    if (req->seq_num == 0) {
-        m_safe_truncation_boundary.ld_key = m_logdev->get_last_flush_ld_key();
-        THIS_LOGSTORE_LOG(TRACE, "m_safe_truncation_boundary.ld_key={}", m_safe_truncation_boundary.ld_key);
-    }
+
#ifndef NDEBUG
-    const auto trunc_upto_lsn = truncated_upto();
-    if (req->seq_num <= trunc_upto_lsn) {
-        THIS_LOGSTORE_LOG(ERROR, "Assert: Appending lsn={} lesser than or equal to truncated_upto_lsn={}", req->seq_num,
-                          trunc_upto_lsn);
+    if (req->seq_num < start_lsn()) {
+        THIS_LOGSTORE_LOG(ERROR, "Assert: Writing lsn={} lesser than start_lsn={}", req->seq_num, start_lsn());
        HS_DBG_ASSERT(0, "Assertion");
    }
#endif
-
    m_records.create(req->seq_num);
    COUNTER_INCREMENT(m_metrics, logstore_append_count, 1);
    HISTOGRAM_OBSERVE(m_metrics, logstore_record_size, req->data.size());
-    m_logdev->append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req), req->flush_wait);
+    m_logdev->append_async(m_store_id, req->seq_num, req->data, static_cast< void* >(req));
}
 
 void HomeLogStore::write_async(logstore_seq_num_t seq_num, const sisl::io_blob& b, void* cookie,
-                               const log_write_comp_cb_t& cb, bool flush_wait) {
+                               const log_write_comp_cb_t& cb) {
    // Form an internal request and issue the write
-    auto* req = logstore_req::make(this, seq_num, b, true /* is_write_req */);
+    auto* req = logstore_req::make(this, seq_num, b);
    req->cookie = cookie;
-    req->flush_wait = flush_wait;
 
    write_async(req, [cb](logstore_req* req, logdev_key written_lkey) {
        if (cb) { cb(req->seq_num, req->data, written_lkey, req->cookie); }
@@ -118,20 +75,30 @@ void HomeLogStore::write_async(logstore_seq_num_t seq_num, const sisl::io_blob&
 
 logstore_seq_num_t HomeLogStore::append_async(const sisl::io_blob& b, void* cookie, const log_write_comp_cb_t& cb) {
    HS_DBG_ASSERT_EQ(m_append_mode, true, "append_async can be called only on append only mode");
-    const auto seq_num = m_seq_num.fetch_add(1, std::memory_order_acq_rel);
+    const auto seq_num = m_next_lsn.fetch_add(1, std::memory_order_acq_rel);
    write_async(seq_num, b, cookie, cb);
    return seq_num;
}
 
+void HomeLogStore::write_and_flush(logstore_seq_num_t seq_num, const sisl::io_blob& b) {
+    HS_LOG_ASSERT(iomanager.am_i_sync_io_capable(),
+                  "Write and flush is a blocking IO, which can't run in this thread, please reschedule to a fiber");
+    if (seq_num > m_next_lsn.load(std::memory_order_relaxed)) m_next_lsn.store(seq_num + 1, std::memory_order_relaxed);
+    write_async(seq_num, b, nullptr /* cookie */, nullptr /* cb */);
+    m_logdev->flush_under_guard();
+}
+
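write_and_flush() is the replacement for the removed write_sync(): it bumps next_lsn if needed, issues the async write and then flushes under the flush guard, so it must run on a fiber capable of blocking IO. A hedged usage sketch (the caller, store and blob setup are assumptions, not code from the patch):

    // Assumes `lstore` is an opened shared_ptr< HomeLogStore > in non-append mode and `blob` stays
    // valid for the duration of the call; must be invoked from a sync-io-capable fiber.
    void persist_entry(std::shared_ptr< homestore::HomeLogStore > const& lstore,
                       homestore::logstore_seq_num_t lsn, sisl::io_blob const& blob) {
        lstore->write_and_flush(lsn, blob); // returns only after the record for `lsn` is flushed
    }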
 log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) {
+    HS_LOG_ASSERT(iomanager.am_i_sync_io_capable(),
+                  "Read sync is a blocking IO, which can't run in this thread, reschedule to a fiber");
+
    // If seq_num has not been flushed yet, but issued, then we flush them before reading
    auto const s = m_records.status(seq_num);
    if (s.is_out_of_range || s.is_hole) {
-        // THIS_LOGSTORE_LOG(ERROR, "ld_key not valid {}", seq_num);
-        throw std::out_of_range("key not valid");
+        throw std::out_of_range("key not valid since it has been truncated");
    } else if (!s.is_completed) {
        THIS_LOGSTORE_LOG(TRACE, "Reading lsn={}:{} before flushed, doing flush first", m_store_id, seq_num);
-        flush_sync(seq_num);
+        m_logdev->flush_under_guard();
    }
 
    const auto record = m_records.at(seq_num);
@@ -142,167 +109,104 @@ log_buffer HomeLogStore::read_sync(logstore_seq_num_t seq_num) {
    }
 
    const auto start_time = Clock::now();
-    // THIS_LOGSTORE_LOG(TRACE, "Reading lsn={}:{} mapped to logdev_key=[idx={} dev_offset={}]", m_store_id, seq_num,
-    //                   ld_key.idx, ld_key.dev_offset);
    COUNTER_INCREMENT(m_metrics, logstore_read_count, 1);
-    serialized_log_record header;
-    const auto b = m_logdev->read(ld_key, header);
+    const auto b = m_logdev->read(ld_key);
    HISTOGRAM_OBSERVE(m_metrics, logstore_read_latency, get_elapsed_time_us(start_time));
    return b;
}
 
-#if 0
-void HomeLogStore::read_async(logstore_req* req, const log_found_cb_t& cb) {
-    HS_LOG_ASSERT( ((cb != nullptr) || (m_comp_cb != nullptr)),
-        "Expected either cb is not null or default cb registered");
-    auto record = m_records.at(req->seq_num);
-    logdev_key ld_key = record.m_dev_key;
-    req->cb = cb;
-    m_logdev->read_async(ld_key, (void*)req);
-}
-
-void HomeLogStore::read_async(logstore_seq_num_t seq_num, void* cookie, const log_found_cb_t& cb) {
-    auto record = m_records.at(seq_num);
-    logdev_key ld_key = record.m_dev_key;
-    sisl::io_blob b;
-    auto* req = logstore_req::make(this, seq_num, &b, false /* not write */);
-    read_async(req, [cookie, cb](logstore_seq_num_t seq_num, log_buffer log_buf, void* cookie) {
-        cb(seq, log_buf, cookie);
-        logstore_req::free(req);
-    });
-}
-#endif
+void HomeLogStore::on_write_completion(logstore_req* req, const logdev_key& ld_key, const logdev_key& flush_ld_key) {
+    // Logstore supports out-of-order lsn writes, in that case we need to mark the truncation key for this lsn as the
+    // one which is being written by the higher lsn. This is to ensure that we don't truncate higher lsn's logdev_key
+    // when we truncate the lower lsns.
+    //
+    // out-of-order means we can write lsns in any order and flush them, say we have
+    //-> Write lsn=1
+    //-> Write lsn=4
+    //-> Write lsn=2
+    // and can flush. When we check for contiguous completion or recovery we get upto lsn=2. The moment we have
+    // lsn=3, 4 also will be visible.
+    // This is an additional feature outside of a typical logstore, to allow an external replication engine to control
+    // the logstore. This feature isn't used by RAFT, as we start logstores in append_only mode.
+
+    // TODO: In case of out-of-order lsns, it needs to read the records of the tail_lsn and get their truncation key.
+    // This involves a read lock and an atomic operation. We can optimize this in case the ld_key is updated for the
+    // same batch.
+    logdev_key trunc_key;
+    if (m_tail_lsn < req->seq_num) {
+        m_tail_lsn = req->seq_num;
+        trunc_key = flush_ld_key;
+    } else {
+        // This means out-of-order happened. For example, if lsn=1, 4 are written and flushed, they will be flushed in
+        // LogGroup1. When lsn=2, 3 are written and flushed, they will be flushed in LogGroup2. The m_log_dev_offset of
+        // LogGroup2 is larger than that of LogGroup1. Now, if we want to truncate to lsn 3, we cannot remove LogGroup
+        // 1 from logDev, since that would not only remove lsn 1 but also remove lsn 4. So we set the m_trunc_key of
+        // lsn 2 and 3 to the same as lsn 4 (tail_lsn). When truncation is scheduled, the safe truncation ld_key of this
+        // logstore will be the m_trunc_key of lsn 4, which will keep LogGroup 1 from being removed.
+        trunc_key = m_records.at(m_tail_lsn).m_trunc_key;
+    }
 
-void HomeLogStore::on_write_completion(logstore_req* req, const logdev_key& ld_key) {
-    std::unique_lock lk(m_sync_flush_mtx);
+    atomic_update_max(m_next_lsn, req->seq_num + 1, std::memory_order_acq_rel);
    // Upon completion, create the mapping between seq_num and log dev key
-    m_records.update(req->seq_num, [&](logstore_record& rec) -> bool {
+    m_records.update(req->seq_num, [&ld_key, &trunc_key](logstore_record& rec) -> bool {
        rec.m_dev_key = ld_key;
-        THIS_LOGSTORE_LOG(DEBUG, "Completed write of store lsn {} logdev_key={}", req->seq_num, ld_key);
+        rec.m_trunc_key = trunc_key;
        return true;
    });
-    // assert(flush_ld_key.idx >= m_last_flush_ldkey.idx);
-
-    // Update the maximum lsn we have seen for this batch for this store, it is needed to create truncation barrier
-    m_flush_batch_max_lsn = std::max(m_flush_batch_max_lsn, req->seq_num);
    HISTOGRAM_OBSERVE(m_metrics, logstore_append_latency, get_elapsed_time_us(req->start_time));
-    auto lsn = req->seq_num;
    (req->cb) ? req->cb(req, ld_key) : m_comp_cb(req, ld_key);
-
-    if (m_sync_flush_waiter_lsn.load() == lsn) {
-        // Sync flush is waiting for this lsn to be completed, wake up the sync flush cv
-        m_sync_flush_cv.notify_one();
-    }
-}
-
-void HomeLogStore::on_read_completion(logstore_req* req, const logdev_key& ld_key) {
-    (req->cb) ? req->cb(req, ld_key) : m_comp_cb(req, ld_key);
}
 
 void HomeLogStore::on_log_found(logstore_seq_num_t seq_num, const logdev_key& ld_key, const logdev_key& flush_ld_key,
                                log_buffer buf) {
-    THIS_LOGSTORE_LOG(DEBUG, "Found a log lsn={} logdev_key={}", seq_num, ld_key);
-
-    // Create the mapping between seq_num and log dev key
-    m_records.create_and_complete(seq_num, ld_key);
-    atomic_update_max(m_seq_num, seq_num + 1, std::memory_order_acq_rel);
-    m_flush_batch_max_lsn = std::max(m_flush_batch_max_lsn, seq_num);
-
-    if (seq_num <= m_safe_truncation_boundary.seq_num.load(std::memory_order_acquire)) {
-        THIS_LOGSTORE_LOG(TRACE, "Log lsn={} is already truncated on per device, ignoring", seq_num);
-        return;
-    }
-    if (m_found_cb != nullptr) m_found_cb(seq_num, buf, nullptr);
-}
+    if (seq_num < m_start_lsn) { return; }
 
-void HomeLogStore::on_batch_completion(const logdev_key& flush_batch_ld_key) {
-    assert(m_flush_batch_max_lsn != std::numeric_limits< logstore_seq_num_t >::min());
-
-    // Create a new truncation barrier for this completion key
-    if (m_truncation_barriers.size() && (m_truncation_barriers.back().seq_num >= m_flush_batch_max_lsn)) {
-        m_truncation_barriers.back().ld_key = flush_batch_ld_key;
+    logdev_key trunc_key;
+    if (m_tail_lsn < seq_num) {
+        m_tail_lsn = seq_num;
+        trunc_key = flush_ld_key;
    } else {
-        m_truncation_barriers.push_back({m_flush_batch_max_lsn, flush_batch_ld_key});
+        trunc_key = m_records.at(m_tail_lsn).m_trunc_key;
    }
-    m_flush_batch_max_lsn = std::numeric_limits< logstore_seq_num_t >::min(); // Reset the flush batch for next batch.
-}
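The trunc_key bookkeeping in on_write_completion()/on_log_found() above is easiest to follow as a trace; the LogGroup labels below are illustrative only:

    // write lsn=1, lsn=4   -> flushed together in LogGroup A (flush_ld_key = A)
    //                         tail_lsn=4, records[1].m_trunc_key = records[4].m_trunc_key = A
    // write lsn=2, lsn=3   -> flushed later in LogGroup B (B.idx > A.idx)
    //                         2 and 3 are <= tail_lsn, so they inherit records[tail_lsn].m_trunc_key = A, not B
    // truncate(upto_lsn=3) -> m_trunc_ld_key = records[3].m_trunc_key = A
    //                         LogGroup A is retained, since dropping it would also drop the un-truncated lsn=4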
-void HomeLogStore::truncate(logstore_seq_num_t upto_seq_num, bool in_memory_truncate_only) {
-#if 0
-    if (!iomanager.is_io_thread()) {
-        LOGDFATAL("Expected truncate to be called from iomanager thread. Ignoring truncate");
-        return;
-    }
-#endif
+    // Create the mapping between seq_num and log dev key
+    m_records.create_and_complete(seq_num, logstore_record(ld_key, trunc_key));
 
-#ifndef NDEBUG
-    const auto s = m_safe_truncation_boundary.seq_num.load(std::memory_order_acquire);
-    if (s != -1) {
-        auto cs = get_contiguous_completed_seq_num(s);
-        if (upto_seq_num > cs) {
-            THIS_LOGSTORE_LOG(WARN,
-                              "Truncation issued on seq_num={} outside of contiguous completions={}, "
-                              "still proceeding to truncate",
-                              upto_seq_num, cs);
-        }
-    }
-#endif
+    atomic_update_max(m_next_lsn, seq_num + 1, std::memory_order_acq_rel);
 
-    // First try to block the flushing of logdevice and if we are successfully able to do, then
-    auto shared_this = shared_from_this();
-    m_logdev->run_under_flush_lock([shared_this, upto_seq_num]() {
-        shared_this->do_truncate(upto_seq_num);
-        return true;
-    });
+    if (m_found_cb != nullptr) { m_found_cb(seq_num, buf, nullptr); }
}
 
-// NOTE: This method assumes the flush lock is already acquired by the caller
-void HomeLogStore::do_truncate(logstore_seq_num_t upto_seq_num) {
-    m_records.truncate(upto_seq_num);
-    m_safe_truncation_boundary.seq_num.store(upto_seq_num, std::memory_order_release);
-
-    // Need to update the superblock with meta, we don't persist yet, will be done as part of log dev truncation
-    m_logdev->update_store_superblk(m_store_id, logstore_superblk{upto_seq_num + 1}, false /* persist_now */);
-
-    const int ind = search_max_le(upto_seq_num);
-    if (ind < 0) {
-        // m_safe_truncation_boundary.pending_dev_truncation = false;
-        THIS_LOGSTORE_PERIODIC_LOG(DEBUG,
-                                   "Truncate upto lsn={}, possibly already truncated so ignoring. Current safe device "
-                                   "truncation barrier=",
-                                   upto_seq_num, m_safe_truncation_boundary.ld_key);
-        return;
+void HomeLogStore::truncate(logstore_seq_num_t upto_lsn, bool in_memory_truncate_only) {
+    if (upto_lsn < m_start_lsn) { return; }
+#ifndef NDEBUG
+    auto cs = get_contiguous_completed_seq_num(0);
+    if (upto_lsn > cs) {
+        THIS_LOGSTORE_LOG(WARN,
+                          "Truncation issued on seq_num={} outside of contiguous completions={}, "
+                          "still proceeding to truncate",
+                          upto_lsn, cs);
    }
-    THIS_LOGSTORE_PERIODIC_LOG(
-        DEBUG, "Truncate upto lsn={}, nearest safe device truncation barrier , is_last_barrier={}",
-        upto_seq_num, ind, m_truncation_barriers[ind].ld_key,
-        (ind == static_cast< int >(m_truncation_barriers.size() - 1)));
-
-    m_safe_truncation_boundary.ld_key = m_truncation_barriers[ind].ld_key;
-    THIS_LOGSTORE_LOG(TRACE, "m_safe_truncation_boundary.ld_key={}", m_safe_truncation_boundary.ld_key);
-    m_safe_truncation_boundary.pending_dev_truncation = true;
-
-    m_truncation_barriers.erase(m_truncation_barriers.begin(), m_truncation_barriers.begin() + ind + 1);
-}
-
-// NOTE: This method assumes the flush lock is already acquired by the caller
-const truncation_info& HomeLogStore::pre_device_truncation() {
-    m_safe_truncation_boundary.active_writes_not_part_of_truncation = (m_truncation_barriers.size() > 0);
-    return m_safe_truncation_boundary;
-}
+#endif
 
-// NOTE: This method assumes the flush lock is already acquired by the caller
-void HomeLogStore::post_device_truncation(const logdev_key& trunc_upto_loc) {
-    if (trunc_upto_loc.idx >= m_safe_truncation_boundary.ld_key.idx) {
-        // This method is expected to be called always with this
-        m_safe_truncation_boundary.pending_dev_truncation = false;
-        m_safe_truncation_boundary.ld_key = trunc_upto_loc;
-        THIS_LOGSTORE_LOG(TRACE, "m_safe_truncation_boundary.ld_key={}", m_safe_truncation_boundary.ld_key);
+    if (upto_lsn > m_tail_lsn) {
+        THIS_LOGSTORE_LOG(WARN,
                          "Truncation issued on lsn={} which is greater than tail_lsn={}, truncating upto tail_lsn",
                          upto_lsn, m_tail_lsn.load(std::memory_order_relaxed));
+        m_trunc_ld_key = m_records.at(m_tail_lsn).m_trunc_key;
+        upto_lsn = m_tail_lsn;
    } else {
-        HS_REL_ASSERT(0,
-                      "We expect post_device_truncation to be called only for logstores which has min of all "
-                      "truncation boundaries");
+        m_trunc_ld_key = m_records.at(upto_lsn).m_trunc_key;
+        THIS_LOGSTORE_LOG(TRACE, "Truncating logstore upto lsn={}, m_trunc_ld_key index {} offset {}", upto_lsn,
                          m_trunc_ld_key.idx, m_trunc_ld_key.dev_offset);
    }
+    m_records.truncate(upto_lsn);
+    m_start_lsn.store(upto_lsn + 1);
+    if (!in_memory_truncate_only) { m_logdev->truncate(); }
+}
+
+std::tuple< logstore_seq_num_t, logdev_key, logstore_seq_num_t > HomeLogStore::truncate_info() const {
+    auto const trunc_lsn = m_start_lsn.load(std::memory_order_relaxed) - 1;
+    return std::make_tuple(trunc_lsn, m_trunc_ld_key, m_tail_lsn.load(std::memory_order_relaxed));
}
 
 void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) {
@@ -310,58 +214,30 @@ void HomeLogStore::fill_gap(logstore_seq_num_t seq_num) {
                     seq_num);
 
    logdev_key empty_ld_key;
-    m_records.create_and_complete(seq_num, empty_ld_key);
-}
-
-int HomeLogStore::search_max_le(logstore_seq_num_t input_sn) {
-    int mid{0};
-    int start{-1};
-    int end = int_cast(m_truncation_barriers.size());
-
-    while ((end - start) > 1) {
-        mid = start + (end - start) / 2;
-        const auto& mid_entry = m_truncation_barriers[mid];
-
-        if (mid_entry.seq_num == input_sn) {
-            return mid;
-        } else if (mid_entry.seq_num > input_sn) {
-            end = mid;
-        } else {
-            start = mid;
-        }
-    }
-
-    return (end - 1);
+    m_records.create_and_complete(seq_num, logstore_record(empty_ld_key, empty_ld_key));
}
 
 nlohmann::json HomeLogStore::dump_log_store(const log_dump_req& dump_req) {
    nlohmann::json json_dump{}; // create root object
    json_dump["store_id"] = this->m_store_id;
 
-    const auto trunc_upto = this->truncated_upto();
-    std::remove_const_t< decltype(trunc_upto) > idx{trunc_upto + 1};
-    if (dump_req.start_seq_num != 0) { idx = dump_req.start_seq_num; }
+    int64_t start_idx = std::max(dump_req.start_seq_num, start_lsn());
 
    // must use move operator= operation instead of move copy constructor
    nlohmann::json json_records = nlohmann::json::array();
-    bool end_iterate{false};
-    m_records.foreach_contiguous_completed(
-        idx,
-        [&json_records, &dump_req, &end_iterate, this](int64_t cur_idx, int64_t max_idx,
-                                                       const homestore::logstore_record& record) -> bool {
-            // do a sync read
-            // must use move operator= operation instead of move copy constructor
+    m_records.foreach_all_completed(
        start_idx, [this, &dump_req, &json_records](int64_t, homestore::logstore_record const& rec) -> bool {
            nlohmann::json json_val = nlohmann::json::object();
            serialized_log_record record_header;
 
-            const auto log_buffer{m_logdev->read(record.m_dev_key, record_header)};
-
+            const auto log_buffer = m_logdev->read(rec.m_dev_key);
+            m_logdev->read_record_header(rec.m_dev_key, record_header);
            try {
-                json_val["size"] = static_cast< uint32_t >(record_header.size);
-                json_val["offset"] = static_cast< uint32_t >(record_header.offset);
-                json_val["is_inlined"] = static_cast< uint32_t >(record_header.get_inlined());
-                json_val["store_seq_num"] = static_cast< uint64_t >(record_header.store_seq_num);
-                json_val["store_id"] = static_cast< logstore_id_t >(record_header.store_id);
+                json_val["size"] = uint32_cast(record_header.size);
+                json_val["offset"] = uint32_cast(record_header.offset);
+                json_val["is_inlined"] = uint32_cast(record_header.get_inlined());
+                json_val["lsn"] = uint64_cast(record_header.store_seq_num);
+                json_val["store_id"] = s_cast< logstore_id_t >(record_header.store_id);
            } catch (const std::exception& ex) { THIS_LOGSTORE_LOG(ERROR, "Exception in json dump- {}", ex.what()); }
 
            if (dump_req.verbosity_level == homestore::log_dump_verbosity::CONTENT) {
@@ -371,9 +247,7 @@ nlohmann::json HomeLogStore::dump_log_store(const log_dump_req& dump_req) {
                json_val["content"] = std::move(content);
            }
            json_records.emplace_back(std::move(json_val));
-            int64_t end_idx = std::min(max_idx, dump_req.end_seq_num);
-            end_iterate = (cur_idx < end_idx) ? true : false;
-            return end_iterate;
+            return true;
        });
 
    json_dump["log_records"] = std::move(json_records);
@@ -382,9 +256,7 @@ nlohmann::json HomeLogStore::dump_log_store(const log_dump_req& dump_req) {
 
 void HomeLogStore::foreach (int64_t start_idx, const std::function< bool(logstore_seq_num_t, log_buffer) >& cb) {
    m_records.foreach_all_completed(start_idx, [&](int64_t cur_idx, homestore::logstore_record& record) -> bool {
-        // do a sync read
-        serialized_log_record header;
-        auto log_buf = m_logdev->read(record.m_dev_key, header);
+        auto log_buf = m_logdev->read(record.m_dev_key);
        return cb(cur_idx, log_buf);
    });
}
@@ -397,96 +269,67 @@ logstore_seq_num_t HomeLogStore::get_contiguous_completed_seq_num(logstore_seq_n
    return (logstore_seq_num_t)m_records.completed_upto(from + 1);
}
 
-void HomeLogStore::flush_sync(logstore_seq_num_t upto_seq_num) {
-    // Logdev flush is async call and if flush_sync is called on the same thread which could potentially do logdev
-    // flush, waiting sync would cause deadlock.
-    HS_DBG_ASSERT_EQ(LogDev::can_flush_in_this_thread(), false,
-                     "Logstore flush sync cannot be called on same thread which could do logdev flush");
-
-    std::unique_lock lk(m_single_sync_flush_mtx);
-    if (upto_seq_num == invalid_lsn()) { upto_seq_num = m_records.active_upto(); }
-
-    // if we have flushed already, we are done
-    if (!m_records.status(upto_seq_num).is_active) { return; }
-
-    {
-        std::unique_lock lk(m_sync_flush_mtx);
-
-        // Step 1: Mark the waiter lsn to the seqnum we wanted to wait for. The completion of every lsn checks
-        // for this and if this lsn is completed, will make a callback which signals the cv.
-        m_sync_flush_waiter_lsn.store(upto_seq_num);
-
-        // Step 2: After marking this lsn, we again do a check, to avoid a race where completion checked for no lsn
-        // and the lsn is stored in step 1 above.
-        if (!m_records.status(upto_seq_num).is_active) { return; }
-
-        // Step 3: Force a flush (with least threshold)
-        m_logdev->flush_if_needed(1);
+void HomeLogStore::flush(logstore_seq_num_t upto_lsn) {
+    if (!m_logdev->allow_explicit_flush()) {
+        HS_LOG_ASSERT(false,
                      "Explicit flush is turned off or calling flush on wrong thread for this logdev, ignoring flush");
+        return;
+    }
 
-        // Step 4: Wait for completion
-        m_sync_flush_cv.wait(lk, [this, upto_seq_num] { return !m_records.status(upto_seq_num).is_active; });
+    if (upto_lsn == invalid_lsn()) { upto_lsn = m_records.active_upto(); }
 
-        // NOTE: We are not resetting the lsn because same seq number should never have 2 completions and thus not
-        // doing it saves an atomic instruction
-        THIS_LOGSTORE_LOG(TRACE, "flush_sync over upto_seq_num {}", upto_seq_num);
-    }
+    // if we have flushed already, we are done, else issue a flush
+    if (m_records.status(upto_lsn).is_active) m_logdev->flush_under_guard();
}
 
-uint64_t HomeLogStore::rollback_async(logstore_seq_num_t to_lsn, on_rollback_cb_t cb) {
+bool HomeLogStore::rollback(logstore_seq_num_t to_lsn) {
    // Validate if the lsn to which it is rolledback to is not truncated.
    auto ret = m_records.status(to_lsn + 1);
    if (ret.is_out_of_range) {
        HS_LOG_ASSERT(false, "Attempted to rollback to {} which is already truncated", to_lsn);
-        return 0;
+        return false;
    }
 
-    // Ensure that there are no pending lsn to flush. If so lets flush them now.
-    const auto from_lsn = get_contiguous_issued_seq_num(0);
-    if (get_contiguous_completed_seq_num(0) < from_lsn) { flush_sync(); }
-    HS_DBG_ASSERT_EQ(get_contiguous_completed_seq_num(0), get_contiguous_issued_seq_num(0),
-                     "Still some pending lsns to flush, concurrent write and rollback is not supported");
-
-    // Do an in-memory rollback of lsns before we persist the log ids. This is done, so that subsequent appends can
-    // be queued without waiting for rollback async operation completion. It is safe to do so, since before returning
-    // from this method, we will queue ourselves to the flush lock and thus subsequent writes are guaranteed to go after
-    // this rollback is completed.
-    m_seq_num.store(to_lsn + 1, std::memory_order_release); // Rollback the next append lsn
-    logid_range_t logid_range = std::make_pair(m_records.at(to_lsn + 1).m_dev_key.idx,
-                                               m_records.at(from_lsn).m_dev_key.idx); // Get the logid range to rollback
-    m_records.rollback(to_lsn); // Rollback all bitset records and from here on, we can't access any lsns beyond to_lsn
-
-    m_logdev->run_under_flush_lock([logid_range, to_lsn, this, comp_cb = std::move(cb)]() {
-        iomanager.run_on_forget(logstore_service().truncate_thread(), [logid_range, to_lsn, this, comp_cb]() {
-            // Rollback the log_ids in the range, for this log store (which persists this info in its superblk)
-            m_logdev->rollback(m_store_id, logid_range);
-
-            // Remove all truncation barriers on rolled back lsns
-            for (auto it = std::rbegin(m_truncation_barriers); it != std::rend(m_truncation_barriers); ++it) {
-                if (it->seq_num > to_lsn) {
-                    m_truncation_barriers.erase(std::next(it).base());
-                } else {
-                    break;
-                }
+    bool do_flush{false};
+    do {
+        {
+            std::unique_lock lg = m_logdev->flush_guard();
+            if (m_tail_lsn + 1 < m_next_lsn.load()) {
+                // We should flush any outstanding writes before we proceed with rollback
+                THIS_LOGSTORE_LOG(INFO,
                                  "Rollback is issued while there are some outstanding writes, tail_lsn={}, "
                                  "next_lsn={}, will flush and retry rollback",
                                  m_tail_lsn.load(std::memory_order_relaxed),
                                  m_next_lsn.load(std::memory_order_relaxed));
+                do_flush = true;
+            } else {
+                do_flush = false;
+                logid_range_t logid_range =
+                    std::make_pair(m_records.at(to_lsn + 1).m_dev_key.idx,
                                   m_records.at(m_tail_lsn).m_dev_key.idx); // Get the logid range to rollback
+
+                // Update the next_lsn and tail lsn back to to_lsn and also rollback all stream records; from now on we
+                // can't access any lsns beyond to_lsn
+                m_next_lsn.store(to_lsn + 1, std::memory_order_release); // Rollback the next append lsn
+                m_tail_lsn = to_lsn;
+                m_records.rollback(to_lsn);
+
+                // Rollback the log_ids in the range, for this log store (which persists this info in its superblk)
+                m_logdev->rollback(m_store_id, logid_range);
            }
-            m_flush_batch_max_lsn = invalid_lsn(); // Reset the flush batch for next batch.
-            if (comp_cb) { comp_cb(to_lsn); }
-            m_logdev->unlock_flush();
-        });
-        return false;
-    });
+        }
+        if (do_flush) m_logdev->flush_under_guard();
+    } while (do_flush);
 
-    return from_lsn - to_lsn;
+    return true;
}
 
 nlohmann::json HomeLogStore::get_status(int verbosity) const {
    nlohmann::json js;
    js["append_mode"] = m_append_mode;
-    js["highest_lsn"] = m_seq_num.load(std::memory_order_relaxed);
-    js["max_lsn_in_prev_flush_batch"] = m_flush_batch_max_lsn;
-    js["truncated_upto_logdev_key"] = m_safe_truncation_boundary.ld_key.to_string();
-    js["truncated_upto_lsn"] = m_safe_truncation_boundary.seq_num.load(std::memory_order_relaxed);
-    js["truncation_pending_on_device?"] = m_safe_truncation_boundary.pending_dev_truncation;
-    js["truncation_parallel_to_writes?"] = m_safe_truncation_boundary.active_writes_not_part_of_truncation;
+    js["start_lsn"] = m_start_lsn.load(std::memory_order_relaxed);
+    js["next_lsn"] = m_next_lsn.load(std::memory_order_relaxed);
+    js["tail_lsn"] = m_tail_lsn.load(std::memory_order_relaxed);
    js["logstore_records"] = m_records.get_status(verbosity);
    js["logstore_sb_first_lsn"] = m_logdev->log_dev_meta().store_superblk(m_store_id).m_first_seq_num;
    return js;
diff --git a/src/lib/logstore/log_store_service.cpp b/src/lib/logstore/log_store_service.cpp
index 5d9e0049f..d09d02f7c 100644
--- a/src/lib/logstore/log_store_service.cpp
+++ b/src/lib/logstore/log_store_service.cpp
@@ -115,7 +115,7 @@ void LogStoreService::start(bool format) {
    start_threads();
 
    for (auto& [logdev_id, logdev] : m_id_logdev_map) {
-        logdev->start(format);
+        logdev->start(format, m_logdev_vdev);
    }
}
 
@@ -140,7 +140,7 @@ logdev_id_t LogStoreService::create_new_logdev() {
    folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx);
    logdev_id_t logdev_id = get_next_logdev_id();
    auto logdev = create_new_logdev_internal(logdev_id);
-    logdev->start(true /* format */);
+    logdev->start(true /* format */, m_logdev_vdev);
    COUNTER_INCREMENT(m_metrics, logdevs_count, 1);
    HS_LOG(INFO, logstore, "Created log_dev={}", logdev_id);
    return logdev_id;
@@ -181,7 +181,7 @@ void LogStoreService::delete_unopened_logdevs() {
}
 
 std::shared_ptr< LogDev > LogStoreService::create_new_logdev_internal(logdev_id_t logdev_id) {
-    auto logdev = std::make_shared< LogDev >(logdev_id, m_logdev_vdev.get());
+    auto logdev = std::make_shared< LogDev >(logdev_id);
    const auto it = m_id_logdev_map.find(logdev_id);
    HS_REL_ASSERT((it == m_id_logdev_map.end()), "logdev id {} already exists", logdev_id);
    m_id_logdev_map.insert(std::make_pair<>(logdev_id, logdev));
@@ -192,7 +192,7 @@ void LogStoreService::open_logdev(logdev_id_t logdev_id) {
    folly::SharedMutexWritePriority::WriteHolder holder(m_logdev_map_mtx);
    const auto it = m_id_logdev_map.find(logdev_id);
    if (it == m_id_logdev_map.end()) {
-        auto logdev = std::make_shared< LogDev >(logdev_id, m_logdev_vdev.get());
+        auto logdev = std::make_shared< LogDev >(logdev_id);
        m_id_logdev_map.emplace(logdev_id, logdev);
        LOGDEBUGMOD(logstore, "log_dev={} does not exist, created!", logdev_id);
    }
@@ -231,7 +231,7 @@ void LogStoreService::logdev_super_blk_found(const sisl::byte_view& buf, void* m
        logdev = it->second;
        HS_LOG(DEBUG, logstore, "Log dev superblk found log_dev={}", id);
    } else {
-        logdev = std::make_shared< LogDev >(id, m_logdev_vdev.get());
+        logdev = std::make_shared< LogDev >(id);
        m_id_logdev_map.emplace(id, logdev);
        // when recover logdev meta blk, we get all the logdevs from the superblk. we put them in m_unopened_logdev
after logdev meta blks are all recovered, when a client opens a logdev, we remove it from @@ -290,27 +290,15 @@ void LogStoreService::remove_log_store(logdev_id_t logdev_id, logstore_id_t stor COUNTER_DECREMENT(m_metrics, logstores_count, 1); } -void LogStoreService::device_truncate(const device_truncate_cb_t& cb, bool wait_till_done, bool dry_run) { - const auto treq = std::make_shared< truncate_req >(); - treq->wait_till_done = wait_till_done; - treq->dry_run = dry_run; - treq->cb = cb; - if (treq->wait_till_done) { treq->trunc_outstanding = m_id_logdev_map.size(); } - +void LogStoreService::device_truncate() { // TODO: make device_truncate_under_lock return future and do collectAllFutures; - for (auto& [id, logdev] : m_id_logdev_map) { - logdev->device_truncate_under_lock(treq); - } - - if (treq->wait_till_done) { - std::unique_lock< std::mutex > lk{treq->mtx}; - treq->cv.wait(lk, [&] { return (treq->trunc_outstanding == 0); }); - } + for (auto& [id, logdev] : m_id_logdev_map) + logdev->truncate(); } -void LogStoreService::flush_if_needed() { +void LogStoreService::flush() { for (auto& [id, logdev] : m_id_logdev_map) { - logdev->flush_if_needed(); + logdev->flush_under_guard(); } } @@ -334,22 +322,9 @@ void LogStoreService::start_threads() { ctx->cv.notify_one(); } }); - - m_truncate_fiber = nullptr; - iomanager.create_reactor("logstore_truncater", iomgr::INTERRUPT_LOOP, 2 /* num_fibers */, - [this, ctx](bool is_started) { - if (is_started) { - m_truncate_fiber = iomanager.sync_io_capable_fibers()[0]; - { - std::unique_lock< std::mutex > lk{ctx->mtx}; - ++(ctx->thread_cnt); - } - ctx->cv.notify_one(); - } - }); { std::unique_lock< std::mutex > lk{ctx->mtx}; - ctx->cv.wait(lk, [ctx] { return (ctx->thread_cnt == 2); }); + ctx->cv.wait(lk, [ctx] { return (ctx->thread_cnt == 1); }); } } @@ -398,6 +373,7 @@ LogStoreServiceMetrics::LogStoreServiceMetrics() : sisl::MetricsGroup("LogStores REGISTER_HISTOGRAM(logdev_flush_done_msg_time_ns, "Logdev flush completion msg time in ns"); REGISTER_HISTOGRAM(logdev_post_flush_processing_latency, "Logdev post flush processing (including callbacks) latency"); + REGISTER_HISTOGRAM(logdev_flush_time_us, "time elapsed since last flush time in us"); REGISTER_HISTOGRAM(logdev_fsync_time_us, "Logdev fsync completion time in us"); register_me_to_farm(); diff --git a/src/lib/logstore/log_stream.cpp b/src/lib/logstore/log_stream.cpp index dd5eb5660..c5e732ccf 100644 --- a/src/lib/logstore/log_stream.cpp +++ b/src/lib/logstore/log_stream.cpp @@ -23,7 +23,7 @@ namespace homestore { SISL_LOGGING_DECL(logstore) -log_stream_reader::log_stream_reader(off_t device_cursor, JournalVirtualDev* vdev, +log_stream_reader::log_stream_reader(off_t device_cursor, shared< JournalVirtualDev > vdev, shared< JournalVirtualDev::Descriptor > vdev_jd, uint64_t read_size_multiple) : m_vdev{vdev}, m_vdev_jd{std::move(vdev_jd)}, @@ -164,7 +164,7 @@ sisl::byte_view log_stream_reader::group_in_next_page() { } sisl::byte_view log_stream_reader::read_next_bytes(uint64_t nbytes, bool& end_of_stream) { - // TO DO: Might need to address alignment based on data or fast type + // TODO: Might need to address alignment based on data or fast type const auto prev_pos = m_vdev_jd->seeked_pos(); auto sz_to_read = m_vdev_jd->sync_next_read(nullptr, nbytes); if (sz_to_read == -1) { diff --git a/src/lib/replication/log_store/home_raft_log_store.cpp b/src/lib/replication/log_store/home_raft_log_store.cpp index f49e5a48f..85f55d7dc 100644 --- a/src/lib/replication/log_store/home_raft_log_store.cpp +++ 
b/src/lib/replication/log_store/home_raft_log_store.cpp @@ -37,8 +37,12 @@ SISL_LOGGING_DECL(replication) msg, ##__VA_ARGS__); namespace homestore { -static constexpr store_lsn_t to_store_lsn(uint64_t raft_lsn) { return s_cast< store_lsn_t >(raft_lsn) - 1; } -static constexpr store_lsn_t to_store_lsn(repl_lsn_t repl_lsn) { return repl_lsn - 1; } +static constexpr logstore_seq_num_t to_store_lsn(uint64_t raft_lsn) { + return static_cast< logstore_seq_num_t >(raft_lsn - 1); +} +static constexpr logstore_seq_num_t to_store_lsn(repl_lsn_t repl_lsn) { + return static_cast< logstore_seq_num_t >(repl_lsn - 1); +} static constexpr repl_lsn_t to_repl_lsn(store_lsn_t store_lsn) { return store_lsn + 1; } static uint64_t extract_term(const log_buffer& log_bytes) { @@ -70,12 +74,18 @@ void HomeRaftLogStore::truncate(uint32_t num_reserved_cnt, repl_lsn_t compact_ls REPL_STORE_LOG(INFO, "LogDev={}: Truncating log entries from {} to {}, compact_lsn={}, last_lsn={}", m_logdev_id, start_lsn, truncate_lsn, compact_lsn, last_lsn); + // this will only truncate in memory. + // we rely on the resource mgr timer to trigger the real truncation for all log stores in the system; + // this is friendly for multiple logstores on the same logdev; m_log_store->truncate(truncate_lsn); } } HomeRaftLogStore::HomeRaftLogStore(logdev_id_t logdev_id, logstore_id_t logstore_id, log_found_cb_t const& log_found_cb, - log_replay_done_cb_t const& log_replay_done_cb) { + log_replay_done_cb_t const& log_replay_done_cb) : + // TODO: make this capacity configurable if necessary + // repl_lsn starts from 1, so we set lsn 0 to be a dummy + m_log_entry_cache(100, std::make_pair(0, nullptr)) { m_dummy_log_entry = nuraft::cs_new< nuraft::log_entry >(0, nuraft::buffer::alloc(0), nuraft::log_val_type::app_log); if (logstore_id == UINT32_MAX) { @@ -127,6 +137,13 @@ ulong HomeRaftLogStore::start_index() const { nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::last_entry() const { store_lsn_t max_seq = m_log_store->get_contiguous_issued_seq_num(m_last_durable_lsn); if (max_seq < 0) { return m_dummy_log_entry; } + ulong lsn = to_repl_lsn(max_seq); + auto position_in_cache = lsn % m_log_entry_cache.size(); + { + std::shared_lock lk(m_mutex); + auto nle = m_log_entry_cache[position_in_cache]; + if (nle.first == lsn) return nle.second; + } nuraft::ptr< nuraft::log_entry > nle; try { @@ -147,13 +164,20 @@ ulong HomeRaftLogStore::append(nuraft::ptr< nuraft::log_entry >& entry) { auto const next_seq = m_log_store->append_async(sisl::io_blob{buf->data_begin(), uint32_cast(buf->size()), false /* is_aligned */}, nullptr /* cookie */, [buf](int64_t, sisl::io_blob&, logdev_key, void*) {}); - return to_repl_lsn(next_seq); + ulong lsn = to_repl_lsn(next_seq); + + auto position_in_cache = lsn % m_log_entry_cache.size(); + { + std::unique_lock lk(m_mutex); + m_log_entry_cache[position_in_cache] = std::make_pair(lsn, entry); + } + return lsn; } void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& entry) { auto buf = entry->serialize(); - m_log_store->rollback_async(to_store_lsn(index) - 1, nullptr); + m_log_store->rollback(to_store_lsn(index) - 1); // we need to reset the durable lsn, because its ok to set to lower number as it will be updated on next flush // calls, but it is dangerous to set higher number. 
@@ -164,10 +188,10 @@ void HomeRaftLogStore::write_at(ulong index, nuraft::ptr< nuraft::log_entry >& e } void HomeRaftLogStore::end_of_append_batch(ulong start, ulong cnt) { - store_lsn_t end_lsn = to_store_lsn(start + cnt - 1); - m_log_store->flush_sync(end_lsn); - REPL_STORE_LOG(DEBUG, "end_of_append_batch flushed upto start={} cnt={} lsn={}", start, cnt, start + cnt - 1); + auto end_lsn = to_store_lsn(start + cnt - 1); + m_log_store->flush(end_lsn); m_last_durable_lsn = end_lsn; + REPL_STORE_LOG(TRACE, "end_of_append_batch flushed upto start={} cnt={} lsn={}", start, cnt, start + cnt - 1); } nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore::log_entries(ulong start, ulong end) { @@ -185,6 +209,13 @@ nuraft::ptr< std::vector< nuraft::ptr< nuraft::log_entry > > > HomeRaftLogStore: } nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { + auto position_in_cache = index % m_log_entry_cache.size(); + { + std::shared_lock lk(m_mutex); + auto nle = m_log_entry_cache[position_in_cache]; + if (nle.first == index) return nle.second; + } + nuraft::ptr< nuraft::log_entry > nle; try { auto log_bytes = m_log_store->read_sync(to_store_lsn(index)); @@ -197,6 +228,13 @@ nuraft::ptr< nuraft::log_entry > HomeRaftLogStore::entry_at(ulong index) { } ulong HomeRaftLogStore::term_at(ulong index) { + auto position_in_cache = index % m_log_entry_cache.size(); + { + std::shared_lock lk(m_mutex); + auto nle = m_log_entry_cache[position_in_cache]; + if (nle.first == index) return nle.second->get_term(); + } + ulong term; try { auto log_bytes = m_log_store->read_sync(to_store_lsn(index)); @@ -247,7 +285,7 @@ void HomeRaftLogStore::apply_pack(ulong index, nuraft::buffer& pack) { auto slot = next_slot(); if (index < slot) { // We are asked to apply/insert data behind next slot, so we must rollback before index and then append - m_log_store->rollback_async(to_store_lsn(index) - 1, nullptr); + m_log_store->rollback(to_store_lsn(index) - 1); } else if (index > slot) { // We are asked to apply/insert data after next slot, so we need to fill in with dummy entries upto the slot // before append the entries @@ -285,11 +323,11 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { } } - m_log_store->flush_sync(to_store_lsn(compact_lsn)); + m_log_store->flush(to_store_lsn(compact_lsn)); - // we rely on resrouce mgr timer to trigger truncate for all log stores in system; + // this will only truncate in memory, and not on disk; + // we rely on the resource mgr timer to trigger the real truncation for all log stores in the system; + // this is friendly for multiple logstores on the same logdev; - #ifdef _PRERELEASE if (iomgr_flip::instance()->test_flip("force_home_raft_log_truncate")) { REPL_STORE_LOG(TRACE, "Flip force_home_raft_log_truncate is enabled, force truncation, compact_lsn={}", @@ -302,7 +340,7 @@ bool HomeRaftLogStore::compact(ulong compact_lsn) { } bool HomeRaftLogStore::flush() { - m_log_store->flush_sync(); + m_log_store->flush(); return true; } diff --git a/src/lib/replication/log_store/home_raft_log_store.h index bf1103537..d39842af5 100644 --- a/src/lib/replication/log_store/home_raft_log_store.h +++ b/src/lib/replication/log_store/home_raft_log_store.h @@ -200,6 +200,10 @@ class HomeRaftLogStore : public nuraft::log_store { nuraft::ptr< nuraft::log_entry > m_dummy_log_entry; store_lsn_t m_last_durable_lsn{-1}; folly::Future< folly::Unit > m_log_store_future; + + // raft log entry cache related members + std::shared_mutex 
m_mutex; + std::vector< std::pair< ulong, nuraft::ptr< nuraft::log_entry > > > m_log_entry_cache; }; // helper methods diff --git a/src/tests/test_common/homestore_test_common.hpp b/src/tests/test_common/homestore_test_common.hpp index 7a25c9b12..7aa90b94a 100644 --- a/src/tests/test_common/homestore_test_common.hpp +++ b/src/tests/test_common/homestore_test_common.hpp @@ -190,7 +190,10 @@ class HSTestHelper { homestore::HomeStore::reset_instance(); iomanager.stop(); - if (cleanup) { remove_files(m_generated_devs); } + if (cleanup) { + remove_files(m_generated_devs); + m_generated_devs.clear(); + } } void change_start_cb(hs_before_services_starting_cb_t cb) { m_token.cb() = cb; } diff --git a/src/tests/test_log_dev.cpp b/src/tests/test_log_dev.cpp index ada551940..52208ce89 100644 --- a/src/tests/test_log_dev.cpp +++ b/src/tests/test_log_dev.cpp @@ -149,8 +149,7 @@ class LogDevTest : public ::testing::Test { void insert_sync(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t lsn, uint32_t fixed_size = 0) { bool io_memory{false}; auto* d = prepare_data(lsn, io_memory, fixed_size); - const bool succ = log_store->write_sync(lsn, {uintptr_cast(d), d->total_size(), false}); - EXPECT_TRUE(succ); + log_store->write_and_flush(lsn, {uintptr_cast(d), d->total_size(), false}); LOGINFO("Written sync data for LSN -> {}:{}", log_store->get_store_id(), lsn); if (io_memory) { iomanager.iobuf_free(uintptr_cast(d)); @@ -188,26 +187,7 @@ class LogDevTest : public ::testing::Test { try { read_verify(log_store, lsn); } catch (const std::exception& ex) { - logstore_seq_num_t trunc_upto = 0; - std::mutex mtx; - std::condition_variable cv; - bool get_trunc_upto = false; - log_store->get_logdev()->run_under_flush_lock( - [this, log_store, &trunc_upto, &get_trunc_upto, &mtx, &cv] { - // In case we run truncation in parallel to read, it is possible - // the truncated_upto accordingly. - trunc_upto = log_store->truncated_upto(); - std::unique_lock lock(mtx); - get_trunc_upto = true; - cv.notify_one(); - return true; - }); - std::unique_lock lock(mtx); - cv.wait(lock, [&get_trunc_upto] { return get_trunc_upto == true; }); - if (lsn <= trunc_upto) { - lsn = trunc_upto; - continue; - } + auto trunc_upto = log_store->truncated_upto(); LOGFATAL("Failed to read at upto {} lsn {} trunc_upto {}", upto, lsn, trunc_upto); } } @@ -215,27 +195,10 @@ class LogDevTest : public ::testing::Test { void rollback_validate(std::shared_ptr< HomeLogStore > log_store, logstore_seq_num_t& cur_lsn, uint32_t num_lsns_to_rollback) { - std::mutex mtx; - std::condition_variable cv; - bool rollback_done = false; cur_lsn -= num_lsns_to_rollback; auto const upto_lsn = cur_lsn - 1; - log_store->rollback_async(upto_lsn, [&](logstore_seq_num_t) { - ASSERT_EQ(log_store->get_contiguous_completed_seq_num(-1), upto_lsn) - << "Last completed seq num is not reset after rollback"; - ASSERT_EQ(log_store->get_contiguous_issued_seq_num(-1), upto_lsn) - << "Last issued seq num is not reset after rollback"; - read_all_verify(log_store); - { - std::unique_lock lock(mtx); - rollback_done = true; - } - cv.notify_one(); - }); - - // We wait till async rollback is finished as we do validation. 
- std::unique_lock lock(mtx); - cv.wait(lock, [&rollback_done] { return rollback_done == true; }); + log_store->rollback(upto_lsn); + read_all_verify(log_store); } void truncate_validate(std::shared_ptr< HomeLogStore > log_store) { @@ -243,7 +206,7 @@ class LogDevTest : public ::testing::Test { LOGINFO("truncate_validate upto {}", upto); log_store->truncate(upto); read_all_verify(log_store); - logstore_service().device_truncate(nullptr /* cb */, true /* wait_till_done */); + logstore_service().device_truncate(); } void rollback_records_validate(std::shared_ptr< HomeLogStore > log_store, uint32_t expected_count) { diff --git a/src/tests/test_log_store.cpp b/src/tests/test_log_store.cpp index d2cadd5f3..e76244bff 100644 --- a/src/tests/test_log_store.cpp +++ b/src/tests/test_log_store.cpp @@ -165,7 +165,7 @@ class SampleLogStoreClient { } // Because of restart in tests, we have torce the flush of log entries. - m_log_store->get_logdev()->flush_if_needed(1); + m_log_store->get_logdev()->flush_if_necessary(1); } void iterate_validate(const bool expect_all_completed = false) { @@ -273,25 +273,7 @@ class SampleLogStoreClient { } } - logstore_seq_num_t get_truncated_upto() { - std::mutex mtx; - std::condition_variable cv; - bool get_trunc_upto = false; - logstore_seq_num_t trunc_upto = 0; - m_log_store->get_logdev()->run_under_flush_lock([this, &trunc_upto, &get_trunc_upto, &mtx, &cv]() { - // In case we run truncation in parallel to read, it is possible truncate moved, so adjust - // the truncated_upto accordingly. - trunc_upto = m_log_store->truncated_upto(); - std::unique_lock lock(mtx); - get_trunc_upto = true; - cv.notify_one(); - return true; - }); - - std::unique_lock lock(mtx); - cv.wait(lock, [&get_trunc_upto] { return get_trunc_upto == true; }); - return trunc_upto; - } + logstore_seq_num_t get_truncated_upto() { return m_log_store->truncated_upto(); } void fill_hole_and_validate() { const auto start = m_log_store->truncated_upto(); @@ -321,7 +303,8 @@ class SampleLogStoreClient { LOGINFO("Totally recovered {} non-truncated lsns and {} truncated lsns for store {} log_dev {}", m_n_recovered_lsns, m_n_recovered_truncated_lsns, m_log_store->get_store_id(), m_log_store->get_logdev()->get_id()); - if (m_n_recovered_lsns != (m_cur_lsn.load() - m_truncated_upto_lsn.load() - 1)) { + if (m_truncated_upto_lsn.load() > 0 && + m_n_recovered_lsns != (m_cur_lsn.load() - m_truncated_upto_lsn.load() - 1)) { EXPECT_EQ(m_n_recovered_lsns, m_cur_lsn.load() - m_truncated_upto_lsn.load() - 1) << "Recovered " << m_n_recovered_lsns << " valid lsns for store " << m_log_store->get_store_id() << " Expected to have " << m_cur_lsn.load() - m_truncated_upto_lsn.load() - 1 @@ -331,26 +314,13 @@ class SampleLogStoreClient { } void rollback_validate(uint32_t num_lsns_to_rollback) { - std::mutex mtx; - std::condition_variable cv; - bool rollback_done = false; auto const upto_lsn = m_cur_lsn.fetch_sub(num_lsns_to_rollback) - num_lsns_to_rollback - 1; - m_log_store->rollback_async(upto_lsn, [&](logstore_seq_num_t) { - ASSERT_EQ(m_log_store->get_contiguous_completed_seq_num(-1), upto_lsn) - << "Last completed seq num is not reset after rollback"; - ASSERT_EQ(m_log_store->get_contiguous_issued_seq_num(-1), upto_lsn) - << "Last issued seq num is not reset after rollback"; - read_validate(true); - { - std::unique_lock lock(mtx); - rollback_done = true; - } - cv.notify_one(); - }); - - // We wait till async rollback is finished as we do validation. 
- std::unique_lock lock(mtx); - cv.wait(lock, [&rollback_done] { return rollback_done == true; }); + m_log_store->rollback(upto_lsn); + ASSERT_EQ(m_log_store->get_contiguous_completed_seq_num(-1), upto_lsn) + << "Last completed seq num is not reset after rollback"; + ASSERT_EQ(m_log_store->get_contiguous_issued_seq_num(-1), upto_lsn) + << "Last issued seq num is not reset after rollback"; + read_validate(true); } void read(const logstore_seq_num_t lsn) { @@ -379,7 +349,7 @@ class SampleLogStoreClient { m_truncated_upto_lsn = lsn; } - void flush() { m_log_store->flush_sync(); } + void flush() { m_log_store->flush(); } bool has_all_lsns_truncated() const { return (m_truncated_upto_lsn.load() == (m_cur_lsn.load() - 1)); } @@ -508,13 +478,13 @@ class SampleDB { }); std::vector< logdev_id_t > logdev_id_vec; - for (uint32_t i{0}; i < n_log_stores; ++i) { + for (uint32_t i{0}; i < n_log_devs; ++i) { logdev_id_vec.push_back(logstore_service().create_new_logdev()); } for (uint32_t i{0}; i < n_log_stores; ++i) { m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( - logdev_id_vec[i], bind_this(SampleDB::on_log_insert_completion, 3))); + logdev_id_vec[i % n_log_devs], bind_this(SampleDB::on_log_insert_completion, 3))); } SampleLogStoreClient::s_max_flush_multiple = logstore_service().get_logdev(logdev_id_vec[0])->get_flush_size_multiple(); @@ -553,7 +523,6 @@ class SampleDB { } private: - const static std::string s_fpath_root; std::vector< std::string > m_dev_names; std::function< void() > m_on_schedule_io_cb; test_log_store_comp_cb_t m_io_closure; @@ -562,8 +531,6 @@ class SampleDB { test_common::HSTestHelper m_helper; }; -const std::string SampleDB::s_fpath_root{"/tmp/log_store_dev_"}; - class LogStoreTest : public ::testing::Test { public: LogStoreTest() = default; @@ -574,8 +541,8 @@ class LogStoreTest : public ::testing::Test { virtual ~LogStoreTest() override = default; protected: - virtual void SetUp() override {}; - virtual void TearDown() override {}; + virtual void SetUp() override { SampleDB::instance().start_homestore(); }; + virtual void TearDown() override { SampleDB::instance().shutdown(); }; void init(uint64_t n_total_records, const std::vector< std::pair< size_t, int > >& inp_freqs = {}) { // m_nrecords_waiting_to_issue = std::lround(n_total_records / _batch_size) * _batch_size; @@ -735,7 +702,7 @@ class LogStoreTest : public ::testing::Test { ++skip_truncation; continue; } - lsc->truncate(lsc->m_log_store->get_contiguous_completed_seq_num(-1)); + lsc->truncate(c_seq_num); lsc->read_validate(); } @@ -744,42 +711,41 @@ class LogStoreTest : public ::testing::Test { return; } - bool failed{false}; - logstore_service().device_truncate( - [this, is_parallel_to_write, &failed](auto& trunc_lds) { - bool expect_forward_progress{true}; - uint32_t n_fully_truncated{0}; - if (is_parallel_to_write) { - for (const auto& lsc : SampleDB::instance().m_log_store_clients) { - if (lsc->has_all_lsns_truncated()) ++n_fully_truncated; - } + logstore_service().device_truncate(); - // While inserts are going on, truncation can guaranteed to be forward progressed if none of the - // log stores are fully truncated. If all stores are fully truncated, its obvious no progress, - // but even if one of the store is fully truncated, then it might be possible that logstore is - // holding lowest logdev location and waiting for next flush to finish to move the safe logdev - // location. 
- expect_forward_progress = (n_fully_truncated == 0); - } + bool expect_forward_progress{true}; + uint32_t n_fully_truncated{0}; + if (is_parallel_to_write) { + for (const auto& lsc : SampleDB::instance().m_log_store_clients) { + if (lsc->has_all_lsns_truncated()) ++n_fully_truncated; + } - if (expect_forward_progress) { - for (auto [fid, trunc_loc] : trunc_lds) { - if (trunc_loc == logdev_key::out_of_bound_ld_key()) { - LOGINFO("No forward progress for device truncation yet."); - } else { - // Validate the truncation is actually moving forward - auto idx = m_truncate_log_idx.count(fid) ? m_truncate_log_idx[fid].load() : -1; - if (trunc_loc.idx <= idx) { failed = true; } - ASSERT_GT(trunc_loc.idx, idx); - m_truncate_log_idx[fid].store(trunc_loc.idx); - } - } + // While inserts are going on, truncation is guaranteed to make forward progress if none of the + // log stores are fully truncated. If all stores are fully truncated, there is obviously no progress, + // but even if only one of the stores is fully truncated, it is possible that the logstore is + // holding the lowest logdev location and waiting for the next flush to finish to move the safe logdev + // location. + expect_forward_progress = (n_fully_truncated == 0); + } + + if (expect_forward_progress) { + // we have multiple log stores for each logdev, so we need to update the highest truncated lsn and then do + // the verification + for (const auto& lsc : SampleDB::instance().m_log_store_clients) { + auto fid = lsc->m_log_store->get_logdev()->get_id(); + auto trunc_loc = lsc->m_log_store->get_trunc_ld_key(); + if (trunc_loc == logdev_key::out_of_bound_ld_key()) { + LOGINFO("No forward progress for device truncation yet."); } else { - LOGINFO("Do not expect forward progress for device truncation"); + auto idx = m_truncate_log_idx.count(fid) ? m_truncate_log_idx[fid].load() : -1; + if (trunc_loc.idx > idx) { + m_truncate_log_idx[fid].store(trunc_loc.idx); + } } - }, - true /* wait_till_done */); - ASSERT_FALSE(failed); + } + } else { + LOGINFO("Do not expect forward progress for device truncation"); + } for (auto& logdev : logstore_service().get_all_logdevs()) { auto fid = logdev->get_id(); @@ -790,6 +756,7 @@ class LogStoreTest : public ::testing::Test { it = m_garbage_stores_upto[fid].erase(it); } } + validate_num_stores(); } @@ -965,8 +932,6 @@ TEST_F(LogStoreTest, BurstRandInsertThenTruncate) { } } -// TODO evaluate this test is valid and enable after fixing the flush lock. 
-#if 0 TEST_F(LogStoreTest, BurstSeqInsertAndTruncateInParallel) { const auto num_records = SISL_OPTIONS["num_records"].as< uint32_t >(); const auto iterations = SISL_OPTIONS["iterations"].as< uint32_t >(); @@ -1004,7 +969,6 @@ TEST_F(LogStoreTest, BurstSeqInsertAndTruncateInParallel) { this->truncate_validate(); } } -#endif TEST_F(LogStoreTest, RandInsertsWithHoles) { const auto num_records = SISL_OPTIONS["num_records"].as< uint32_t >(); @@ -1122,7 +1086,7 @@ TEST_F(LogStoreTest, ThrottleSeqInsertThenRecover) { this->iterate_validate(true); LOGINFO("Step 5: Restart homestore"); - SampleDB::instance().start_homestore(true /* restart */); + SampleDB::instance().start_homestore(true); this->recovery_validate(); this->init(num_records); @@ -1130,7 +1094,7 @@ TEST_F(LogStoreTest, ThrottleSeqInsertThenRecover) { this->read_validate(true); LOGINFO("Step 6: Restart homestore again to validate recovery on consecutive restarts"); - SampleDB::instance().start_homestore(true /* restart */); + SampleDB::instance().start_homestore(true); this->recovery_validate(); this->init(num_records); @@ -1147,7 +1111,7 @@ TEST_F(LogStoreTest, ThrottleSeqInsertThenRecover) { this->iterate_validate(true); LOGINFO("Step 10: Restart homestore again to validate recovery after inserts"); - SampleDB::instance().start_homestore(true /* restart */); + SampleDB::instance().start_homestore(true); this->recovery_validate(); this->init(num_records); @@ -1263,8 +1227,7 @@ TEST_F(LogStoreTest, WriteSyncThenRead) { bool io_memory{false}; auto* d = SampleLogStoreClient::prepare_data(i, io_memory); - const bool succ = tmp_log_store->write_sync(i, {uintptr_cast(d), d->total_size(), false}); - EXPECT_TRUE(succ); + tmp_log_store->write_and_flush(i, {uintptr_cast(d), d->total_size(), false}); LOGINFO("Written sync data for LSN -> {}", i); if (io_memory) { @@ -1292,7 +1255,8 @@ SISL_OPTIONS_ENABLE(logging, test_log_store, iomgr, test_common_setup) SISL_OPTION_GROUP(test_log_store, (num_logdevs, "", "num_logdevs", "number of log devs", ::cxxopts::value< uint32_t >()->default_value("4"), "number"), - (num_logstores, "", "num_logstores", "number of log stores", + (num_logstores, "", "num_logstores", + "number of log stores in all; they will be spread evenly across the logdevs", ::cxxopts::value< uint32_t >()->default_value("16"), "number"), (num_records, "", "num_records", "number of record to test", ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), @@ -1307,9 +1271,5 @@ int main(int argc, char* argv[]) { spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); sisl::logging::SetModuleLogLevel("logstore", spdlog::level::level_enum::trace); sisl::logging::SetModuleLogLevel("journalvdev", spdlog::level::level_enum::debug); - - SampleDB::instance().start_homestore(); - const int ret = RUN_ALL_TESTS(); - SampleDB::instance().shutdown(SISL_OPTIONS["num_devs"].as< uint32_t >()); - return ret; + return RUN_ALL_TESTS(); } diff --git a/src/tests/test_log_store_long_run.cpp index a5e6009ba..53ecf317e 100644 --- a/src/tests/test_log_store_long_run.cpp +++ 
flush of log entries. - m_log_store->get_logdev()->flush_if_needed(1); + m_log_store->get_logdev()->flush_if_necessary(1); } void read_validate(const bool expect_all_completed = false) { @@ -164,35 +166,19 @@ class SampleLogStoreClient { } void rollback_validate(uint32_t num_lsns_to_rollback) { - std::mutex mtx; - std::condition_variable cv; - bool rollback_done = false; if (m_log_store->truncated_upto() == m_log_store->get_contiguous_completed_seq_num(-1)) { // No records to rollback. return; } - if ((m_cur_lsn - num_lsns_to_rollback - 1) <= m_log_store->get_contiguous_issued_seq_num(-1)) { return; } - auto const upto_lsn = m_cur_lsn.fetch_sub(num_lsns_to_rollback) - num_lsns_to_rollback - 1; - - m_log_store->rollback_async(upto_lsn, [&](logstore_seq_num_t) { - ASSERT_EQ(m_log_store->get_contiguous_completed_seq_num(-1), upto_lsn) - << "Last completed seq num is not reset after rollback"; - ASSERT_EQ(m_log_store->get_contiguous_issued_seq_num(-1), upto_lsn) - << "Last issued seq num is not reset after rollback"; - read_validate(true); - { - std::unique_lock lock(mtx); - rollback_done = true; - } - cv.notify_one(); - }); - - // We wait till async rollback is finished as we do validation. - std::unique_lock lock(mtx); - cv.wait(lock, [&rollback_done] { return rollback_done == true; }); + m_log_store->rollback(upto_lsn); + ASSERT_EQ(m_log_store->get_contiguous_completed_seq_num(-1), upto_lsn) + << "Last completed seq num is not reset after rollback"; + ASSERT_EQ(m_log_store->get_contiguous_issued_seq_num(-1), upto_lsn) + << "Last issued seq num is not reset after rollback"; + read_validate(true); } void recovery_validate() { @@ -204,13 +190,14 @@ class SampleLogStoreClient { EXPECT_EQ(m_n_recovered_lsns, m_cur_lsn.load() - m_truncated_upto_lsn.load() - 1) << "Recovered " << m_n_recovered_lsns << " valid lsns for store " << m_log_store->get_store_id() << " Expected to have " << m_cur_lsn.load() - m_truncated_upto_lsn.load() - 1 - << " lsns: m_cur_lsn=" << m_cur_lsn.load() << " truncated_upto_lsn=" << m_truncated_upto_lsn; + << " lsns: m_cur_lsn=" << m_cur_lsn.load() << " truncated_upto_lsn=" << m_truncated_upto_lsn + << "store id=" << m_log_store->get_store_id() << " log_dev id=" << m_log_store->get_logdev()->get_id(); assert(false); } } void on_log_found(const logstore_seq_num_t lsn, const log_buffer buf, void* ctx) { - LOGDEBUG("Recovered lsn {}:{} with log data of size {}", m_log_store->get_store_id(), lsn, buf.size()) + // LOGINFO("Recovered lsn {}:{} with log data of size {}", m_log_store->get_store_id(), lsn, buf.size()) EXPECT_LE(lsn, m_cur_lsn.load()) << "Recovered incorrect lsn " << m_log_store->get_store_id() << ":" << lsn << "Expected less than cur_lsn " << m_cur_lsn.load(); auto* tl = r_cast< test_log_data const* >(buf.bytes()); @@ -221,11 +208,12 @@ class SampleLogStoreClient { } void truncate(const logstore_seq_num_t lsn) { + if (lsn <= m_truncated_upto_lsn) return; m_log_store->truncate(lsn); m_truncated_upto_lsn = lsn; } - void flush() { m_log_store->flush_sync(); } + void flush() { m_log_store->flush(); } bool has_all_lsns_truncated() const { return (m_truncated_upto_lsn.load() == (m_cur_lsn.load() - 1)); } @@ -292,6 +280,7 @@ class LogStoreLongRun : public ::testing::Test { public: void start_homestore(bool restart = false) { auto n_log_stores = SISL_OPTIONS["num_logstores"].as< uint32_t >(); + auto n_log_devs = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); if (n_log_stores < 4u) { LOGINFO("Log store test needs minimum 4 log stores for testing, setting them to 4"); 
n_log_stores = 4u; @@ -322,7 +311,7 @@ class LogStoreLongRun : public ::testing::Test { "test_log_store_long_run", {{HS_SERVICE::META, {.size_pct = 5.0}}, {HS_SERVICE::LOG, - {.size_pct = 84.0, .chunk_size = 8 * 1024 * 1024, .min_chunk_size = 8 * 1024 * 1024}}}, + {.size_pct = 84.0, .chunk_size = 64 * 1024 * 1024, .min_chunk_size = 8 * 1024 * 1024}}}, [this, restart, n_log_stores]() { HS_SETTINGS_FACTORY().modifiable_settings([](auto& s) { // Disable flush and resource mgr timer in UT. @@ -332,41 +321,33 @@ class LogStoreLongRun : public ::testing::Test { HS_SETTINGS_FACTORY().save(); }); - logdev_id_t logdev_id; - for (uint32_t i{0}; i < n_log_stores; ++i) { - logdev_id = logstore_service().create_new_logdev(); + std::vector< logdev_id_t > logdev_id_vec; + for (uint32_t i{0}; i < n_log_devs; ++i) + logdev_id_vec.push_back(logstore_service().create_new_logdev()); + + for (uint32_t i{0}; i < n_log_stores; ++i) m_log_store_clients.push_back(std::make_unique< SampleLogStoreClient >( - logdev_id, bind_this(LogStoreLongRun::on_log_insert_completion, 3))); - } + logdev_id_vec[i % n_log_devs], bind_this(LogStoreLongRun::on_log_insert_completion, 3))); + SampleLogStoreClient::s_max_flush_multiple = - logstore_service().get_logdev(logdev_id)->get_flush_size_multiple(); + logstore_service().get_logdev(logdev_id_vec[0])->get_flush_size_multiple(); } } void shutdown(bool cleanup = true) { m_helper.shutdown_homestore(cleanup); - if (cleanup) { - m_log_store_clients.clear(); - m_highest_log_idx.clear(); - } + if (cleanup) { m_log_store_clients.clear(); } } - bool delete_log_store(logstore_id_t store_id) { - bool removed{false}; + void delete_log_store(logstore_id_t store_id) { for (auto it = std::begin(m_log_store_clients); it != std::end(m_log_store_clients); ++it) { if ((*it)->m_log_store->get_store_id() == store_id) { - logstore_service().remove_log_store((*it)->m_logdev_id, store_id); - logstore_service().destroy_log_dev((*it)->m_logdev_id); + auto log_dev_id = (*it)->m_logdev_id; + logstore_service().remove_log_store(log_dev_id, store_id); m_log_store_clients.erase(it); - removed = true; break; } } - return removed; - } - - logid_t highest_log_idx(logdev_id_t fid) { - return m_highest_log_idx.count(fid) ? m_highest_log_idx[fid].load() : -1; } void init() { @@ -408,8 +389,6 @@ class LogStoreLongRun : public ::testing::Test { } void on_log_insert_completion(logdev_id_t fid, logstore_seq_num_t lsn, logdev_key ld_key) { - if (m_highest_log_idx.count(fid) == 0) { m_highest_log_idx[fid] = std::atomic{-1}; } - atomic_update_max(m_highest_log_idx[fid], ld_key.idx); on_insert_completion(fid, lsn, ld_key); } @@ -451,18 +430,13 @@ class LogStoreLongRun : public ::testing::Test { for (size_t i{0}; i < m_log_store_clients.size(); ++i) { const auto& lsc = m_log_store_clients[i]; - - // lsc->truncate(lsc->m_cur_lsn.load() - 1); - const auto t_seq_num = lsc->m_log_store->truncated_upto(); + const auto t_seq_num = lsc->m_log_store->truncated_upto() + 1; const auto c_seq_num = lsc->m_log_store->get_contiguous_completed_seq_num(-1); if (t_seq_num == c_seq_num) { ++skip_truncation; continue; } - - // Truncate at random point between last truncated and completed seq num. 
- std::uniform_int_distribution< int64_t > gen_data_size{t_seq_num, c_seq_num}; - lsc->truncate(gen_data_size(rd)); + lsc->truncate(c_seq_num); lsc->read_validate(); } @@ -471,13 +445,8 @@ class LogStoreLongRun : public ::testing::Test { return; } - logstore_service().device_truncate( - [this](auto& trunc_lds) { - for (auto [fid, trunc_loc] : trunc_lds) { - m_truncate_log_idx[fid].store(trunc_loc.idx); - } - }, - true /* wait_till_done */); + logstore_service().device_truncate(); + validate_num_stores(); } @@ -498,8 +467,6 @@ class LogStoreLongRun : public ::testing::Test { // Delete a random logstore. std::uniform_int_distribution< uint64_t > gen{0, m_log_store_clients.size() - 1}; uint64_t idx = gen(rd); - auto fid = m_log_store_clients[idx]->m_logdev_id; - delete_log_store(m_log_store_clients[idx]->m_store_id); validate_num_stores(); @@ -512,20 +479,13 @@ class LogStoreLongRun : public ::testing::Test { void validate_num_stores() { size_t actual_valid_ids{0}; - size_t actual_garbage_ids{0}; - size_t exp_garbage_store_count{0}; - size_t actual_logdevs{0}; for (auto& logdev : logstore_service().get_all_logdevs()) { - auto fid = logdev->get_id(); std::vector< logstore_id_t > reg_ids, garbage_ids; - logdev->get_registered_store_ids(reg_ids, garbage_ids); actual_valid_ids += reg_ids.size() - garbage_ids.size(); - actual_logdevs++; } - ASSERT_EQ(actual_logdevs, m_log_store_clients.size()); ASSERT_EQ(actual_valid_ids, m_log_store_clients.size()); } @@ -549,12 +509,10 @@ class LogStoreLongRun : public ::testing::Test { private: std::vector< std::unique_ptr< SampleLogStoreClient > > m_log_store_clients; - std::map< logdev_id_t, std::atomic< logid_t > > m_highest_log_idx; uint64_t m_nrecords_waiting_to_issue{0}; uint64_t m_nrecords_waiting_to_complete{0}; std::mutex m_pending_mtx; std::condition_variable m_pending_cv; - std::map< logdev_id_t, std::atomic< logid_t > > m_truncate_log_idx; uint32_t m_q_depth{64}; uint32_t m_batch_size{1}; std::random_device rd{}; @@ -565,6 +523,7 @@ class LogStoreLongRun : public ::testing::Test { TEST_F(LogStoreLongRun, LongRunning) { auto run_time = SISL_OPTIONS["run_time"].as< uint64_t >(); auto num_iterations = SISL_OPTIONS["num_iterations"].as< uint32_t >(); + auto num_records = SISL_OPTIONS["num_records"].as< uint32_t >(); auto start_time = Clock::now(); uint32_t iterations = 1; @@ -572,7 +531,7 @@ TEST_F(LogStoreLongRun, LongRunning) { while (true) { // Start insert of 100 log entries on all logstores with num batch of 10 in parallel fashion as a burst. - kickstart_inserts(100 /* num_records */, 10 /* batch */, 5000 /* q_depth */); + kickstart_inserts(num_records, 10 /* batch */, 5000 /* q_depth */); // Wait for inserts. 
wait_for_inserts(); @@ -620,9 +579,11 @@ TEST_F(LogStoreLongRun, LongRunning) { SISL_OPTIONS_ENABLE(logging, test_log_store_long_run, iomgr, test_common_setup) SISL_OPTION_GROUP(test_log_store_long_run, (num_logstores, "", "num_logstores", "number of log stores", - ::cxxopts::value< uint32_t >()->default_value("1000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), + (num_logdevs, "", "num_logdevs", "number of log devs", + ::cxxopts::value< uint32_t >()->default_value("10"), "number"), (num_records, "", "num_records", "number of record to test", - ::cxxopts::value< uint32_t >()->default_value("10000"), "number"), + ::cxxopts::value< uint32_t >()->default_value("100"), "number"), (num_iterations, "", "num_iterations", "Iterations", ::cxxopts::value< uint32_t >()->default_value("1"), "the number of iterations to run each test"), (run_time, "", "run_time", "running time in seconds", @@ -634,9 +595,15 @@ int main(int argc, char* argv[]) { SISL_OPTIONS_LOAD(parsed_argc, argv, logging, test_log_store_long_run, iomgr, test_common_setup); sisl::logging::SetLogger("test_log_store_long_run"); spdlog::set_pattern("[%D %T%z] [%^%l%$] [%t] %v"); - // sisl::logging::SetModuleLogLevel("logstore", spdlog::level::level_enum::debug); - // sisl::logging::SetModuleLogLevel("journalvdev", spdlog::level::level_enum::info); - const int ret = RUN_ALL_TESTS(); - return ret; + auto num_logstores = SISL_OPTIONS["num_logstores"].as< uint32_t >(); + auto num_logdevs = SISL_OPTIONS["num_logdevs"].as< uint32_t >(); + + if (num_logstores < num_logdevs) { + LOGFATAL("num_logstores {} should be greater or equal than num_logdevs {} to make sure there is at least one " + "logstore per logdev", + num_logstores, num_logdevs); + } + + return RUN_ALL_TESTS(); }
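
For callers migrating to the reworked surface shown in the hunks above (synchronous write_and_flush/rollback/flush on HomeLogStore and the callback-free LogStoreService::device_truncate), a minimal usage sketch follows. The sketch is illustrative only and is not part of the patch: the function name, the way the blob and the HomeLogStore instance are obtained, and the include paths are assumptions based on the headers and tests edited in this change.

// Illustrative sketch -- assumes the public headers install under homestore/ and that a
// HomeLogStore was obtained from LogStoreService by the caller; error handling omitted.
#include <homestore/logstore/log_store.hpp>

void example_log_store_usage(std::shared_ptr< homestore::HomeLogStore > ls, sisl::io_blob const& blob) {
    ls->write_and_flush(100 /* seq_num */, blob);    // replaces the removed write_sync()
    ls->rollback(99 /* to_lsn */);                   // rollback is now synchronous (was rollback_async)
    ls->flush();                                     // replaces flush_sync()
    ls->truncate(50 /* upto_seq_num */);             // per-store truncation, in-memory only
    homestore::logstore_service().device_truncate(); // logdev-wide truncation, no callback/wait arguments
}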