From 83d0375300828f5da5b148a514e6e17ccb684964 Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Fri, 8 May 2026 16:56:18 +0800 Subject: [PATCH] SDSTOR-21465: scrubber phase 1 --- CHANGELOG.md | 13 - conanfile.py | 2 +- src/include/homeobject/common.hpp | 2 +- src/lib/homeobject_impl.hpp | 6 +- src/lib/homestore_backend/CMakeLists.txt | 23 +- .../homestore_backend/MPMCPriorityQueue.hpp | 192 ++ src/lib/homestore_backend/gc_manager.cpp | 16 +- src/lib/homestore_backend/gc_manager.hpp | 12 +- .../homestore_backend/heap_chunk_selector.cpp | 2 - src/lib/homestore_backend/hs_blob_manager.cpp | 16 +- src/lib/homestore_backend/hs_homeobject.cpp | 30 +- src/lib/homestore_backend/hs_homeobject.hpp | 41 +- .../hs_backend_config.fbs | 4 + .../resync_blob_data.fbs | 0 .../resync_pg_data.fbs | 0 .../resync_shard_data.fbs | 0 .../hs_homeobject_fbs/scrub_common.fbs | 25 + .../hs_homeobject_fbs/scrub_req.fbs | 17 + .../hs_homeobject_fbs/scrub_result.fbs | 11 + src/lib/homestore_backend/hs_http_manager.cpp | 400 +++- src/lib/homestore_backend/hs_http_manager.hpp | 54 + src/lib/homestore_backend/hs_pg_manager.cpp | 268 ++- .../homestore_backend/hs_shard_manager.cpp | 14 +- .../replication_state_machine.cpp | 35 +- .../replication_state_machine.hpp | 8 + src/lib/homestore_backend/scrub_manager.cpp | 2123 +++++++++++++++++ src/lib/homestore_backend/scrub_manager.hpp | 366 +++ .../homestore_backend/tests/CMakeLists.txt | 9 + .../tests/homeobj_fixture.hpp | 23 +- .../tests/hs_scrubber_tests.cpp | 1251 ++++++++++ .../tests/test_mpmc_priority_queue.cpp | 413 ++++ 31 files changed, 5274 insertions(+), 102 deletions(-) delete mode 100644 CHANGELOG.md create mode 100644 src/lib/homestore_backend/MPMCPriorityQueue.hpp rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/hs_backend_config.fbs (93%) rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/resync_blob_data.fbs (100%) rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/resync_pg_data.fbs (100%) rename src/lib/homestore_backend/{ => hs_homeobject_fbs}/resync_shard_data.fbs (100%) create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/scrub_req.fbs create mode 100644 src/lib/homestore_backend/hs_homeobject_fbs/scrub_result.fbs create mode 100644 src/lib/homestore_backend/scrub_manager.cpp create mode 100644 src/lib/homestore_backend/scrub_manager.hpp create mode 100644 src/lib/homestore_backend/tests/hs_scrubber_tests.cpp create mode 100644 src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 51f00cd2e..000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,13 +0,0 @@ -# Changelog -All notable changes to this project will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
- -## [Unreleased] - -### Added - -- Created repository - -[Unreleased]: https://github.com/eBay/HomeObject/compare/...HEAD diff --git a/conanfile.py b/conanfile.py index 917669267..81eac46ed 100644 --- a/conanfile.py +++ b/conanfile.py @@ -10,7 +10,7 @@ class HomeObjectConan(ConanFile): name = "homeobject" - version = "4.1.10" + version = "4.2.0" homepage = "https://github.com/eBay/HomeObject" description = "Blob Store built on HomeStore" diff --git a/src/include/homeobject/common.hpp b/src/include/homeobject/common.hpp index 29a0589a9..63eff1305 100644 --- a/src/include/homeobject/common.hpp +++ b/src/include/homeobject/common.hpp @@ -14,7 +14,7 @@ SISL_LOGGING_DECL(homeobject); -#define HOMEOBJECT_LOG_MODS homeobject, blobmgr, shardmgr, gcmgr +#define HOMEOBJECT_LOG_MODS homeobject, blobmgr, shardmgr, gcmgr, scrubmgr #ifndef Ki constexpr uint64_t Ki = 1024ul; diff --git a/src/lib/homeobject_impl.hpp b/src/lib/homeobject_impl.hpp index 4eb2af48f..b905cbc96 100644 --- a/src/lib/homeobject_impl.hpp +++ b/src/lib/homeobject_impl.hpp @@ -88,7 +88,8 @@ class HomeObjectImpl : public HomeObject, public std::enable_shared_from_this< HomeObjectImpl > { /// Implementation defines these - virtual ShardManager::AsyncResult< ShardInfo > _create_shard(pg_id_t, uint64_t size_bytes, std::string meta, trace_id_t tid) = 0; + virtual ShardManager::AsyncResult< ShardInfo > _create_shard(pg_id_t, uint64_t size_bytes, std::string meta, + trace_id_t tid) = 0; virtual ShardManager::AsyncResult< ShardInfo > _seal_shard(ShardInfo const&, trace_id_t tid) = 0; virtual BlobManager::AsyncResult< blob_id_t > _put_blob(ShardInfo const&, Blob&&, trace_id_t tid) = 0; @@ -189,7 +190,8 @@ class HomeObjectImpl : public HomeObject, /// ShardManager ShardManager::AsyncResult< ShardInfo > get_shard(shard_id_t id, trace_id_t tid) const final; - ShardManager::AsyncResult< ShardInfo > create_shard(pg_id_t pg_owner, uint64_t size_bytes, std::string meta, trace_id_t tid) final; + ShardManager::AsyncResult< ShardInfo > create_shard(pg_id_t pg_owner, uint64_t size_bytes, std::string meta, + trace_id_t tid) final; ShardManager::AsyncResult< InfoList > list_shards(pg_id_t pg, trace_id_t tid) const final; ShardManager::AsyncResult< ShardInfo > seal_shard(shard_id_t id, trace_id_t tid) final; uint64_t get_current_timestamp(); diff --git a/src/lib/homestore_backend/CMakeLists.txt b/src/lib/homestore_backend/CMakeLists.txt index 7fd3d6fe2..88a132ada 100644 --- a/src/lib/homestore_backend/CMakeLists.txt +++ b/src/lib/homestore_backend/CMakeLists.txt @@ -30,6 +30,8 @@ target_sources("${PROJECT_NAME}_homestore" PRIVATE hs_cp_callbacks.cpp hs_http_manager.cpp gc_manager.cpp + scrub_manager.cpp + MPMCPriorityQueue.hpp $ ) target_link_libraries("${PROJECT_NAME}_homestore" PUBLIC @@ -42,10 +44,14 @@ settings_gen_cpp( ${FLATBUFFERS_FLATC_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/generated/ "${PROJECT_NAME}_homestore" - hs_backend_config.fbs - resync_pg_data.fbs - resync_shard_data.fbs - resync_blob_data.fbs + hs_homeobject_fbs/hs_backend_config.fbs + hs_homeobject_fbs/resync_pg_data.fbs + hs_homeobject_fbs/resync_shard_data.fbs + hs_homeobject_fbs/resync_blob_data.fbs + + hs_homeobject_fbs/scrub_common.fbs + hs_homeobject_fbs/scrub_req.fbs + hs_homeobject_fbs/scrub_result.fbs ) # Unit test objects @@ -165,3 +171,12 @@ add_test(NAME HomestoreTestGC COMMAND homestore_test_gc -csv error --executor im --override_config hs_backend_config.gc_enable_read_verify=true --override_config hs_backend_config.gc_garbage_rate_threshold=0 --override_config 
hs_backend_config.gc_scan_interval_sec=5) + +add_executable(homestore_test_scrubber) +target_sources(homestore_test_scrubber PRIVATE $) +target_link_libraries(homestore_test_scrubber PUBLIC homeobject_homestore ${COMMON_TEST_DEPS}) +add_test(NAME HomestoreTestScrubber COMMAND homestore_test_scrubber -csv error --executor immediate --config_path ./ + --override_config hs_backend_config.enable_scrubber=true + --override_config nuraft_mesg_config.mesg_factory_config.data_request_deadline_secs:10) + + diff --git a/src/lib/homestore_backend/MPMCPriorityQueue.hpp b/src/lib/homestore_backend/MPMCPriorityQueue.hpp new file mode 100644 index 000000000..9b0ba02d5 --- /dev/null +++ b/src/lib/homestore_backend/MPMCPriorityQueue.hpp @@ -0,0 +1,192 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace homeobject { + +/** + * @brief Multi-Producer Multi-Consumer Priority Queue (C++20) + * + * Thread-safe priority queue that supports: + * - Concurrent push operations from multiple producers + * - Concurrent pop operations from multiple consumers + * - Blocking pop when queue is empty + * - Graceful shutdown via close() method + * + * @tparam T Element type (must be comparable) + * @tparam Compare Comparison function (default: std::less for max-heap) + */ +template < typename T, typename Compare = std::less< T > > + requires std::movable< T > && std::predicate< Compare, T, T > +class MPMCPriorityQueue { +public: + using value_type = T; + using size_type = std::size_t; + using comparator_type = Compare; + + /** + * @brief Status codes returned by pop operations + */ + enum class Status : uint8_t { + Ok, ///< Successfully popped an element + Closed ///< Queue is closed, no more elements available + }; + + /** + * @brief Result of a pop operation + */ + struct PopResult { + Status status; + std::optional< T > value; ///< Has value only if status == Ok + + // Convenience methods + [[nodiscard]] constexpr bool is_ok() const noexcept { return status == Status::Ok; } + [[nodiscard]] constexpr bool is_closed() const noexcept { return status == Status::Closed; } + }; + + /** + * @brief Construct an empty priority queue + */ + constexpr MPMCPriorityQueue() noexcept(std::is_nothrow_default_constructible_v< Compare >) = default; + + /** + * @brief Destructor - automatically closes the queue + */ + ~MPMCPriorityQueue() { close(); } + + // Disable copy and move to prevent issues with condition variables + MPMCPriorityQueue(const MPMCPriorityQueue&) = delete; + MPMCPriorityQueue& operator=(const MPMCPriorityQueue&) = delete; + MPMCPriorityQueue(MPMCPriorityQueue&&) = delete; + MPMCPriorityQueue& operator=(MPMCPriorityQueue&&) = delete; + + /** + * @brief Thread-safe push operation (copy) + * + * @param value Element to insert + * @return true if pushed successfully, false if queue is closed + */ + bool push(const T& value) + requires std::copy_constructible< T > + { + { + std::scoped_lock lock(mutex_); + if (closed_) [[unlikely]] { + return false; // Queue is closed, cannot push + } + pq_.push(value); + } + cv_.notify_one(); // Wake one waiting consumer + return true; + } + + /** + * @brief Thread-safe push operation (move) + * + * @param value Element to insert (will be moved) + * @return true if pushed successfully, false if queue is closed + */ + bool push(T&& value) { + { + std::scoped_lock lock(mutex_); + if (closed_) [[unlikely]] { return false; } + pq_.push(std::move(value)); + } + cv_.notify_one(); + return true; + } + + /** + * @brief 
Thread-safe pop operation + * + * Blocks if queue is empty and not closed. + * Returns immediately if queue is closed. + * + * @return PopResult containing status and optional value + * @note Thread-safe for multiple concurrent consumers + */ + [[nodiscard]] PopResult pop() { + std::unique_lock lock(mutex_); + + // Wait until queue has elements or is closed + cv_.wait(lock, [this] { return closed_ || !pq_.empty(); }); + + // Try to pop an element + if (!pq_.empty()) { + T top = std::move(const_cast< T& >(pq_.top())); + pq_.pop(); + return PopResult{.status = Status::Ok, .value = std::move(top)}; + } + + // Queue is empty and closed + return PopResult{.status = Status::Closed, .value = std::nullopt}; + } + + /** + * @brief Close the queue + * + * After calling close(): + * - All blocked pop() calls will wake up + * - Existing elements can still be popped + * - New push() calls will be ignored + * - pop() returns Status::Closed when queue becomes empty + * + * @note Thread-safe and idempotent + */ + void close() noexcept { + { + std::scoped_lock lock(mutex_); + closed_ = true; + } + cv_.notify_all(); // Wake all waiting consumers + } + + /** + * @brief Get current number of elements + * + * @return Number of elements in the queue + * @note Thread-safe + */ + [[nodiscard]] size_type size() const { + std::scoped_lock lock(mutex_); + return pq_.size(); + } + + /** + * @brief Check if queue is empty + * + * @return true if queue has no elements + * @note Thread-safe + */ + [[nodiscard]] bool empty() const { + std::scoped_lock lock(mutex_); + return pq_.empty(); + } + + /** + * @brief Check if queue is closed + * + * @return true if close() has been called + * @note Thread-safe + */ + [[nodiscard]] bool is_closed() const { + std::scoped_lock lock(mutex_); + return closed_; + } + +private: + mutable std::mutex mutex_; + std::condition_variable cv_; + bool closed_{false}; + std::priority_queue< T, std::vector< T >, Compare > pq_; +}; + +} // namespace homeobject diff --git a/src/lib/homestore_backend/gc_manager.cpp b/src/lib/homestore_backend/gc_manager.cpp index 8076d92f3..83fcaf1df 100644 --- a/src/lib/homestore_backend/gc_manager.cpp +++ b/src/lib/homestore_backend/gc_manager.cpp @@ -25,14 +25,14 @@ SISL_LOGGING_DECL(gcmgr) GCManager::GCManager(HSHomeObject* homeobject) : m_chunk_selector{homeobject->chunk_selector()}, m_hs_home_object{homeobject} { homestore::meta_service().register_handler( - _gc_actor_meta_name, + gc_actor_meta_name, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { on_gc_actor_meta_blk_found(std::move(buf), voidptr_cast(mblk)); }, nullptr, true); homestore::meta_service().register_handler( - _gc_reserved_chunk_meta_name, + gc_reserved_chunk_meta_name, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { on_reserved_chunk_meta_blk_found(std::move(buf), voidptr_cast(mblk)); }, @@ -44,7 +44,7 @@ GCManager::GCManager(HSHomeObject* homeobject) : true); homestore::meta_service().register_handler( - _gc_task_meta_name, + gc_task_meta_name, [this](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { on_gc_task_meta_blk_found(std::move(buf), voidptr_cast(mblk)); }, @@ -64,7 +64,7 @@ void GCManager::on_gc_task_meta_blk_found(sisl::byte_view const& buf, void* meta // here, we are under the protection of the lock of metaservice. however, we will also try to update pg and shard // metablk and then destroy the gc_task_sb, which will also try to acquire the lock of metaservice, as a result, a // dead lock will happen. 
so here we will handle all the gc tasks after read all the metablks - m_recovered_gc_tasks.emplace_back(_gc_task_meta_name); + m_recovered_gc_tasks.emplace_back(gc_task_meta_name); m_recovered_gc_tasks.back().load(buf, meta_cookie); } @@ -89,7 +89,7 @@ void GCManager::handle_all_recovered_gc_tasks() { } void GCManager::on_gc_actor_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie) { - m_gc_actor_sbs.emplace_back(_gc_actor_meta_name); + m_gc_actor_sbs.emplace_back(gc_actor_meta_name); auto& gc_actor_sb = m_gc_actor_sbs.back(); gc_actor_sb.load(buf, meta_cookie); auto pdev_id = gc_actor_sb->pdev_id; @@ -100,7 +100,7 @@ void GCManager::on_gc_actor_meta_blk_found(sisl::byte_view const& buf, void* met } void GCManager::on_reserved_chunk_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie) { - homestore::superblk< gc_reserved_chunk_superblk > reserved_chunk_sb(_gc_reserved_chunk_meta_name); + homestore::superblk< gc_reserved_chunk_superblk > reserved_chunk_sb(gc_reserved_chunk_meta_name); auto chunk_id = reserved_chunk_sb.load(buf, meta_cookie)->chunk_id; auto EXVchunk = m_chunk_selector->get_extend_vchunk(chunk_id); if (EXVchunk == nullptr) { @@ -976,7 +976,7 @@ bool GCManager::pdev_gc_actor::copy_valid_data( if (err) { // we will come here if: - // 1 any blob copy fails, then err is operation_canceled + // 1 any blob copy fails, then err is operation_cancelled // 2 write footer fails, then err is the error code of write footer GCLOGE(task_id, pg_id, shard_id, "Failed to copy some blos or failed to write shard footer for move_to_chunk={}, " @@ -1271,7 +1271,7 @@ void GCManager::pdev_gc_actor::process_gc_task(chunk_id_t move_from_chunk, uint8 // after data copy, we persist the gc task meta blk. now, we can make sure all the valid blobs are successfully // copyed and new blob indexes have be written to gc index table before gc task superblk is persisted. 
- homestore::superblk< GCManager::gc_task_superblk > gc_task_sb{GCManager::_gc_task_meta_name}; + homestore::superblk< GCManager::gc_task_superblk > gc_task_sb{GCManager::gc_task_meta_name}; gc_task_sb.create(sizeof(GCManager::gc_task_superblk)); gc_task_sb->move_from_chunk = move_from_chunk; gc_task_sb->move_to_chunk = move_to_chunk; diff --git a/src/lib/homestore_backend/gc_manager.hpp b/src/lib/homestore_backend/gc_manager.hpp index 7fd2a46be..6a0415023 100644 --- a/src/lib/homestore_backend/gc_manager.hpp +++ b/src/lib/homestore_backend/gc_manager.hpp @@ -46,9 +46,9 @@ class GCManager { GCManager& operator=(GCManager&&) = delete; public: - inline static auto const _gc_actor_meta_name = std::string("GCActor"); - inline static auto const _gc_task_meta_name = std::string("GCTask"); - inline static auto const _gc_reserved_chunk_meta_name = std::string("GCReservedChunk"); + inline static auto const gc_actor_meta_name = std::string("GCActor"); + inline static auto const gc_task_meta_name = std::string("GCTask"); + inline static auto const gc_reserved_chunk_meta_name = std::string("GCReservedChunk"); inline static atomic_uint64_t _gc_task_id{1}; // 0 is used for crash recovery #pragma pack(1) @@ -61,7 +61,7 @@ class GCManager { uint64_t failed_egc_task_count{0ull}; uint64_t total_reclaimed_blk_count_by_gc{0ull}; uint64_t total_reclaimed_blk_count_by_egc{0ull}; - static std::string name() { return _gc_actor_meta_name; } + static std::string name() { return gc_actor_meta_name; } }; struct gc_task_superblk { @@ -70,12 +70,12 @@ class GCManager { chunk_id_t vchunk_id; pg_id_t pg_id; uint8_t priority; - static std::string name() { return _gc_task_meta_name; } + static std::string name() { return gc_task_meta_name; } }; struct gc_reserved_chunk_superblk { chunk_id_t chunk_id; - static std::string name() { return _gc_reserved_chunk_meta_name; } + static std::string name() { return gc_reserved_chunk_meta_name; } }; #pragma pack() diff --git a/src/lib/homestore_backend/heap_chunk_selector.cpp b/src/lib/homestore_backend/heap_chunk_selector.cpp index 1068ebb02..ba04d276c 100644 --- a/src/lib/homestore_backend/heap_chunk_selector.cpp +++ b/src/lib/homestore_backend/heap_chunk_selector.cpp @@ -381,8 +381,6 @@ void HeapChunkSelector::switch_chunks_for_pg(const pg_id_t pg_id, const chunk_nu std::unique_lock lk(pg_chunk_collection->mtx); auto& pg_chunks = pg_chunk_collection->m_pg_chunks; - // LOGDEBUGMOD(homeobject, "gc: before switch chunks for pg_id={}, pg_chunks={}", pg_chunks); - if (sisl_unlikely(pg_chunks[v_chunk_id]->get_chunk_id() == new_chunk_id)) { // this might happens when crash recovery. the crash happens after pg metablk is updated but before gc task // metablk is destroyed. 
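A minimal usage sketch of the MPMCPriorityQueue introduced earlier in this patch (single producer, single consumer, close-based shutdown). The ScrubWorkItem element type, the priorities, and the thread counts here are illustrative only and not part of the patch:

    #include <cstdint>
    #include <thread>
    #include "MPMCPriorityQueue.hpp"

    struct ScrubWorkItem {
        uint8_t priority{0};
        uint64_t shard_id{0};
        // std::less< ScrubWorkItem > gives a max-heap on priority: higher-priority items pop first
        bool operator<(ScrubWorkItem const& other) const { return priority < other.priority; }
    };

    int main() {
        homeobject::MPMCPriorityQueue< ScrubWorkItem > queue;

        // producer: push() returns false once the queue has been closed
        std::thread producer([&queue] {
            for (uint64_t i = 0; i < 10; ++i) {
                queue.push(ScrubWorkItem{.priority = static_cast< uint8_t >(i % 3), .shard_id = i});
            }
        });

        // consumer: pop() blocks until an item is available or the queue is closed and drained
        std::thread consumer([&queue] {
            for (;;) {
                auto res = queue.pop();
                if (res.is_closed()) break; // closed and empty: stop consuming
                // process res.value->shard_id ...
            }
        });

        producer.join();
        queue.close(); // wakes the consumer; items already queued can still be popped
        consumer.join();
        return 0;
    }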
diff --git a/src/lib/homestore_backend/hs_blob_manager.cpp b/src/lib/homestore_backend/hs_blob_manager.cpp index c89c32ec5..1c01c1125 100644 --- a/src/lib/homestore_backend/hs_blob_manager.cpp +++ b/src/lib/homestore_backend/hs_blob_manager.cpp @@ -88,7 +88,7 @@ BlobManager::AsyncResult< blob_id_t > HSHomeObject::_put_blob(ShardInfo const& s return folly::makeUnexpected(BlobErrorCode::SHUTTING_DOWN); } incr_pending_request_num(); - // check user key size + // check user key size if (blob.user_key.size() > BlobHeader::max_user_key_length) { BLOGE(tid, shard.id, 0, "input user key length > max_user_key_length {}", blob.user_key.size(), BlobHeader::max_user_key_length); @@ -167,8 +167,7 @@ BlobManager::AsyncResult< blob_id_t > HSHomeObject::_put_blob(ShardInfo const& s // Set offset of actual data after the blob header and user key (rounded off) req->blob_header()->data_offset = req->blob_header_buf().size(); - RELEASE_ASSERT(req->blob_header()->data_offset == _data_block_size, - "blob header should equals _data_block_size"); + RELEASE_ASSERT(req->blob_header()->data_offset == _data_block_size, "blob header should equals _data_block_size"); // In case blob body is not aligned, create a new aligned buffer and copy the blob body. if (((r_cast< uintptr_t >(blob.body.cbytes()) % io_align) != 0) || ((blob_size % io_align) != 0)) { // If address or size is not aligned, create a separate aligned buffer and do expensive memcpy. @@ -248,6 +247,13 @@ bool HSHomeObject::local_add_blob_info(pg_id_t const pg_id, BlobInfo const& blob } else { BLOGT(tid, blob_info.shard_id, blob_info.blob_id, "blob already exists in index table, skip it."); } + + hs_pg->last_committed_blob_id.store(blob_info.blob_id); + + // local_add_blob_info will also be called if br happens, in this case, last_committed_blob_id will be finally + // updated to the correct value after br is done, so we don't need to worry about the case where + // last_committed_blob_id is updated to a smaller value than the current last_committed_blob_id + return true; } @@ -367,9 +373,7 @@ BlobManager::AsyncResult< Blob > HSHomeObject::_get_blob_data(const shared< home } auto verify_result = do_verify_blob(read_buf.cbytes(), shard_id, 0 /* no blob_id check */); - if (!verify_result.hasValue()) { - return folly::makeUnexpected(verify_result.error()); - } + if (!verify_result.hasValue()) { return folly::makeUnexpected(verify_result.error()); } std::string user_key = std::move(verify_result.value()); BlobHeader const* header = r_cast< BlobHeader const* >(read_buf.cbytes()); diff --git a/src/lib/homestore_backend/hs_homeobject.cpp b/src/lib/homestore_backend/hs_homeobject.cpp index ef84a4c27..b030815cd 100644 --- a/src/lib/homestore_backend/hs_homeobject.cpp +++ b/src/lib/homestore_backend/hs_homeobject.cpp @@ -259,6 +259,14 @@ void HSHomeObject::init_homestore() { } else { LOGI("GC is disabled"); } + + // start scrubber + if (HS_BACKEND_DYNAMIC_CONFIG(enable_scrubber)) { + LOGI("Starting scrub manager"); + scrub_mgr_->start(); + } else { + LOGI("scrub manager is disabled"); + } } void HSHomeObject::on_replica_restart() { @@ -309,7 +317,6 @@ void HSHomeObject::on_replica_restart() { // gc_manager will be created only once here. we need make sure gc manager is created after all the pg meta blk // are replayed since we build pdev chunk heap in the constructor of gc manager , which depends on the pg meta. 
- // gc metablk handlers are registered in the constructor of gc manager gc_mgr_ = std::make_shared< GCManager >(this); @@ -326,7 +333,7 @@ void HSHomeObject::on_replica_restart() { gc_index_table_map.emplace(boost::uuids::to_string(uuid), gc_index_table); // 2 create gc actor superblk for each pdev, which contains the pdev_id and index table uuid. - homestore::superblk< GCManager::gc_actor_superblk > gc_actor_sb{GCManager::_gc_actor_meta_name}; + homestore::superblk< GCManager::gc_actor_superblk > gc_actor_sb{GCManager::gc_actor_meta_name}; gc_actor_sb.create(sizeof(GCManager::gc_actor_superblk)); gc_actor_sb->pdev_id = pdev_id; gc_actor_sb->index_table_uuid = uuid; @@ -340,7 +347,7 @@ void HSHomeObject::on_replica_restart() { for (size_t i = 0; i < reserved_chunk_num_per_pdev; ++i) { auto chunk = chunks[i]; homestore::superblk< GCManager::gc_reserved_chunk_superblk > reserved_chunk_sb{ - GCManager::_gc_reserved_chunk_meta_name}; + GCManager::gc_reserved_chunk_meta_name}; reserved_chunk_sb.create(sizeof(GCManager::gc_reserved_chunk_superblk)); reserved_chunk_sb->chunk_id = chunk; reserved_chunk_sb.write(); @@ -356,9 +363,9 @@ void HSHomeObject::on_replica_restart() { // when initializing, there is not gc task. we need to recover reserved chunks here, so that the reserved chunks // will not be put into pdev heap when built - homestore::meta_service().read_sub_sb(GCManager::_gc_actor_meta_name); - homestore::meta_service().read_sub_sb(GCManager::_gc_reserved_chunk_meta_name); - homestore::meta_service().read_sub_sb(GCManager::_gc_task_meta_name); + homestore::meta_service().read_sub_sb(GCManager::gc_actor_meta_name); + homestore::meta_service().read_sub_sb(GCManager::gc_reserved_chunk_meta_name); + homestore::meta_service().read_sub_sb(GCManager::gc_task_meta_name); // At this point, log replay has not started yet. We must process all recovered GC tasks before replay begins. // After log replay completes, ReplicationStateMachine::on_log_replay_done() calls select_specific_chunk() for @@ -377,6 +384,9 @@ void HSHomeObject::on_replica_restart() { gc_mgr_->handle_all_recovered_gc_tasks(); }); + + // initialize scrub manager + scrub_mgr_ = std::make_shared< ScrubManager >(this); } #if 0 @@ -446,16 +456,20 @@ void HSHomeObject::shutdown() { LOGI("waiting for {} pending requests to complete", pending_reqs); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); }; - LOGI("start stopping GC"); + LOGI("stopping GC"); // we need stop gc before shutting down homestore(where metaservice is shutdown), because gc mgr needs metaservice // to persist gc task metablk if there is any ongoing gc task. after stopping gc manager, there is no gc task // anymore, and thus now new gc task will be written to metaservice during homestore shutdown. 
- gc_mgr_->stop(); + if (gc_mgr_) gc_mgr_->stop(); + + LOGI("stopping scrubbing"); + if (scrub_mgr_) scrub_mgr_->stop(); LOGI("start shutting down HomeStore"); homestore::HomeStore::instance()->shutdown(); homestore::HomeStore::reset_instance(); gc_mgr_.reset(); + scrub_mgr_.reset(); iomanager.stop(); LOGI("complete shutting down HomeStore"); } diff --git a/src/lib/homestore_backend/hs_homeobject.hpp b/src/lib/homestore_backend/hs_homeobject.hpp index d4a1d25f4..3fced7559 100644 --- a/src/lib/homestore_backend/hs_homeobject.hpp +++ b/src/lib/homestore_backend/hs_homeobject.hpp @@ -14,11 +14,17 @@ #include "homeobject/common.hpp" #include "index_kv.hpp" #include "gc_manager.hpp" +#include "scrub_manager.hpp" #include "hs_backend_config.hpp" #include "generated/resync_pg_data_generated.h" #include "generated/resync_shard_data_generated.h" #include "generated/resync_blob_data_generated.h" +// scrubber fbs headers. +#include "generated/scrub_common_generated.h" +#include "generated/scrub_req_generated.h" +#include "generated/scrub_result_generated.h" + namespace homestore { struct meta_blk; class IndexTableBase; @@ -364,7 +370,10 @@ class HSHomeObject : public HomeObjectImpl { shared< homestore::ReplDev > repl_dev_; std::shared_ptr< BlobIndexTable > index_table_; PGMetrics metrics_; + HSHomeObject& home_obj_; mutable pg_state pg_state_{0}; + mutable std::atomic_bool in_scrubbing{false}; + mutable std::atomic_uint64_t last_committed_blob_id{0}; // Snapshot receiver progress info, used as a checkpoint for recovery // Placed within HS_PG since HomeObject is unable to locate the ReplicationStateMachine @@ -372,8 +381,8 @@ class HSHomeObject : public HomeObjectImpl { mutable homestore::superblk< snapshot_rcvr_shard_list_superblk > snp_rcvr_shard_list_sb_; HS_PG(PGInfo info, shared< homestore::ReplDev > rdev, shared< BlobIndexTable > index_table, - std::shared_ptr< const std::vector< homestore::chunk_num_t > > pg_chunk_ids); - HS_PG(homestore::superblk< pg_info_superblk >&& sb, shared< homestore::ReplDev > rdev); + std::shared_ptr< const std::vector< homestore::chunk_num_t > > pg_chunk_ids, HSHomeObject& home_obj); + HS_PG(homestore::superblk< pg_info_superblk >&& sb, shared< homestore::ReplDev > rdev, HSHomeObject& home_obj); ~HS_PG() override = default; static PGInfo pg_info_from_sb(homestore::superblk< pg_info_superblk > const& sb); @@ -396,6 +405,13 @@ class HSHomeObject : public HomeObjectImpl { */ uint32_t get_snp_progress() const; + /** + * Returns the blob_id of the last committed put_blob. + */ + blob_id_t get_last_committed_blob_id() const; + + pg_id_t pg_id() const { return pg_sb_->id; } + /** * Returns all replication info of all peers. */ @@ -416,6 +432,19 @@ class HSHomeObject : public HomeObjectImpl { * Update membership in pg's superblock. */ void update_membership(const MemberSet& members); + + /* + * RPC handlers for scrub: + * 1. on_scrub_req_received: receive the scrub req from leader + * 2. 
on_scrub_result_received: receive the scrub map from followers + */ + void on_scrub_req_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data); + void on_scrub_result_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data); + + /** + * Register data RPC handlers for this PG + */ + void register_data_rpc_handlers(); }; struct HS_Shard : public Shard { @@ -537,6 +566,11 @@ class HSHomeObject : public HomeObjectImpl { inline const static homestore::MultiBlkId tombstone_pbas{0, 0, 0}; inline const static std::string delete_marker_blob_data{"HOMEOBJECT_BLOB_DELETE_MARKER"}; + // ask followers to scrub + inline const static std::string PUSH_SCRUB_REQ{"PUSH_SCRUB_REQ"}; + // return scrub map to leader + inline const static std::string PUSH_SCRUB_RESULT{"PUSH_SCRUB_RESULT"}; + class PGBlobIterator { public: struct blob_read_result { @@ -732,6 +766,7 @@ class HSHomeObject : public HomeObjectImpl { mutable std::shared_mutex snp_sbs_lock_; shared< HeapChunkSelector > chunk_selector_; shared< GCManager > gc_mgr_; + shared< ScrubManager > scrub_mgr_; unique< HttpManager > http_mgr_; static constexpr size_t max_zpad_bufs = _data_block_size / io_align; @@ -986,6 +1021,7 @@ class HSHomeObject : public HomeObjectImpl { cshared< HeapChunkSelector > chunk_selector() const { return chunk_selector_; } cshared< GCManager > gc_manager() const { return gc_mgr_; } + cshared< ScrubManager > scrub_manager() const { return scrub_mgr_; } /** * @brief Reconciles the leaders for all PGs or a specific PG identified by pg_id. @@ -1057,6 +1093,7 @@ class HSHomeObject : public HomeObjectImpl { // Refresh PG statistics (called after log replay) void refresh_pg_statistics(pg_id_t pg_id); + shard_id_t get_last_shard_id_in_pg(pg_id_t pg_id) const; private: BlobManager::Result< std::string > do_verify_blob(const void* blob, shard_id_t expected_shard_id, diff --git a/src/lib/homestore_backend/hs_backend_config.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/hs_backend_config.fbs similarity index 93% rename from src/lib/homestore_backend/hs_backend_config.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/hs_backend_config.fbs index bd6991db9..983d19208 100644 --- a/src/lib/homestore_backend/hs_backend_config.fbs +++ b/src/lib/homestore_backend/hs_homeobject_fbs/hs_backend_config.fbs @@ -23,6 +23,10 @@ table HSBackendSettings { //TODO: make this hotswap after gc is well tested enable_gc: bool = true; + //Enable scrubber + //TODO: make this hotswap after scrubber is well tested + enable_scrubber: bool = false; + //Total reserved chunk num (dedicated for gc/egc) per pdev reserved_chunk_num_per_pdev: uint8 = 6; diff --git a/src/lib/homestore_backend/resync_blob_data.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/resync_blob_data.fbs similarity index 100% rename from src/lib/homestore_backend/resync_blob_data.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/resync_blob_data.fbs diff --git a/src/lib/homestore_backend/resync_pg_data.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/resync_pg_data.fbs similarity index 100% rename from src/lib/homestore_backend/resync_pg_data.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/resync_pg_data.fbs diff --git a/src/lib/homestore_backend/resync_shard_data.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/resync_shard_data.fbs similarity index 100% rename from src/lib/homestore_backend/resync_shard_data.fbs rename to src/lib/homestore_backend/hs_homeobject_fbs/resync_shard_data.fbs diff --git 
a/src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs new file mode 100644 index 000000000..c08fdab0e --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_common.fbs @@ -0,0 +1,25 @@ +native_include "sisl/utility/non_null_ptr.hpp"; + +namespace homeobject; + +enum ScrubStatus : uint8 { + NONE = 0, + IO_ERROR = 1, + MISMATCH = 2, + NOT_FOUND = 3 +} + +enum ScrubType : uint8 { + META = 0, + SHALLOW_BLOB = 1, + DEEP_BLOB = 2, + CHECK_BLOB_EXISTENCE = 3, + CHECK_SHARD_EXISTENCE = 4 +} + +table ScrubResultEntry { + shard_id: uint64; + blob_id: uint64; + scrub_result: ScrubStatus; + hash: uint64; +} \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/scrub_req.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_req.fbs new file mode 100644 index 000000000..23e8d5452 --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_req.fbs @@ -0,0 +1,17 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table ScrubReq { + pg_id: uint16; + req_id: uint64; + scrub_lsn: int64; + start_shard_id: uint64; + start_blob_id: uint64; + end_shard_id: uint64; + end_blob_id: uint64; + issuer_uuid: [ubyte]; + scrub_type: ScrubType; +} + +root_type ScrubReq; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_homeobject_fbs/scrub_result.fbs b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_result.fbs new file mode 100644 index 000000000..f342344cc --- /dev/null +++ b/src/lib/homestore_backend/hs_homeobject_fbs/scrub_result.fbs @@ -0,0 +1,11 @@ +include "scrub_common.fbs"; + +namespace homeobject; + +table ScrubResult { + req_id: uint64; + issuer_uuid: [ubyte]; + scrub_results: [ScrubResultEntry]; +} + +root_type ScrubResult; \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_http_manager.cpp b/src/lib/homestore_backend/hs_http_manager.cpp index b77718bad..8ae007603 100644 --- a/src/lib/homestore_backend/hs_http_manager.cpp +++ b/src/lib/homestore_backend/hs_http_manager.cpp @@ -17,12 +17,29 @@ #include #include #include +#include +#include +#include +#include #include "hs_http_manager.hpp" #include "hs_homeobject.hpp" namespace homeobject { +namespace { +// Helper function to format time as ISO 8601 +std::string format_iso8601_time(const std::chrono::system_clock::time_point& tp) { + auto time_t = std::chrono::system_clock::to_time_t(tp); + std::tm tm; + gmtime_r(&time_t, &tm); // Thread-safe version + char buf[32]; + std::strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%SZ", &tm); + return std::string(buf); +} + +} // anonymous namespace + HttpManager::HttpManager(HSHomeObject& ho) : ho_(ho) { using namespace Pistache; using namespace Pistache::Rest; @@ -74,7 +91,13 @@ HttpManager::HttpManager(HSHomeObject& ho) : ho_(ho) { {Pistache::Http::Method::Post, "/api/v1/trigger_gc", Pistache::Rest::Routes::bind(&HttpManager::trigger_gc, this)}, {Pistache::Http::Method::Get, "/api/v1/gc_job_status", - Pistache::Rest::Routes::bind(&HttpManager::get_gc_job_status, this)}}; + Pistache::Rest::Routes::bind(&HttpManager::get_gc_job_status, this)}, + {Pistache::Http::Method::Post, "/api/v1/trigger_pg_scrub", + Pistache::Rest::Routes::bind(&HttpManager::trigger_pg_scrub, this)}, + {Pistache::Http::Method::Get, "/api/v1/scrub_job_status", + Pistache::Rest::Routes::bind(&HttpManager::get_scrub_job_status, this)}, + {Pistache::Http::Method::Post, "/api/v1/cancel_scrub_job", + Pistache::Rest::Routes::bind(&HttpManager::cancel_scrub_job, this)}}; auto 
http_server = ioenvironment.get_http_server(); if (!http_server) { @@ -486,6 +509,196 @@ void HttpManager::exit_pg(const Pistache::Rest::Request& request, Pistache::Http response.send(Pistache::Http::Code::Ok, "Exit pg request submitted"); } +void HttpManager::trigger_pg_scrub(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) { + auto scrub_mgr = ho_.scrub_manager(); + if (!scrub_mgr) { + response.send(Pistache::Http::Code::Internal_Server_Error, "Scrub manager not available"); + return; + } + + // Get query parameters + const auto pg_id_param = request.query().get("pg_id"); + const auto is_deep_param = request.query().get("deep"); + + // Validate pg_id parameter (required) + if (!pg_id_param || pg_id_param.value().empty()) { + nlohmann::json error; + error["error"] = "Missing required parameter: pg_id"; + error["usage"] = "POST /api/v1/trigger_pg_scrub?pg_id=&deep="; + response.send(Pistache::Http::Code::Bad_Request, error.dump()); + return; + } + + uint16_t pg_id; + try { + auto val = std::stoul(pg_id_param.value()); + if (val > std::numeric_limits< uint16_t >::max()) { + nlohmann::json error; + error["error"] = "pg_id out of range"; + error["pg_id"] = pg_id_param.value(); + response.send(Pistache::Http::Code::Bad_Request, error.dump()); + return; + } + pg_id = static_cast< uint16_t >(val); + } catch (const std::invalid_argument& e) { + nlohmann::json error; + error["error"] = "Invalid pg_id format: not a number"; + error["pg_id"] = pg_id_param.value(); + response.send(Pistache::Http::Code::Bad_Request, error.dump()); + return; + } catch (const std::out_of_range& e) { + nlohmann::json error; + error["error"] = "pg_id out of range"; + error["pg_id"] = pg_id_param.value(); + response.send(Pistache::Http::Code::Bad_Request, error.dump()); + return; + } + + // Parse optional parameters + bool is_deep = false; + if (is_deep_param && !is_deep_param.value().empty()) { + const auto& value = is_deep_param.value(); + is_deep = (value == "true" || value == "1" || value == "yes"); + } + + LOGINFO("Received trigger_pg_scrub request for pg_id={}, deep={}", pg_id, is_deep); + + // Verify PG exists + auto hs_pg = ho_.get_hs_pg(pg_id); + if (!hs_pg) { + nlohmann::json error; + error["error"] = "PG not found"; + error["pg_id"] = pg_id; + response.send(Pistache::Http::Code::Not_Found, error.dump()); + return; + } + + // Generate job ID and create job info + const auto job_id = generate_job_id(); + auto job_info = std::make_shared< ScrubJobInfo >(job_id, pg_id, is_deep); + + { + std::lock_guard< std::shared_mutex > lock(scrub_job_mutex_); + scrub_jobs_map_.set(job_id, job_info); + } + + // Prepare immediate response + nlohmann::json result; + result["job_id"] = job_id; + result["pg_id"] = pg_id; + result["scrub_type"] = is_deep ? 
"deep" : "shallow"; + result["message"] = "Scrub task submitted, query status using /api/v1/scrub_job_status?job_id=" + job_id; + + // Return immediately with HTTP 202 Accepted + response.send(Pistache::Http::Code::Accepted, result.dump()); + + // Submit scrub task (MANUALLY trigger type) - runs asynchronously + scrub_mgr->submit_scrub_task(pg_id, is_deep, SCRUB_TRIGGER_TYPE::MANUALLY) + .via(&folly::InlineExecutor::instance()) + .thenValue([job_info, is_deep](std::shared_ptr< ScrubManager::ShallowScrubReport > report) { + if (!report) { + job_info->try_complete(ScrubJobStatus::FAILED, "Scrub task failed or was cancelled"); + return; + } + + // Build report summary + nlohmann::json report_summary; + report_summary["pg_id"] = report->get_pg_id(); + + // Add missing shards info + const auto& missing_shards = report->get_missing_shard_ids(); + if (!missing_shards.empty()) { + nlohmann::json missing_shards_json; + for (const auto& [shard_id, peer_ids] : missing_shards) { + nlohmann::json peer_list = nlohmann::json::array(); + for (const auto& peer_id : peer_ids) { + peer_list.push_back(boost::uuids::to_string(peer_id)); + } + missing_shards_json[std::to_string(shard_id)] = peer_list; + } + report_summary["missing_shards"] = missing_shards_json; + } + + // Add missing blobs info + const auto& missing_blobs = report->get_missing_blobs(); + if (!missing_blobs.empty()) { + nlohmann::json missing_blobs_json; + for (const auto& [blob_route, peer_ids] : missing_blobs) { + nlohmann::json peer_list = nlohmann::json::array(); + for (const auto& peer_id : peer_ids) { + peer_list.push_back(boost::uuids::to_string(peer_id)); + } + missing_blobs_json[fmt::format("{}", blob_route)] = peer_list; + } + report_summary["missing_blobs"] = missing_blobs_json; + } + + // If it's a deep scrub report, add additional info + if (is_deep) { + auto deep_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(report); + if (deep_report) { + // Add corrupted blobs info + const auto& corrupted_blobs = deep_report->get_corrupted_blobs(); + if (!corrupted_blobs.empty()) { + nlohmann::json corrupted_blobs_json; + for (const auto& [peer_id, blob_map] : corrupted_blobs) { + nlohmann::json blob_status_json; + for (const auto& [blob_route, status] : blob_map) { + blob_status_json[fmt::format("{}", blob_route)] = scrub_result_to_string(status); + } + corrupted_blobs_json[boost::uuids::to_string(peer_id)] = blob_status_json; + } + report_summary["corrupted_blobs"] = corrupted_blobs_json; + } + + // Add inconsistent blobs info + const auto& inconsistent_blobs = deep_report->get_inconsistent_blobs(); + if (!inconsistent_blobs.empty()) { + nlohmann::json inconsistent_blobs_json; + for (const auto& [blob_route, peer_hash_map] : inconsistent_blobs) { + nlohmann::json peer_hash_json; + for (const auto& [peer_id, hash] : peer_hash_map) { + peer_hash_json[boost::uuids::to_string(peer_id)] = fmt::format("{:016x}", hash); + } + inconsistent_blobs_json[fmt::format("{}", blob_route)] = peer_hash_json; + } + report_summary["inconsistent_blobs"] = inconsistent_blobs_json; + } + + // Add corrupted shards info + const auto& corrupted_shards = deep_report->get_corrupted_shards(); + if (!corrupted_shards.empty()) { + nlohmann::json corrupted_shards_json; + for (const auto& [peer_id, shard_map] : corrupted_shards) { + nlohmann::json shard_status_json; + for (const auto& [shard_id, status] : shard_map) { + shard_status_json[std::to_string(shard_id)] = scrub_result_to_string(status); + } + 
corrupted_shards_json[boost::uuids::to_string(peer_id)] = shard_status_json; + } + report_summary["corrupted_shards"] = corrupted_shards_json; + } + + // Add corrupted PG meta info + const auto& corrupted_pg_metas = deep_report->get_corrupted_pg_metas(); + if (!corrupted_pg_metas.empty()) { + nlohmann::json corrupted_pg_metas_json; + for (const auto& [peer_id, status] : corrupted_pg_metas) { + corrupted_pg_metas_json[boost::uuids::to_string(peer_id)] = scrub_result_to_string(status); + } + report_summary["corrupted_pg_metas"] = corrupted_pg_metas_json; + } + } + } + + // Complete the job with success status and report + job_info->try_complete(ScrubJobStatus::COMPLETED, "", report_summary); + }) + .thenError([job_info](const folly::exception_wrapper& ew) { + job_info->try_complete(ScrubJobStatus::FAILED, ew.what().c_str()); + }); +} + void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) { auto gc_mgr = ho_.gc_manager(); if (!gc_mgr) { @@ -651,7 +864,7 @@ void HttpManager::trigger_gc(const Pistache::Rest::Request& request, Pistache::H std::string HttpManager::generate_job_id() { auto counter = job_counter_.fetch_add(1, std::memory_order_relaxed); - return fmt::format("trigger-gc-task-{}", counter); + return fmt::format("job-{}", counter); } void HttpManager::get_job_status(const std::string& job_id, nlohmann::json& result) { @@ -783,6 +996,189 @@ folly::Future< folly::Unit > HttpManager::trigger_gc_for_pg(uint16_t pg_id, cons }); } +void HttpManager::get_scrub_job_status(const Pistache::Rest::Request& request, + Pistache::Http::ResponseWriter response) { + auto job_id_param = request.query().get("job_id"); + + if (job_id_param && !job_id_param.value().empty()) { + // Query specific job + const auto job_id = job_id_param.value(); + LOGINFO("Query scrub job {} status", job_id); + + std::shared_ptr< ScrubJobInfo > job_info; + { + std::shared_lock lock(scrub_job_mutex_); + job_info = scrub_jobs_map_.get(job_id); + } + + if (!job_info) { + nlohmann::json error; + error["error"] = "Job not found"; + error["job_id"] = job_id; + response.send(Pistache::Http::Code::Not_Found, error.dump()); + return; + } + + nlohmann::json result = build_scrub_job_json(job_info); + response.send(Pistache::Http::Code::Ok, result.dump()); + return; + } + + // Query all jobs + LOGINFO("Query all scrub job status"); + nlohmann::json result; + std::vector< std::shared_ptr< ScrubJobInfo > > all_jobs; + + { + std::shared_lock lock(scrub_job_mutex_); + for (const auto& [k, v] : scrub_jobs_map_) { + all_jobs.push_back(v); + } + } + + for (const auto& job_info : all_jobs) { + result["jobs"].push_back(build_scrub_job_json(job_info)); + } + + response.send(Pistache::Http::Code::Ok, result.dump()); +} + +nlohmann::json HttpManager::build_scrub_job_json(const std::shared_ptr< ScrubJobInfo >& job_info) { + nlohmann::json result; + + // Helper to convert status enum to string + auto status_to_string = [](ScrubJobStatus status) -> std::string { + switch (status) { + case ScrubJobStatus::RUNNING: + return "running"; + case ScrubJobStatus::COMPLETED: + return "completed"; + case ScrubJobStatus::FAILED: + return "failed"; + case ScrubJobStatus::CANCELLED: + return "cancelled"; + default: + return "unknown"; + } + }; + + // Thread-unsafe fields (read-only after construction) + result["job_id"] = job_info->job_id; + result["pg_id"] = job_info->pg_id; + result["scrub_type"] = job_info->is_deep ? 
"deep" : "shallow"; + + // Thread-safe fields (protected by mutex) + { + std::lock_guard< std::mutex > lock(job_info->mtx_); + + // Status + result["status"] = status_to_string(job_info->status); + + // Timestamps - convert to ISO 8601 format (no newline) + result["start_time"] = format_iso8601_time(job_info->start_time); + + if (job_info->status != ScrubJobStatus::RUNNING) { + result["end_time"] = format_iso8601_time(job_info->end_time); + + auto duration = + std::chrono::duration_cast< std::chrono::seconds >(job_info->end_time - job_info->start_time); + result["duration_seconds"] = duration.count(); + } + + // Error message (if any) + if (!job_info->error_message.empty()) { result["error_message"] = job_info->error_message; } + + // Report summary (if completed) + if (job_info->status == ScrubJobStatus::COMPLETED && !job_info->report_summary.empty()) { + result["report"] = job_info->report_summary; + } + } + + return result; +} + +void HttpManager::cancel_scrub_job(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) { + auto job_id_param = request.query().get("job_id"); + + if (!job_id_param || job_id_param.value().empty()) { + nlohmann::json error; + error["error"] = "Missing required parameter: job_id"; + error["usage"] = "POST /api/v1/cancel_scrub_job?job_id="; + response.send(Pistache::Http::Code::Bad_Request, error.dump()); + return; + } + + const auto job_id = job_id_param.value(); + LOGINFO("Cancel scrub job {}", job_id); + + std::shared_ptr< ScrubJobInfo > job_info; + { + std::shared_lock lock(scrub_job_mutex_); + job_info = scrub_jobs_map_.get(job_id); + } + + if (!job_info) { + nlohmann::json error; + error["error"] = "Job not found"; + error["job_id"] = job_id; + response.send(Pistache::Http::Code::Not_Found, error.dump()); + return; + } + + // Check if job is still running (thread-safe) + bool can_cancel = false; + std::string current_status_str; + { + std::lock_guard< std::mutex > lock(job_info->mtx_); + can_cancel = (job_info->status == ScrubJobStatus::RUNNING); + if (!can_cancel) { + // Get status string for error message + switch (job_info->status) { + case ScrubJobStatus::COMPLETED: + current_status_str = "completed"; + break; + case ScrubJobStatus::FAILED: + current_status_str = "failed"; + break; + case ScrubJobStatus::CANCELLED: + current_status_str = "cancelled"; + break; + default: + current_status_str = "unknown"; + } + } + } + + if (!can_cancel) { + nlohmann::json result; + result["job_id"] = job_id; + result["message"] = "Job is not running, cannot cancel"; + result["current_status"] = current_status_str; + response.send(Pistache::Http::Code::Bad_Request, result.dump()); + return; + } + + // Cancel the scrub task + auto scrub_mgr = ho_.scrub_manager(); + if (!scrub_mgr) { + nlohmann::json error; + error["error"] = "Scrub manager not available"; + response.send(Pistache::Http::Code::Internal_Server_Error, error.dump()); + return; + } + + // Cancel in scrub manager first (this will stop ongoing work) + scrub_mgr->cancel_scrub_task(job_info->pg_id); + + // Update job status (thread-safe) + job_info->cancel(); + + nlohmann::json result; + result["job_id"] = job_id; + result["message"] = "Scrub job cancelled successfully"; + response.send(Pistache::Http::Code::Ok, result.dump()); +} + #ifdef _PRERELEASE void HttpManager::crash_system(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response) { std::string crash_type; diff --git a/src/lib/homestore_backend/hs_http_manager.hpp 
b/src/lib/homestore_backend/hs_http_manager.hpp index 016c537cc..02a24dcc4 100644 --- a/src/lib/homestore_backend/hs_http_manager.hpp +++ b/src/lib/homestore_backend/hs_http_manager.hpp @@ -50,6 +50,9 @@ class HttpManager { void get_gc_job_status(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response); folly::Future< folly::Unit > trigger_gc_for_pg(uint16_t pg_id, const std::string& job_id); void get_job_status(const std::string& job_id, nlohmann::json& result); + void trigger_pg_scrub(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response); + void get_scrub_job_status(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response); + void cancel_scrub_job(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response); #ifdef _PRERELEASE void crash_system(const Pistache::Rest::Request& request, Pistache::Http::ResponseWriter response); @@ -74,15 +77,66 @@ class HttpManager { job_id(id), status(GCJobStatus::RUNNING), pg_id(pgid), chunk_id(cid) {} }; + enum class ScrubJobStatus { RUNNING, COMPLETED, FAILED, CANCELLED }; + + struct ScrubJobInfo { + std::string job_id; + uint16_t pg_id; + bool is_deep; + + // Mutable fields protected by mutex + mutable std::mutex mtx_; + ScrubJobStatus status; + std::chrono::system_clock::time_point start_time; + std::chrono::system_clock::time_point end_time; + std::string error_message; + nlohmann::json report_summary; + + // Flag to prevent status update after cancellation + std::atomic< bool > is_cancelled{false}; + + ScrubJobInfo(const std::string& id, uint16_t pgid, bool deep) : + job_id(id), + pg_id(pgid), + is_deep(deep), + status(ScrubJobStatus::RUNNING), + start_time(std::chrono::system_clock::now()) {} + + // Thread-safe status update - returns false if already cancelled + bool try_complete(ScrubJobStatus new_status, const std::string& error_msg = "", + const nlohmann::json& summary = nlohmann::json()) { + std::lock_guard< std::mutex > lock(mtx_); + if (is_cancelled.load(std::memory_order_acquire)) { return false; } // Already cancelled, reject update + + status = new_status; + end_time = std::chrono::system_clock::now(); + error_message = error_msg; + if (!summary.empty()) { report_summary = summary; } + return true; + } + + // Thread-safe cancel + void cancel() { + std::lock_guard< std::mutex > lock(mtx_); + is_cancelled.store(true, std::memory_order_release); + status = ScrubJobStatus::CANCELLED; + end_time = std::chrono::system_clock::now(); + error_message = "Cancelled by user"; + } + }; + std::string generate_job_id(); + nlohmann::json build_scrub_job_json(const std::shared_ptr< ScrubJobInfo >& job_info); private: HSHomeObject& ho_; std::atomic< uint64_t > job_counter_{0}; std::shared_mutex gc_job_mutex_; + std::shared_mutex scrub_job_mutex_; // we don`t have an external DB to store the job status, so we only keep the status of the lastest 100 jobs for // query. or, we can evict the job after it is completed after a timeout period. 
folly::EvictingCacheMap< std::string, std::shared_ptr< GCJobInfo > > gc_jobs_map_{100}; + folly::EvictingCacheMap< std::string, std::shared_ptr< ScrubJobInfo > > scrub_jobs_map_{100}; }; } // namespace homeobject \ No newline at end of file diff --git a/src/lib/homestore_backend/hs_pg_manager.cpp b/src/lib/homestore_backend/hs_pg_manager.cpp index 605d5f872..442ec4078 100644 --- a/src/lib/homestore_backend/hs_pg_manager.cpp +++ b/src/lib/homestore_backend/hs_pg_manager.cpp @@ -223,7 +223,7 @@ folly::Expected< HSHomeObject::HS_PG*, PGError > HSHomeObject::local_create_pg(s auto uuid_str = boost::uuids::to_string(index_table->uuid()); repl_dev->set_custom_rdev_name(fmt::format("rdev{}", pg_info.id)); - auto hs_pg = std::make_unique< HS_PG >(std::move(pg_info), std::move(repl_dev), index_table, chunk_ids); + auto hs_pg = std::make_unique< HS_PG >(std::move(pg_info), std::move(repl_dev), index_table, chunk_ids, *this); auto ret = hs_pg.get(); { scoped_lock lck(index_lock_); @@ -236,6 +236,9 @@ folly::Expected< HSHomeObject::HS_PG*, PGError > HSHomeObject::local_create_pg(s // Add to index service, so that it gets cleaned up when index service is shutdown. hs()->index_service().add_index_table(index_table); add_pg_to_map(std::move(hs_pg)); + + // when local_create_pg is called by BR, the pg scrub superblk will not be overwritten if it already exists + scrub_mgr_->add_pg(pg_info.id); } return ret; } @@ -350,7 +353,6 @@ void HSHomeObject::on_pg_start_replace_member(group_id_t group_id, const std::st auto hs_pg = static_cast< HSHomeObject::HS_PG* >(pg.get()); pg->pg_info_.members.emplace(std::move(to_pg_member(member_in))); pg->pg_info_.members.emplace(std::move(to_pg_member(member_out))); - uint32_t i{0}; pg_members* sb_members = hs_pg->pg_sb_->get_pg_members_mutable(); for (auto const& m : pg->pg_info_.members) { @@ -368,6 +370,12 @@ void HSHomeObject::on_pg_start_replace_member(group_id_t group_id, const std::st LOGI("PG start replace member done, task_id={} member_out={} member_in={}, member_nums={}, trace_id={}", task_id, boost::uuids::to_string(member_out.id), boost::uuids::to_string(member_in.id), pg->pg_info_.members.size(), tid); + + // TODO:: change the pg state to indicate it's under member replacement, so that we can reject some + // operations (like scrub) which may conflict with replace member. Currently we just cancel the scrub task if + // it's running + scrub_mgr_->cancel_scrub_task(pg->pg_info_.id); + return; } } @@ -698,7 +706,8 @@ bool HSHomeObject::pg_destroy(pg_id_t pg_id, bool need_to_pause_pg_state_machine // we have the assumption that after pg is marked as destroyed, it will not be marked as alive again. // TODO:: if this assumption is broken, we need to handle it.
- gc_mgr_->drain_pg_pending_gc_task(pg_id); + if (gc_mgr_) gc_mgr_->drain_pg_pending_gc_task(pg_id); + if (scrub_mgr_) scrub_mgr_->remove_pg(pg_id); destroy_shards(pg_id); destroy_hs_resources(pg_id); @@ -815,7 +824,6 @@ void HSHomeObject::destroy_hs_resources(pg_id_t pg_id) { chunk_selector_->reset_ void HSHomeObject::destroy_pg_index_table(pg_id_t pg_id) { std::shared_ptr< BlobIndexTable > index_table; - { // index_table->destroy() will trigger a cp_flush, which will call homeobject#cp_flush and try to acquire // `_pg_lock`, so we need to release the lock here to avoid a dead lock @@ -935,7 +943,7 @@ void HSHomeObject::on_pg_meta_blk_found(sisl::byte_view const& buf, void* meta_c std::vector< chunk_num_t > p_chunk_ids(pg_sb->get_chunk_ids(), pg_sb->get_chunk_ids() + pg_sb->num_chunks); bool set_pg_chunks_res = chunk_selector_->recover_pg_chunks(pg_id, std::move(p_chunk_ids)); auto uuid_str = boost::uuids::to_string(pg_sb->index_table_uuid); - auto hs_pg = std::make_unique< HS_PG >(std::move(pg_sb), std::move(v.value())); + auto hs_pg = std::make_unique< HS_PG >(std::move(pg_sb), std::move(v.value()), *this); if (!set_pg_chunks_res) { hs_pg->pg_state_.set_state(PGStateMask::DISK_DOWN); hs_pg->repl_dev_->set_stage(homestore::repl_dev_stage_t::UNREADY); @@ -971,12 +979,13 @@ PGInfo HSHomeObject::HS_PG::pg_info_from_sb(homestore::superblk< pg_info_superbl } HSHomeObject::HS_PG::HS_PG(PGInfo info, shared< homestore::ReplDev > rdev, shared< BlobIndexTable > index_table, - std::shared_ptr< const std::vector< chunk_num_t > > pg_chunk_ids) : + std::shared_ptr< const std::vector< chunk_num_t > > pg_chunk_ids, HSHomeObject& home_obj) : PG{std::move(info)}, pg_sb_{_pg_meta_name}, repl_dev_{std::move(rdev)}, index_table_{std::move(index_table)}, metrics_{*this}, + home_obj_{home_obj}, snp_rcvr_info_sb_{_snp_rcvr_meta_name}, snp_rcvr_shard_list_sb_{_snp_rcvr_shard_list_meta_name} { RELEASE_ASSERT(pg_chunk_ids != nullptr, "PG chunks null, pg={}", pg_info_.id); @@ -1011,19 +1020,29 @@ HSHomeObject::HS_PG::HS_PG(PGInfo info, shared< homestore::ReplDev > rdev, share pg_sb_chunk_ids[i] = pg_chunk_ids->at(i); } pg_sb_.write(); + + register_data_rpc_handlers(); } -HSHomeObject::HS_PG::HS_PG(superblk< pg_info_superblk >&& sb, shared< ReplDev > rdev) : - PG{pg_info_from_sb(sb)}, pg_sb_{std::move(sb)}, repl_dev_{std::move(rdev)}, metrics_{*this} { +HSHomeObject::HS_PG::HS_PG(superblk< pg_info_superblk >&& sb, shared< ReplDev > rdev, HSHomeObject& home_obj) : + PG{pg_info_from_sb(sb)}, + pg_sb_{std::move(sb)}, + repl_dev_{std::move(rdev)}, + metrics_{*this}, + home_obj_{home_obj} { durable_entities_.blob_sequence_num = pg_sb_->blob_sequence_num; durable_entities_.active_blob_count = pg_sb_->active_blob_count; durable_entities_.tombstone_blob_count = pg_sb_->tombstone_blob_count; durable_entities_.total_occupied_blk_count = pg_sb_->total_occupied_blk_count; durable_entities_.total_reclaimed_blk_count = pg_sb_->total_reclaimed_blk_count; + + register_data_rpc_handlers(); } uint32_t HSHomeObject::HS_PG::total_shards() const { return shards_.size(); } +blob_id_t HSHomeObject::HS_PG::get_last_committed_blob_id() const { return last_committed_blob_id.load(); } + uint32_t HSHomeObject::HS_PG::open_shards() const { return std::count_if(shards_.begin(), shards_.end(), [](auto const& s) { return s->is_open(); }); } @@ -1115,6 +1134,170 @@ void HSHomeObject::HS_PG::update_membership(const MemberSet& members) { LOGI("PG membership updated, member_nums={}", pg_sb_->num_dynamic_members); } +void 
HSHomeObject::HS_PG::register_data_rpc_handlers() {
+    const auto& pg_id = pg_info_.id;
+    bool success;
+
+    success = repl_dev_->add_data_rpc_service(PUSH_SCRUB_REQ, bind_this(HS_PG::on_scrub_req_received, 1));
+    if (success) {
+        LOGI("Successfully registered PUSH_SCRUB_REQ RPC handler for pg={}", pg_id);
+    } else {
+        LOGW("PUSH_SCRUB_REQ RPC handler already registered for pg={}", pg_id);
+    }
+
+    success = repl_dev_->add_data_rpc_service(PUSH_SCRUB_RESULT, bind_this(HS_PG::on_scrub_result_received, 1));
+    if (success) {
+        LOGI("Successfully registered PUSH_SCRUB_RESULT RPC handler for pg={}", pg_id);
+    } else {
+        LOGW("PUSH_SCRUB_RESULT RPC handler already registered for pg={}", pg_id);
+    }
+}
+
+void HSHomeObject::HS_PG::on_scrub_req_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data) {
+    const auto pg_id = pg_info_.id;
+    LOGD("Received scrub req for pg={}", pg_id);
+
+    auto const& incoming_buf = rpc_data->request_blob();
+    const auto buf_size = incoming_buf.size();
+    const auto buf_ptr = incoming_buf.cbytes();
+
+    if (!buf_ptr || !buf_size) {
+        LOGW("scrub req received with empty buffer for pg={}", pg_id);
+        rpc_data->send_response();
+        return;
+    }
+
+    flatbuffers::Verifier verifier(buf_ptr, buf_size);
+    if (!VerifySizePrefixedScrubReqBuffer(verifier)) {
+        LOGW("scrub req received with invalid flatbuffer for pg={}", pg_id);
+        rpc_data->send_response();
+        return;
+    }
+
+    std::shared_ptr< ScrubManager::scrub_req > scrub_req = std::make_shared< ScrubManager::scrub_req >();
+
+    if (!scrub_req->load(buf_ptr, buf_size)) {
+        LOGW("Failed to load scrub req from flatbuffer for pg={}", pg_id);
+        rpc_data->send_response();
+        return;
+    }
+
+    LOGD("Scrub req loaded from flatbuffer for pg={}, scrub_type:{}, issuer_peer_id:{}", pg_id, scrub_req->scrub_type,
+         scrub_req->issuer_peer_id);
+
+    const bool is_check_existence_req = scrub_req->scrub_type == SCRUB_TYPE::CHECK_BLOB_EXISTENCE ||
+        scrub_req->scrub_type == SCRUB_TYPE::CHECK_SHARD_EXISTENCE;
+
+    // handle check existence req
+    if (is_check_existence_req) {
+        const auto& shard_id = scrub_req->start_shard_id;
+
+        LOGD("handle check existence req for pg={}, shard_id={}, blob_id={}, req_type={}", pg_id,
+             scrub_req->start_shard_id, scrub_req->start_blob_id, scrub_req->scrub_type);
+
+        // since checking existence is lightweight, we can handle it immediately without adding it to the scrub
+        // manager, and reply with the result in the rpc response directly.
+        bool exists = false;
+
+        if (scrub_req->scrub_type == SCRUB_TYPE::CHECK_BLOB_EXISTENCE) {
+            const auto blob_id = scrub_req->start_blob_id;
+            BlobRouteKey key{BlobRoute{shard_id, blob_id}};
+            BlobRouteValue value;
+            homestore::BtreeSingleGetRequest get_req{&key, &value};
+            auto ret = index_table_->get(get_req);
+            if (homestore::btree_status_t::success == ret && value.pbas() != HSHomeObject::tombstone_pbas) {
+                exists = true;
+            }
+        } else {
+            // check if shard exists in pg_index_table
+            auto start_key = BlobRouteKey{BlobRoute{shard_id, 0}};
+            auto end_key = BlobRouteKey{BlobRoute{shard_id, UINT64_MAX}};
+
+            homestore::BtreeQueryRequest< BlobRouteKey > qr{
+                homestore::BtreeKeyRange< BlobRouteKey >{start_key, true, end_key, true},
+                homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, 1,
+                [](homestore::BtreeKey const& /*key*/, homestore::BtreeValue const& value) mutable -> bool {
+                    BlobRouteValue blob_value{value};
+                    // we consider the shard to exist only if we can find at least one blob route entry for that
+                    // shard and the blob is not deleted (tombstone)
+                    return blob_value.pbas() != HSHomeObject::tombstone_pbas;
+                }};
+
+            std::vector< std::pair< BlobRouteKey, BlobRouteValue > > out;
+            index_table_->query(qr, out);
+
+            // TODO:: handle the case where query returns an error; currently we just return false to indicate the
+            // blob does not exist, but a separate error code for the failure case would be better.
+            exists = !out.empty();
+        }
+
+        // TODO:: handle the case where index_table returns an error; currently we just report the blob/shard as
+        // not existing, but a separate error code for the failure case would be better.
+
+        auto resp = std::make_shared< bool >(exists);
+        sisl::io_blob_list_t blob_list;
+        blob_list.emplace_back(reinterpret_cast< uint8_t* >(resp.get()), static_cast< uint32_t >(sizeof(bool)), false);
+
+        rpc_data->set_comp_cb([resp](boost::intrusive_ptr< sisl::GenericRpcData >&) {});
+        // we only send a boolean back to indicate whether the blob exists
+        rpc_data->send_response(blob_list);
+
+        return;
+    }
+
+    // handle scrub req
+    rpc_data->send_response();
+    auto scrub_mgr = home_obj_.scrub_manager();
+    if (!scrub_mgr) {
+        LOGW("ScrubManager is not initialized in HS_PG::on_scrub_req_received for pg={}", pg_id);
+        return;
+    }
+    scrub_mgr->add_scrub_req(scrub_req);
+}
+
+void HSHomeObject::HS_PG::on_scrub_result_received(boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data) {
+    const auto pg_id = pg_info_.id;
+    LOGD("Received scrub result for pg={}", pg_id);
+
+    struct rpc_cleanup {
+        boost::intrusive_ptr< sisl::GenericRpcData >& rpc_data_;
+        ~rpc_cleanup() {
+            if (rpc_data_) { rpc_data_->send_response(); }
+        }
+    } rpc_cleanup{rpc_data};
+
+    auto const& incoming_buf = rpc_data->request_blob();
+    const auto buf_size = incoming_buf.size();
+    const auto buf_ptr = incoming_buf.cbytes();
+
+    if (!buf_ptr || !buf_size) {
+        LOGW("scrub result received with empty buffer for pg={}, buffer_size={}", pg_id, buf_size);
+        return;
+    }
+    flatbuffers::Verifier verifier(buf_ptr, buf_size);
+    if (!VerifySizePrefixedScrubResultBuffer(verifier)) {
+        LOGW("scrub result received with invalid flatbuffer for pg={}, buffer_size={}", pg_id, buf_size);
+        return;
+    }
+
+    std::shared_ptr< ScrubManager::scrub_result > scrub_result = std::make_shared< ScrubManager::scrub_result >();
+    if (!scrub_result->load(buf_ptr, buf_size)) {
+        LOGW("Failed to load scrub result from flatbuffer for pg={}", pg_id);
+        return;
+    }
+    LOGD("Scrub result loaded from flatbuffer for pg={}, req_id:{}, issuer_peer_id:{}", pg_id, scrub_result->req_id,
+         scrub_result->issuer_peer_id);
+
+    auto scrub_mgr = home_obj_.scrub_manager();
+    if (!scrub_mgr) {
+        LOGW("ScrubManager is not initialized in HS_PG::on_scrub_result_received for pg={}", pg_id);
+        return;
+    }
+    scrub_mgr->add_scrub_result(pg_id, scrub_result);
+}
+
 // NOTE: caller should hold the _pg_lock
 const HSHomeObject::HS_PG* HSHomeObject::_get_hs_pg_unlocked(pg_id_t pg_id) const {
     auto iter = _pg_map.find(pg_id);
@@ -1254,24 +1437,26 @@ void HSHomeObject::refresh_pg_statistics(pg_id_t pg_id) {
     uint64_t active_count = 0;
     uint64_t tombstone_count = 0;
 
-    auto start_key =
-        BlobRouteKey{BlobRoute{uint64_t(pg_id) << homeobject::shard_width, std::numeric_limits< uint64_t >::min()}};
-    auto end_key =
-        BlobRouteKey{BlobRoute{uint64_t(pg_id + 1) << homeobject::shard_width, std::numeric_limits< uint64_t >::min()}};
+    auto start_key = BlobRouteKey{BlobRoute{uint64_t(pg_id) << homeobject::shard_width, 0}};
+    auto end_key = BlobRouteKey{BlobRoute{uint64_t(pg_id + 1) << homeobject::shard_width, 0}};
+    uint64_t last_blob_id = 0;
 
     homestore::BtreeQueryRequest< BlobRouteKey > query_req{
         homestore::BtreeKeyRange< BlobRouteKey >{std::move(start_key), true /* inclusive */, std::move(end_key),
                                                  false /* inclusive */},
         homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY,
         std::numeric_limits< uint32_t >::max() /* blob count in a pg will not exceed uint32_t_max*/,
-        [&active_count, &tombstone_count](homestore::BtreeKey const& key,
-                                          homestore::BtreeValue const& value) mutable -> bool {
+        [&active_count, &tombstone_count, &last_blob_id](homestore::BtreeKey const& key,
+                                                         homestore::BtreeValue const&
value) mutable -> bool { BlobRouteValue blob_value{value}; if (blob_value.pbas() == HSHomeObject::tombstone_pbas) { tombstone_count++; } else { active_count++; + BlobRouteKey blob_key{key}; + last_blob_id = std::max(last_blob_id, blob_key.key().blob); } + return false; // Continue scanning }}; @@ -1309,6 +1494,8 @@ void HSHomeObject::refresh_pg_statistics(pg_id_t pg_id) { de.total_occupied_blk_count.store(total_occupied, std::memory_order_relaxed); }); + hs_pg->last_committed_blob_id.store(last_blob_id); + LOGI("Refreshed statistics for pg={}: active_blobs={} (original={}), tombstone_blobs={} (original={}), " "occupied_blocks={} (original={})", pg_id, active_count, original_active_count, tombstone_count, original_tombstone_count, total_occupied, @@ -1325,9 +1512,9 @@ void HSHomeObject::update_pg_meta_after_gc(const pg_id_t pg_id, const homestore: auto hs_pg = dynamic_cast< HS_PG* >(iter->second.get()); auto move_from_v_chunk = chunk_selector()->get_extend_vchunk(move_from_chunk); - // TODO:: for now, when updating pchunk for a vchunk, we have to update the whole pg super blk. we can optimize this - // by persist a single superblk for each vchunk in the pg, so that we only need to update the vchunk superblk - // itself. + // TODO:: for now, when updating pchunk for a vchunk, we have to update the whole pg super blk. we can optimize + // this by persist a single superblk for each vchunk in the pg, so that we only need to update the vchunk + // superblk itself. auto pg_chunks = hs_pg->pg_sb_->get_chunk_ids_mutable(); @@ -1339,7 +1526,7 @@ void HSHomeObject::update_pg_meta_after_gc(const pg_id_t pg_id, const homestore: if (sisl_unlikely(pg_chunks[v_chunk_id] == move_to_chunk)) { // this might happens when crash recovery. the crash happens after pg metablk is updated but before gc task // metablk is destroyed. - LOGD("gc task_id={}, the pchunk_id for vchunk={} for pg_id={} is already {}, update pg metablk again!", + LOGD("gc task_id={}, the pchunk_id for vchunk={} for pg_id={} is already {}, skip updating pg metablk!", task_id, v_chunk_id, pg_id, move_to_chunk); } else { RELEASE_ASSERT(pg_chunks[v_chunk_id] == move_from_chunk, @@ -1350,35 +1537,36 @@ void HSHomeObject::update_pg_meta_after_gc(const pg_id_t pg_id, const homestore: LOGD("gc task_id={}, pchunk for vchunk={} of pg_id={} is updated from {} to {}", task_id, v_chunk_id, pg_id, move_from_chunk, move_to_chunk); - // TODO:hs_pg->shards_.size() will be decreased by 1 in delete_shard if gc finds a empty shard, which will be - // implemented later - hs_pg->durable_entities_update([this, move_from_v_chunk, &move_to_chunk, &move_from_chunk, &pg_id, - &task_id](auto& de) { - // active_blob_count is updated by put/delete blob, not change it here. + // TODO:hs_pg->shards_.size() will be decreased by 1 in delete_shard if gc finds a empty shard, which will + // be implemented later + hs_pg->durable_entities_update( + [this, move_from_v_chunk, &move_to_chunk, &move_from_chunk, &pg_id, &task_id](auto& de) { + // active_blob_count is updated by put/delete blob, not change it here. - // considering the complexity of gc crash recovery for tombstone_blob_count, we get it directly from index - // table , which is the most accurate. + // considering the complexity of gc crash recovery for tombstone_blob_count, we get it directly from + // index table , which is the most accurate. - // TODO::do we need this as durable entity? remove it and get all the from pg index in real time. 
- de.tombstone_blob_count = get_pg_tombstone_blob_count(pg_id); + // TODO::do we need this as durable entity? remove it and get all the from pg index in real time. + de.tombstone_blob_count = get_pg_tombstone_blob_count(pg_id); - auto move_to_v_chunk = chunk_selector()->get_extend_vchunk(move_to_chunk); + auto move_to_v_chunk = chunk_selector()->get_extend_vchunk(move_to_chunk); - auto total_occupied_blk_count_by_move_from_chunk = move_from_v_chunk->get_used_blks(); - auto total_occupied_blk_count_by_move_to_chunk = move_to_v_chunk->get_used_blks(); + auto total_occupied_blk_count_by_move_from_chunk = move_from_v_chunk->get_used_blks(); + auto total_occupied_blk_count_by_move_to_chunk = move_to_v_chunk->get_used_blks(); - // TODO::in recovery case , this might be updated again , fix me later. - const auto reclaimed_blk_count = - total_occupied_blk_count_by_move_from_chunk - total_occupied_blk_count_by_move_to_chunk; + // TODO::in recovery case , this might be updated again , fix me later. + const auto reclaimed_blk_count = + total_occupied_blk_count_by_move_from_chunk - total_occupied_blk_count_by_move_to_chunk; - de.total_occupied_blk_count -= reclaimed_blk_count; - de.total_reclaimed_blk_count += reclaimed_blk_count; + de.total_occupied_blk_count -= reclaimed_blk_count; + de.total_reclaimed_blk_count += reclaimed_blk_count; - LOGD("gc task_id={}, move_from_chunk={}, total_occupied_blk_count_by_move_from_chunk={}, move_to_chunk={}, " - "total_occupied_blk_count_by_move_to_chunk={}, total_occupied_blk_count={}", - task_id, move_from_chunk, total_occupied_blk_count_by_move_from_chunk, move_to_chunk, - total_occupied_blk_count_by_move_to_chunk, de.total_occupied_blk_count.load()); - }); + LOGD("gc task_id={}, move_from_chunk={}, total_occupied_blk_count_by_move_from_chunk={}, " + "move_to_chunk={}, " + "total_occupied_blk_count_by_move_to_chunk={}, total_occupied_blk_count={}", + task_id, move_from_chunk, total_occupied_blk_count_by_move_from_chunk, move_to_chunk, + total_occupied_blk_count_by_move_to_chunk, de.total_occupied_blk_count.load()); + }); hs_pg->pg_sb_->total_occupied_blk_count = hs_pg->durable_entities().total_occupied_blk_count.load(std::memory_order_relaxed); diff --git a/src/lib/homestore_backend/hs_shard_manager.cpp b/src/lib/homestore_backend/hs_shard_manager.cpp index 8c949cb3e..9f2bf8ce5 100644 --- a/src/lib/homestore_backend/hs_shard_manager.cpp +++ b/src/lib/homestore_backend/hs_shard_manager.cpp @@ -63,15 +63,15 @@ uint64_t ShardManager::max_shard_size() { return Gi; } uint64_t ShardManager::max_shard_num_in_pg() { return ((uint64_t)0x01) << shard_width; } -shard_id_t HSHomeObject::generate_new_shard_id(pg_id_t pgid) { +shard_id_t HSHomeObject::generate_new_shard_id(pg_id_t pg_id) { std::scoped_lock lock_guard(_pg_lock); - auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pgid)); + auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id)); RELEASE_ASSERT(hs_pg, "Missing pg info"); auto new_sequence_num = ++hs_pg->shard_sequence_num_; RELEASE_ASSERT(new_sequence_num < ShardManager::max_shard_num_in_pg(), "new shard id must be less than ShardManager::max_shard_num_in_pg()"); - return make_new_shard_id(pgid, new_sequence_num); + return make_new_shard_id(pg_id, new_sequence_num); } uint64_t HSHomeObject::get_sequence_num_from_shard_id(uint64_t shard_id) { @@ -704,6 +704,14 @@ void HSHomeObject::write_migrated_shard_metablks() { } } +shard_id_t HSHomeObject::get_last_shard_id_in_pg(pg_id_t pg_id) const { + std::scoped_lock lock_guard(_pg_lock, _shard_lock); + auto 
hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id));
+    RELEASE_ASSERT(hs_pg, "Missing pg info, pg={}", pg_id);
+    auto& shards = hs_pg->shards_;
+    return shards.empty() ? 0 : shards.back()->info.id;
+}
+
 void HSHomeObject::add_new_shard_to_map(std::unique_ptr< HS_Shard > shard) {
     // TODO: We are taking a global lock for all pgs to create shard. Is it really needed??
     // We need to have fine grained per PG lock and take only that.
diff --git a/src/lib/homestore_backend/replication_state_machine.cpp b/src/lib/homestore_backend/replication_state_machine.cpp
index 81ec3e6d0..991352b6b 100644
--- a/src/lib/homestore_backend/replication_state_machine.cpp
+++ b/src/lib/homestore_backend/replication_state_machine.cpp
@@ -293,9 +293,10 @@ void ReplicationStateMachine::on_destroy(const homestore::group_id_t& group_id)
         LOGW("do not have pg mapped by group_id={}", boost::uuids::to_string(group_id));
         return;
     }
-    home_object_->pg_destroy(PG_ID.value());
-    LOGI("replica destroyed, cleared pg={} resources with group_id={}", PG_ID.value(),
-         boost::uuids::to_string(group_id));
+
+    const auto pg_id = PG_ID.value();
+    home_object_->pg_destroy(pg_id);
+    LOGI("replica destroyed, cleared pg={} resources with group_id={}", pg_id, boost::uuids::to_string(group_id));
 }
 
 void ReplicationStateMachine::on_remove_member(const homestore::replica_id_t& member, trace_id_t tid) {
@@ -1049,4 +1050,32 @@ void ReplicationStateMachine::on_log_replay_done(const homestore::group_id_t& gr
     home_object_->refresh_pg_statistics(pg_id);
 }
 
+void ReplicationStateMachine::on_become_leader(const homestore::group_id_t& group_id) {
+    auto pg_id_opt = home_object_->get_pg_id_with_group_id(group_id);
+    if (!pg_id_opt.has_value()) {
+        LOGE("become leader but cannot find any pg for group={}!", group_id);
+        return;
+    }
+    const auto pg_id = pg_id_opt.value();
+    RELEASE_ASSERT(home_object_->pg_exists(pg_id), "pg={} should exist, but not! fatal error!", pg_id);
+    // TODO:: add whatever actions need to be taken.
+}
+
+void ReplicationStateMachine::on_become_follower(const homestore::group_id_t& group_id) {
+    auto pg_id_opt = home_object_->get_pg_id_with_group_id(group_id);
+    if (!pg_id_opt.has_value()) {
+        LOGE("become follower but cannot find any pg for group={}!", group_id);
+        return;
+    }
+    const auto pg_id = pg_id_opt.value();
+    RELEASE_ASSERT(home_object_->pg_exists(pg_id), "pg={} should exist, but not! fatal error!", pg_id);
+
+    LOGI("become follower of group {}, cancel scrub task for pg={}", group_id, pg_id);
+    // TODO:: add whatever actions need to be taken.
+
+    // cancel the scrub task since this node is no longer the leader.
+    auto& scrub_mgr = home_object_->scrub_manager();
+    if (scrub_mgr) scrub_mgr->cancel_scrub_task(pg_id);
+}
+
 } // namespace homeobject
diff --git a/src/lib/homestore_backend/replication_state_machine.hpp b/src/lib/homestore_backend/replication_state_machine.hpp
index 724f091d0..75d2d4187 100644
--- a/src/lib/homestore_backend/replication_state_machine.hpp
+++ b/src/lib/homestore_backend/replication_state_machine.hpp
@@ -240,6 +240,14 @@ class ReplicationStateMachine : public homestore::ReplDevListener {
     ///
     void on_log_replay_done(const homestore::group_id_t& group_id) override;
 
+    /// @brief this is called when this node becomes the leader for the group
+    /// @param group_id - the group for which this node has become the leader
+    virtual void on_become_leader(const homestore::group_id_t& group_id) override;
+
+    /// @brief this is called when this node becomes a follower for the group
+    /// @param group_id - the group for which this node has become a follower
+    virtual void on_become_follower(const homestore::group_id_t& group_id) override;
+
 private:
     HSHomeObject* home_object_{nullptr};
 
diff --git a/src/lib/homestore_backend/scrub_manager.cpp b/src/lib/homestore_backend/scrub_manager.cpp
new file mode 100644
index 000000000..4d5ca9ef4
--- /dev/null
+++ b/src/lib/homestore_backend/scrub_manager.cpp
@@ -0,0 +1,2123 @@
+#include "hs_homeobject.hpp"
+#include
+#include
+#include
+#include
+#include
+
+namespace homeobject {
+
+SISL_LOGGING_DECL(scrubmgr)
+
+#define SCRUBLOG(level, pg_id, task_id, msg, ...)                                                                     \
+    LOG##level##MOD(scrubmgr, "[pg_id={}, task_id={}] " msg, pg_id, task_id, ##__VA_ARGS__)
+
+#define SCRUBLOGD(pg_id, task_id, msg, ...) SCRUBLOG(DEBUG, pg_id, task_id, msg, ##__VA_ARGS__)
+#define SCRUBLOGI(pg_id, task_id, msg, ...) SCRUBLOG(INFO, pg_id, task_id, msg, ##__VA_ARGS__)
+#define SCRUBLOGW(pg_id, task_id, msg, ...) SCRUBLOG(WARN, pg_id, task_id, msg, ##__VA_ARGS__)
+#define SCRUBLOGE(pg_id, task_id, msg, ...) SCRUBLOG(ERROR, pg_id, task_id, msg, ##__VA_ARGS__)
+#define SCRUBLOGC(pg_id, task_id, msg, ...)
SCRUBLOG(CRITICAL, pg_id, task_id, msg, ##__VA_ARGS__) + +class ScrubManager::PGScrubContext { +public: + PGScrubContext(uint64_t task_id, const HSHomeObject::HS_PG* hs_pg); + ~PGScrubContext(); + + bool scrub_meta_batch(std::shared_ptr< ScrubManager::MetaScrubReport > scrub_report, shard_id_t start_shard_id, + shard_id_t end_shard_id, blob_id_t last_blob_id, int64_t scrub_lsn, + std::map< shard_id_t, uint32_t >& shard_blob_count_in_batch); + + bool scrub_blob_batch(std::shared_ptr< ScrubManager::ShallowScrubReport > scrub_report, shard_id_t start_shard_id, + shard_id_t end_shard_id, blob_id_t last_blob_id, int64_t scrub_lsn, bool is_deep_scrub); + + void reconcile_scrub_report(std::shared_ptr< ScrubManager::ShallowScrubReport > scrub_report); + + folly::Future< bool > check_existence_in_peer(peer_id_t peer_id, BlobRoute blob, bool check_blob); + + void add_scrub_result(std::shared_ptr< ScrubManager::scrub_result > result); + uint64_t random_req_id() const; + void send_req_to_peer(const ScrubManager::scrub_req& req, const peer_id_t& peer_id); + void cancel() { + cancelled.store(true); + const auto pg_id = hs_pg->pg_id(); + SCRUBLOGI(pg_id, task_id, "scrub task is cancelled"); + } + + uint64_t task_id{0}; + std::atomic_bool cancelled{false}; + +private: + std::shared_ptr< folly::IOThreadPoolExecutor > m_scrub_executor; + const HSHomeObject::HS_PG* hs_pg; + folly::ConcurrentHashMap< peer_id_t, + std::shared_ptr< folly::MPMCQueue< std::shared_ptr< ScrubManager::scrub_result > > > > + peer_scrub_result_queue_map_; +}; + +ScrubManager::ScrubManager(HSHomeObject* homeobject) : m_hs_home_object{homeobject} { + // Register meta_service handlers to recover pg scrub superblocks + std::vector< homestore::superblk< pg_scrub_superblk > > stale_pg_scrub_sbs; + homestore::meta_service().register_handler( + pg_scrub_meta_name, + [this, &stale_pg_scrub_sbs](homestore::meta_blk* mblk, sisl::byte_view buf, size_t size) { + on_pg_scrub_meta_blk_found(std::move(buf), voidptr_cast(mblk), stale_pg_scrub_sbs); + }, + nullptr, true); + homestore::meta_service().read_sub_sb(pg_scrub_meta_name); + + // remove stale pg scrub superblocks + for (auto& sb : stale_pg_scrub_sbs) + sb.destroy(); +} + +ScrubManager::~ScrubManager() { stop(); } + +void ScrubManager::scan_pg_for_scrub() { + for (auto const& [pg_id, _] : m_pg_scrub_sb_map) { + if (is_eligible_for_deep_scrub(pg_id)) { + LOGINFOMOD(scrubmgr, "pg={} is eligible for deep scrub, submit scrub task", pg_id); + submit_scrub_task(pg_id, true) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, pg_id](std::shared_ptr< ShallowScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "deep scrub failed for pg={}", pg_id); + return; + } + LOGINFOMOD(scrubmgr, "deep scrub is completed for pg={}", pg_id); + auto deep_report = std::dynamic_pointer_cast< DeepScrubReport >(report); + if (!deep_report) { + LOGERRORMOD(scrubmgr, "report for deep scrub cannot be casted to DeepScrubReport for pg={}", + pg_id); + return; + } + handle_deep_pg_scrub_report(std::move(deep_report)); + }); + return; + } + + if (is_eligible_for_shallow_scrub(pg_id)) { + LOGINFOMOD(scrubmgr, "pg={} is eligible for shallow scrub, submit scrub task", pg_id); + submit_scrub_task(pg_id, false) + .via(&folly::InlineExecutor::instance()) + .thenValue([this, pg_id](std::shared_ptr< ShallowScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "shallow scrub failed for pg={}", pg_id); + return; + } + LOGINFOMOD(scrubmgr, "shallow scrub is completed for pg={}", pg_id); + 
handle_shallow_pg_scrub_report(std::move(report)); + }); + return; + } + + LOGDEBUGMOD(scrubmgr, "pg={} is not eligible for any scrubbing", pg_id); + } +} + +void ScrubManager::handle_shallow_pg_scrub_report(std::shared_ptr< ShallowScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "Shallow scrub report is null!"); + return; + } + + report->print(); + // TODO:: add more logic, log event for notification, report to metrics?. +} + +void ScrubManager::handle_deep_pg_scrub_report(std::shared_ptr< DeepScrubReport > report) { + if (!report) { + LOGERRORMOD(scrubmgr, "Deep scrub report is null!"); + return; + } + + report->print(); + // TODO:: add more logic, log event for notification, report to metrics?. +} + +bool ScrubManager::is_eligible_for_deep_scrub(const pg_id_t& pg_id) { + // TODO:: add the real eligibility check logic + return false; +} + +bool ScrubManager::is_eligible_for_shallow_scrub(const pg_id_t& pg_id) { + // TODO:: add the real eligibility check logic + return false; +} + +void ScrubManager::start() { + // 1 set scrub task handling threads. + // TODO :: make thread count configurable, thread number is the most concurrent scrub tasks that can be handled + // concurrently. Too many concurrent scrub tasks may bring too much pressure to the node + const auto most_concurrent_scrub_task_num = 2; + m_scrub_executor = std::make_shared< folly::IOThreadPoolExecutor >(most_concurrent_scrub_task_num); + for (int i = 0; i < most_concurrent_scrub_task_num; ++i) { + m_scrub_executor->add([this]() { + while (true) { + // if no available scrub task, it will be blocked here. + auto pop_result = m_scrub_task_queue.pop(); + if (pop_result.is_closed()) { + LOGINFOMOD(scrubmgr, "scrub task queue is stopped, no need to handle scrub task anymore!"); + break; + } + RELEASE_ASSERT(pop_result.value.has_value() && pop_result.is_ok(), + "pop from scrub task queue should not fail when it is not closed!"); + auto task = std::move(pop_result.value.value()); + // we handle pg scrub task in a single thread , so that we can control the concurrent scrub tasks by + // controlling the thread number of m_scrub_executor. + handle_pg_scrub_task(std::move(task)); + } + }); + } + + // 2 set scrub req handling threads. + const auto most_concurrent_scrub_req_num = 2; + // we don't set priority for req as that of task, only control the concurrency to not bring too much io/cpu pressure + // to this node. 
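start() above drains the scrub task queue with a fixed pool of blocking consumers that exit once the queue is closed. A self-contained sketch of that pop-until-closed pattern using only std:: primitives; ClosableQueue below is illustrative and not the patch's MPMCPriorityQueue:

#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <optional>
#include <queue>
#include <thread>
#include <vector>

// Minimal closable queue: push() fails once closed, pop() blocks until an item
// arrives or the queue is closed and drained (then returns std::nullopt).
template < typename T >
class ClosableQueue {
public:
    bool push(T item) {
        std::lock_guard lk(m_);
        if (closed_) return false;
        q_.push(std::move(item));
        cv_.notify_one();
        return true;
    }
    std::optional< T > pop() {
        std::unique_lock lk(m_);
        cv_.wait(lk, [this] { return closed_ || !q_.empty(); });
        if (q_.empty()) return std::nullopt; // closed and drained
        T item = std::move(q_.front());
        q_.pop();
        return item;
    }
    void close() {
        std::lock_guard lk(m_);
        closed_ = true;
        cv_.notify_all();
    }

private:
    std::mutex m_;
    std::condition_variable cv_;
    std::queue< T > q_;
    bool closed_{false};
};

int main() {
    ClosableQueue< int > tasks;
    std::vector< std::thread > workers;
    for (int w = 0; w < 2; ++w) {
        workers.emplace_back([&tasks, w] {
            // mirrors the worker loop in ScrubManager::start(): block, handle, exit on close
            while (auto task = tasks.pop()) { std::printf("worker %d handled task %d\n", w, *task); }
            std::printf("worker %d: queue closed, exiting\n", w);
        });
    }
    for (int i = 0; i < 6; ++i) tasks.push(i);
    tasks.close();
    for (auto& t : workers) t.join();
}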
+ m_scrub_req_executor = std::make_shared< folly::IOThreadPoolExecutor >(most_concurrent_scrub_req_num); + + iomanager.run_on_wait(iomgr::reactor_regex::random_worker, [&]() { + m_scrub_timer_fiber = iomanager.iofiber_self(); + // TODO: make the interval configurable, for now set it to 60 seconds + m_scrub_timer_hdl = iomanager.schedule_thread_timer(60ull * 1000 * 1000 * 1000, true, nullptr /*cookie*/, + [this](void*) { scan_pg_for_scrub(); }); + }); + LOGINFOMOD(scrubmgr, "scrub manager started!"); +} + +void ScrubManager::stop() { + // shutdown timer — only if it was ever started + if (m_scrub_timer_hdl != iomgr::null_timer_handle) { + RELEASE_ASSERT(m_scrub_timer_fiber, + "m_scrub_timer_hdl is not null_timer_handle, but m_scrub_timer_fiber is null, fatal error!"); + LOGINFOMOD(scrubmgr, "stop scrub scheduler timer"); + iomanager.run_on_wait(m_scrub_timer_fiber, [&]() { + iomanager.cancel_timer(m_scrub_timer_hdl, true); + m_scrub_timer_hdl = iomgr::null_timer_handle; + }); + m_scrub_timer_fiber = nullptr; + } else { + LOGINFOMOD(scrubmgr, "scrub scheduler timer is not running, no need to stop it"); + } + + // cancel all the running scrub tasks and clear the scrub task queue. + // TODO:: add a stopped flag to avoid adding new scrub task if stopped. + if (!m_scrub_task_queue.is_closed()) { m_scrub_task_queue.close(); } + for (auto& [_, pg_scrub_ctx] : m_pg_scrub_ctx_map) { + pg_scrub_ctx->cancel(); + } + + if (m_scrub_executor) { + m_scrub_executor->stop(); + m_scrub_executor.reset(); + } + if (m_scrub_req_executor) { + m_scrub_req_executor->stop(); + m_scrub_req_executor.reset(); + } + LOGINFOMOD(scrubmgr, "scrub manager stopped!"); +} + +void ScrubManager::add_scrub_req(std::shared_ptr< scrub_req > req) { + m_scrub_req_executor->add([this, req = std::move(req)]() { handle_scrub_req(req); }); +} + +void ScrubManager::add_scrub_result(const pg_id_t pg_id, std::shared_ptr< scrub_result > result) { + auto pg_scrub_ctx_it = m_pg_scrub_ctx_map.find(pg_id); + if (pg_scrub_ctx_it == m_pg_scrub_ctx_map.end()) { + LOGERRORMOD(scrubmgr, "cannot find scrub context for pg_id={}, fail to add scrub map!", pg_id); + return; + } + + pg_scrub_ctx_it->second->add_scrub_result(std::move(result)); +} + +void ScrubManager::handle_scrub_req(std::shared_ptr< scrub_req > req) { + if (!req) { + LOGERRORMOD(scrubmgr, "scrub req is null, cannot handle it!"); + return; + } + + const auto& pg_id = req->pg_id; + const auto hs_pg = m_hs_home_object->get_hs_pg(pg_id); + if (!hs_pg) { + LOGERRORMOD(scrubmgr, "cannot find hs_pg for pg {}, fail to handle scrub req!", pg_id); + return; + } + + const auto& pg_repl_dev = hs_pg->repl_dev_; + if (!pg_repl_dev) { + LOGERRORMOD(scrubmgr, "repl_dev is null for pg {}, fail to handle scrub req!", pg_id); + return; + } + + // leader still need to handle the scrub req, as leader also needs to do scrub and send scrub result to itself to + // trigger the logic after receiving scrub result. + + std::shared_ptr< scrub_result > range_result; + auto& remote_peer_id = req->issuer_peer_id; + + // sleep for a while to avoid handling scrub req immediately, which may cause high IOPS to the node. + // for example, handling a deep blob scrub req will take some io resource. we sleep 1s here so that there is a + // interval in the middle of handing two deep blob scrub reqs. + + // TODO:: for different scrub req, we sleep different duration. 
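One possible shape of the per-request-type pause the TODO above mentions; the enum mirrors SCRUB_TYPE but the durations and the helper name are hypothetical, not part of this patch:

#include <chrono>
#include <thread>

// Hypothetical: pick a throttle interval per scrub request type before handling it.
enum class ScrubReqType { META, SHALLOW_BLOB, DEEP_BLOB };

std::chrono::milliseconds throttle_interval(ScrubReqType t) {
    switch (t) {
    case ScrubReqType::DEEP_BLOB: return std::chrono::milliseconds(1000);   // reads data blocks, most expensive
    case ScrubReqType::SHALLOW_BLOB: return std::chrono::milliseconds(200); // index-only scan
    case ScrubReqType::META: return std::chrono::milliseconds(100);         // lightest
    }
    return std::chrono::milliseconds(0);
}

int main() { std::this_thread::sleep_for(throttle_interval(ScrubReqType::DEEP_BLOB)); }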
+ std::this_thread::sleep_for(std::chrono::seconds(1)); + + // 1 do scrub + const auto& scrub_type = req->scrub_type; + switch (scrub_type) { + case SCRUB_TYPE::META: { + LOGDEBUGMOD(scrubmgr, "handling meta scrub req for pg {}", pg_id); + range_result = local_scrub_meta(req); + break; + } + case SCRUB_TYPE::DEEP_BLOB: + case SCRUB_TYPE::SHALLOW_BLOB: { + LOGDEBUGMOD(scrubmgr, "handling blob scrub req for pg {}, scrub_type={}", pg_id, scrub_type); + range_result = local_scrub_blob(req); + break; + } + default: + RELEASE_ASSERT(false, "unknown scrub req type: {}!", scrub_type); + } + + if (!range_result) { + LOGERRORMOD(scrubmgr, "fail to handle scrub req for pg {}, scrub_type={}, drop it!", pg_id, scrub_type); + return; + } + + // 2 send scrub result back to leader + auto flatbuffer = range_result->build_flat_buffer(); + sisl::io_blob_list_t blob_list; + blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false); + // no need to retry, leader will handle retries + pg_repl_dev->data_request_unidirectional(remote_peer_id, HSHomeObject::PUSH_SCRUB_RESULT, blob_list) + .via(&folly::InlineExecutor::instance()) + .thenValue([pg_id, remote_peer_id, flatbuffer = std::move(flatbuffer), scrub_type](auto&& response) { + if (response.hasError()) { + LOGERRORMOD(scrubmgr, "failed to send scrub result to peer {} in pg {}, scrub_type:{}, error={}", + remote_peer_id, pg_id, scrub_type, response.error()); + return; + } + + LOGDEBUGMOD(scrubmgr, "successfully sent scrub map to peer {} in pg {}, scrub_type:{}", remote_peer_id, + pg_id, scrub_type); + }); +} + +bool ScrubManager::wait_for_scrub_lsn_commit(shared< homestore::ReplDev > repl_dev, int64_t scrub_lsn) { + if (!repl_dev) { + LOGERRORMOD(scrubmgr, "repl_dev is null, cannot wait for scrub lsn commit!"); + return false; + } + + // TODO:: make this configurable + const auto wait_retry_times = 5; + for (auto i = 0; i < wait_retry_times; ++i) { + auto commit_lsn = repl_dev->get_last_commit_lsn(); + if (commit_lsn >= scrub_lsn) { + LOGINFOMOD(scrubmgr, "commit lsn {} is greater than or equal to scrub lsn {}, wait successfully", + commit_lsn, scrub_lsn); + return true; + } + LOGDEBUGMOD(scrubmgr, + "commit lsn {} is less than scrub lsn {}, wait for 1 second before retrying, retry times {}/{}", + commit_lsn, scrub_lsn, i + 1, wait_retry_times); + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + + return false; +} + +uint64_t ScrubManager::compute_crc64(const void* data, size_t len, uint64_t crc) const { + static constexpr uint64_t kCrc64Poly = 0x42F0E1EBA9EA3693ULL; + static constexpr auto kCrc64Table = []() { + std::array< uint64_t, 256 > t{}; + for (int i = 0; i < 256; ++i) { + uint64_t c = static_cast< uint64_t >(i) << 56; + for (int b = 0; b < 8; ++b) { + c = (c & 0x8000000000000000ULL) ? 
((c << 1) ^ kCrc64Poly) : (c << 1); + } + t[i] = c; + } + return t; + }(); + + const uint8_t* p = static_cast< const uint8_t* >(data); + while (len--) { + uint8_t idx = static_cast< uint8_t >((crc >> 56) ^ *p++); + crc = kCrc64Table[idx] ^ (crc << 8); + } + return crc; +} + +std::shared_ptr< ScrubManager::scrub_result > ScrubManager::local_scrub_blob(std::shared_ptr< scrub_req > req) { + if (!req) { + LOGERRORMOD(scrubmgr, "blob scrub req is null, cannot handle it!"); + return nullptr; + } + + const auto& req_id = req->req_id; + const auto& scrub_lsn = req->scrub_lsn; + const auto& pg_id = req->pg_id; + const auto& scrub_type = req->scrub_type; + + if (scrub_type != SCRUB_TYPE::DEEP_BLOB && scrub_type != SCRUB_TYPE::SHALLOW_BLOB) { + LOGERRORMOD(scrubmgr, + "invalid scrub req type for local_scrub_blob, pg_id={}, req_id={}, scrub_type={}, scrub_lsn={}", + pg_id, req_id, scrub_type, scrub_lsn); + return nullptr; + } + + LOGDEBUGMOD(scrubmgr, "handling blob scrub req for pg {}, req_id={}, scrub_lsn={}, scrub_type={}", pg_id, req_id, + scrub_lsn, scrub_type); + + auto hs_pg = m_hs_home_object->get_hs_pg(pg_id); + if (!hs_pg) { + LOGERRORMOD(scrubmgr, "req_id={} cannot find hs_pg for pg={}, fail to do deep blob scrub!", req_id, pg_id); + return nullptr; + } + + if (!wait_for_scrub_lsn_commit(hs_pg->repl_dev_, scrub_lsn)) { + LOGERRORMOD(scrubmgr, + "pg_id={}, req_id={}, commit lsn is not advanced to scrub lsn {} after waiting for a while, fail " + "to do {} blob scrub", + pg_id, req_id, scrub_lsn, scrub_type == SCRUB_TYPE::DEEP_BLOB ? "deep" : "shallow"); + return nullptr; + } + + if (req->start_shard_id > req->end_shard_id) { + LOGERRORMOD( + scrubmgr, + "received incorrect blob scrub req, start_shard_id={}, end_shard_id={}, start_blob_id={}, end_blob_id={}", + req->start_shard_id, req->end_shard_id, req->start_blob_id, req->end_blob_id); + return nullptr; + } + + // refer to docs/adr/scrub-blob-range-coverage.md + // TODO:: make this configurable. + uint32_t batch_capacity = static_cast< uint32_t >( + scrub_type == SCRUB_TYPE::SHALLOW_BLOB ? max_scrub_batch_size : deep_blob_scrub_batch_size); + + const auto start = BlobRouteKey{BlobRoute{req->start_shard_id, req->start_blob_id}}; + const auto end = BlobRouteKey{BlobRoute{req->end_shard_id, req->end_blob_id}}; + homestore::BtreeQueryRequest< BlobRouteKey > query_req{ + homestore::BtreeKeyRange< BlobRouteKey >{start, true /* inclusive */, end, true /* inclusive */}, + homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, batch_capacity, + [last_blob_id = req->end_blob_id](homestore::BtreeKey const& key, homestore::BtreeValue const& value) -> bool { + BlobRouteValue blob_value{value}; + BlobRouteKey blob_key{key}; + + return blob_value.pbas() != HSHomeObject::tombstone_pbas && blob_key.key().blob <= last_blob_id; + }}; + + std::vector< std::pair< BlobRouteKey, BlobRouteValue > > out; + auto const status = hs_pg->index_table_->query(query_req, out); + + // if there are more blobs to be scrubbed, we will handle them in the next scrub req, so we don't consider has_more + // as an error here. 
+ if (status != homestore::btree_status_t::success && status != homestore::btree_status_t::has_more) { + LOGERRORMOD( + scrubmgr, + "pg_id={}, req_id={}, scrub_type={}, scrub_lsn={}, Failed to query blobs in index table for status={}", + pg_id, req_id, scrub_type, scrub_lsn, status); + return nullptr; + } + + auto blob_scrub_result = std::make_shared< ScrubManager::scrub_result >(req_id, m_hs_home_object->our_uuid()); + + if (scrub_type == SCRUB_TYPE::SHALLOW_BLOB) { + // for shallow blob scrubbing, we only check the existence of blobs, no io will be issued to hard drive. + for (const auto& [k, _] : out) { + blob_scrub_result->add_entry({k.key().shard, k.key().blob, ScrubStatus::NONE}); + } + LOGDEBUGMOD(scrubmgr, + "pg_id={}, req_id={}, scrub_lsn={}, shallow blob scrub completed, return {} blobs in range [{},{})", + pg_id, req_id, scrub_lsn, blob_scrub_result->entries.size(), start, end); + return blob_scrub_result; + } + + // Sort blobs by PBA (physical block address) for sequential disk access, this is a best effort, not guaranteed, + // since client io will move the disk pointer and break the sequence of io. + std::sort(out.begin(), out.end(), [](const auto& a, const auto& b) { + // Compare by PBA single blkid for ordering + const auto pba_a = a.second.pbas().to_single_blkid(); + const auto pba_b = b.second.pbas().to_single_blkid(); + return pba_a.blk_num() < pba_b.blk_num(); + }); + + // deep scrub: read and check blobs. + auto& data_service = homestore::data_service(); + const auto blk_size = data_service.get_blk_size(); + std::vector< folly::Future< folly::Unit > > futs; + + for (const auto& [k, v] : out) { + auto pba = v.pbas(); + auto total_size = pba.blk_count() * blk_size; + sisl::sg_list data_sgs; + data_sgs.size = total_size; + data_sgs.iovs.emplace_back( + iovec{.iov_base = iomanager.iobuf_alloc(blk_size, total_size), .iov_len = total_size}); + + const auto& shard_id = k.key().shard; + const auto& blob_id = k.key().blob; + + futs.emplace_back(std::move( + data_service.async_read(pba, data_sgs, total_size) + .thenValue([this, shard_id, blob_id, data_sgs = std::move(data_sgs), blob_scrub_result](auto&& err) { + auto blob = data_sgs.iovs[0].iov_base; + struct buffer_free_guard { + uint8_t* buf; + ~buffer_free_guard() { iomanager.iobuf_free(buf); } + } guard{reinterpret_cast< uint8_t* >(blob)}; + + ScrubManager::scrub_result_entry entry{shard_id, blob_id, ScrubStatus::NONE}; + + if (err) { + LOGERRORMOD(scrubmgr, "Failed to read blob for deep scrub, shard_id={}, blob_id={}, error={}", + shard_id, blob_id, err.message()); + entry.status_or_hash = ScrubStatus::IO_ERROR; + } else { + const auto blob_verify_succeed = m_hs_home_object->verify_blob(blob, shard_id, blob_id, true); + if (!blob_verify_succeed) { + // note that, if gc kicks in, the pba might be overwritten and lead to verification + // failure. + + // FIXME:: handle this case by query and read the blob again. + LOGERRORMOD(scrubmgr, "Blob verification failed for deep scrub, shard_id={}, blob_id={}", + shard_id, blob_id); + entry.status_or_hash = ScrubStatus::MISMATCH; + } else { + // we only calculate crc64 for data part. 
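compute_crc64() above is a plain MSB-first, table-driven CRC-64 with the ECMA-182 polynomial, zero initial value and no final xor. A standalone sketch of the same routine; with the conventional "123456789" test input it should print 6C40DF5F0B497347, the published check value for CRC-64/ECMA-182:

#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Same algorithm as compute_crc64(): MSB-first table-driven CRC-64, ECMA-182 polynomial.
static uint64_t crc64(const void* data, size_t len, uint64_t crc = 0) {
    static constexpr uint64_t kPoly = 0x42F0E1EBA9EA3693ULL;
    static const auto kTable = [] {
        std::array< uint64_t, 256 > t{};
        for (int i = 0; i < 256; ++i) {
            uint64_t c = static_cast< uint64_t >(i) << 56;
            for (int b = 0; b < 8; ++b) { c = (c & 0x8000000000000000ULL) ? ((c << 1) ^ kPoly) : (c << 1); }
            t[i] = c;
        }
        return t;
    }();
    const auto* p = static_cast< const uint8_t* >(data);
    while (len--) { crc = kTable[static_cast< uint8_t >((crc >> 56) ^ *p++)] ^ (crc << 8); }
    return crc;
}

int main() {
    const char* msg = "123456789";
    std::printf("%016llX\n", static_cast< unsigned long long >(crc64(msg, std::strlen(msg))));
}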
+ const auto* header = reinterpret_cast< const HSHomeObject::BlobHeader* >(blob); + const auto* blob_data = reinterpret_cast< const uint8_t* >(blob) + header->data_offset; + entry.status_or_hash = compute_crc64(blob_data, header->blob_size); + } + } + + LOGDEBUGMOD(scrubmgr, "add entry to blob scrub result: shard_id={}, blob_id={}", entry.shard_id, + entry.blob_id); + blob_scrub_result->add_entry(entry); + }))); + } + + folly::collectAllUnsafe(futs).wait(); + + LOGDEBUGMOD(scrubmgr, "pg_id={}, req_id={}, deep blob scrub completed, found {} blobs in range [{},{}] to [{},{})", + pg_id, req_id, out.size(), req->start_shard_id, req->start_blob_id, req->end_shard_id, + req->end_blob_id); + + return blob_scrub_result; +} + +std::shared_ptr< ScrubManager::scrub_result > ScrubManager::local_scrub_meta(std::shared_ptr< scrub_req > req) { + if (!req) { + LOGERRORMOD(scrubmgr, "meta scrub req is null, cannot handle it!"); + return nullptr; + } + + const auto& req_id = req->req_id; + const auto& scrub_lsn = req->scrub_lsn; + const auto& pg_id = req->pg_id; + const auto& end_shard_id = req->end_shard_id; + const auto& start_shard_id = req->start_shard_id; + + if (req->scrub_type != SCRUB_TYPE::META) { + LOGERRORMOD(scrubmgr, + "invalid scrub req type for local_scrub_meta, pg_id={}, req_id={}, scrub_type={}, scrub_lsn={}", + pg_id, req_id, req->scrub_type, scrub_lsn); + return nullptr; + } + + LOGDEBUGMOD(scrubmgr, "handling meta scrub req for pg {}, req_id={}, scrub_lsn={}", pg_id, req_id, scrub_lsn); + + auto hs_pg = m_hs_home_object->get_hs_pg(pg_id); + if (!hs_pg) { + LOGERRORMOD(scrubmgr, "cannot find hs_pg for pg={}, fail to scrub meta!", pg_id); + return nullptr; + } + + if (start_shard_id > end_shard_id) { + LOGERRORMOD(scrubmgr, "received incorrect meta scrub req, start_shard_id={} > end_shard_id={}", start_shard_id, + end_shard_id); + return nullptr; + } + + if (!wait_for_scrub_lsn_commit(hs_pg->repl_dev_, scrub_lsn)) { + LOGERRORMOD( + scrubmgr, + "commit lsn is not advanced to scrub lsn {} after waiting for a while, fail to do local shard scrub, pg={}", + scrub_lsn, pg_id); + return nullptr; + } + + RELEASE_ASSERT(0 == req->start_blob_id, "for meta scrub, start_blob_id should be 0, pg_id={}, req_id={}", pg_id, + req_id); + const auto& end_blob_id = req->end_blob_id; + + LOGDEBUGMOD(scrubmgr, + "received meta scrub req for pg {}, req_id={}, scrub_lsn={}, start_shard_id={}, end_shard_id={}, " + "end_blob_id={}", + pg_id, req_id, scrub_lsn, start_shard_id, end_shard_id, end_blob_id); + + auto meta_scrub_result = std::make_shared< ScrubManager::scrub_result >(req_id, m_hs_home_object->our_uuid()); + + // we don't have a specific method to directly read a specific pg/shard meta_blk. the only way we can read metablk + // for now is registering a handler and then call meta_service read_sub_sb. we just skip this step for now + // TODO:: + // 1 to add a new method to directly read a specific meta_blk in meta_service. or we do this by registering handler + // and scan!! + // 2 add the ScrubResult for metablk to scrub_result_entry + // 3 calculate the hash of metablk. + + // for empty shard, without deleting shard(to be done), we can find the shard meta blk and shard meta data in + // memory, but cannot find any blob of this shard in pg_index_table. we consider the pg_index_table, not the + // shard_meta_blk, as the source of truth for meta scrub, so we will scan the index table to do meta scrub and don't + // care about empty shard, since no valid data in it. 
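The `start_shard_id & homeobject::shard_mask` test below leans on shard ids packing the pg id into the high bits and a per-PG sequence number (starting at 1) into the low shard_width bits, so a zero sequence can stand in for the PG itself. A small sketch of that packing; the 48-bit width here is illustrative, the real constants live in the HomeObject headers:

#include <cstdint>
#include <cstdio>

// Illustrative only: the real shard_width/shard_mask come from the homeobject headers.
constexpr uint64_t kShardWidth = 48;
constexpr uint64_t kShardMask = (uint64_t{1} << kShardWidth) - 1;

constexpr uint64_t make_shard_id(uint16_t pg_id, uint64_t seq) { return (uint64_t{pg_id} << kShardWidth) | seq; }

int main() {
    const auto shard = make_shard_id(7, 3);
    std::printf("pg=%llu seq=%llu is_pg_meta_sentinel=%d\n",
                static_cast< unsigned long long >(shard >> kShardWidth),
                static_cast< unsigned long long >(shard & kShardMask), (shard & kShardMask) == 0);
}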
+ + // scrub pg meta if start_shard_id is 0. since shard_id starts from 1, we use shard_id 0 to represent pg meta for + // convenience. + if (0 == (start_shard_id & homeobject::shard_mask)) { + LOGDEBUGMOD(scrubmgr, "scrubbing pg meta of pg={}", pg_id); + // blob_id here means the shard count of this pg. Since this is useless ATM, we just do it like this. + + // TODO:: do real pg meta blk scrub and calculate the hash for pg meta blk, for now we just skip this step and + // set its hash to 0. + meta_scrub_result->add_entry({0, hs_pg->total_shards() /*pg shard count*/, uint64_t(0) /*pg metablk hash*/}); + } + + // FIXME:: after we have shard sealed lsn, we only scrub those shards whose sealed_lsn is less than or equal to + // scrub_lsn. + auto start_key = BlobRouteKey{BlobRoute{std::max(uint64_t{1}, start_shard_id), 0}}; + auto end_key = BlobRouteKey{BlobRoute{end_shard_id, end_blob_id}}; + + ScrubManager::scrub_result_entry entry{0, 0, ScrubStatus::NONE}; + // max_scrub_batch_size here means how many shards we want to scrub in one batch, referring to + // docs/adr/scrub-blob-range-coverage.md + + // TODO: make this configurable + uint32_t batch_capacity = max_scrub_batch_size; + + homestore::BtreeQueryRequest< BlobRouteKey > qr{ + homestore::BtreeKeyRange< BlobRouteKey >{start_key, true, end_key, true}, + homestore::BtreeQueryType::SWEEP_NON_INTRUSIVE_PAGINATION_QUERY, std::numeric_limits< uint32_t >::max(), + [&entry, &batch_capacity, end_blob_id, meta_scrub_result](homestore::BtreeKey const& key, + homestore::BtreeValue const& value) mutable -> bool { + if (batch_capacity) { + BlobRouteKey blob_key{key}; + BlobRouteValue blob_value{value}; + + const auto shard_id = blob_key.key().shard; + if (shard_id != entry.shard_id) { + // coming to a new shard + if (entry.shard_id) { + // TODO:: do real shard meta blk scrub and calculate the hash for shard meta blk, for now we + // just set its hash to 0. + entry.status_or_hash = uint64_t(0); + meta_scrub_result->add_entry(entry); + if (--batch_capacity == 0) { + // so that it will not be added to meta_scrub_result again outside of the query loop. + entry.shard_id = 0; + return false; // Continue scanning + } + } + // reset entry for the new shard + entry.shard_id = shard_id; + entry.blob_id = 0; + } + + // there might be some deletion happens when we do meta scrub and lead to the inconsistency of the blob + // count of a specific shard among different replicas. this does not matter, since we will use the max + // blobs count of this shard in different replicas as the actual blob count of this shard, referring to + // docs/adr/scrub-blob-range-coverage.md + const auto blob_id = blob_key.key().blob; + if (blob_value.pbas() != HSHomeObject::tombstone_pbas && blob_id <= end_blob_id) { entry.blob_id++; } + } + + return false; // Continue scanning + }}; + + std::vector< std::pair< BlobRouteKey, BlobRouteValue > > out; + auto const ret = hs_pg->index_table_->query(qr, out); + if (ret != homestore::btree_status_t::success && ret != homestore::btree_status_t::has_more) { + LOGERRORMOD(scrubmgr, "[pg={}] failed to query index table, error={}", pg_id, ret); + return nullptr; + } + + // if we the last scrubbed shard in this batch is not 0, we add the scrub_result_entry for it here since it cannot + // be added in the query loop. 
+ if (entry.shard_id) { + LOGDEBUGMOD(scrubmgr, "add last entry, shard_id={}, blob_id={}", entry.shard_id, entry.blob_id); + meta_scrub_result->add_entry(entry); + } + + LOGDEBUGMOD(scrubmgr, "meta scrub completed, checked {} shards in range [{},{}) to [{}, {}] in pg={}", + meta_scrub_result->entries.size(), start_shard_id, 0, end_shard_id, end_blob_id, pg_id); + + return meta_scrub_result; +} + +folly::SemiFuture< std::shared_ptr< ScrubManager::ShallowScrubReport > > +ScrubManager::submit_scrub_task(const pg_id_t& pg_id, const bool is_deep, SCRUB_TRIGGER_TYPE trigger_type) { + LOGINFOMOD(scrubmgr, "submit a scrub task for pg={}, deep_scrub={}, trigger_type={}", pg_id, is_deep, trigger_type); + + // Check if a scrub task is already running for this PG. + // Note: There's still a small race window between this check and task execution in handle_pg_scrub_task, + // but the in_scrubbing CAS below provides the final guard. This check prevents unnecessary work. + auto it = m_pg_scrub_ctx_map.find(pg_id); + if (it != m_pg_scrub_ctx_map.end()) { + LOGWARNMOD(scrubmgr, "a scrub task is already running for pg={}, no need to submit another one!", pg_id); + return folly::makeSemiFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr)); + } + + const auto ps_scrub_super_blk_it = m_pg_scrub_sb_map.find(pg_id); + if (ps_scrub_super_blk_it == m_pg_scrub_sb_map.end()) { + LOGERRORMOD(scrubmgr, "cannot find scrub superblk for pg={}, fail to submit scrub task!", pg_id); + return folly::makeSemiFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr)); + } + + // Get the PG and check its state + const auto hs_pg = m_hs_home_object->get_hs_pg(pg_id); + if (!hs_pg) { + LOGERRORMOD(scrubmgr, "cannot find hs_pg for pg={}, fail to submit scrub task!", pg_id); + return folly::makeSemiFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr)); + } + + // Check if pg_state is HEALTHY (state must be 0) + const auto current_state = hs_pg->pg_state_.get(); + if (current_state != 0) { + LOGWARNMOD(scrubmgr, "pg={} is not in HEALTHY state (current_state={}), cannot submit scrub task!", pg_id, + current_state); + return folly::makeSemiFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr)); + } + + // TODO:: use PGStateMask::SCRUBBING state to replace the in_scrubbing flag after cm supports + // PGStateMask::SCRUBBING. in_scrubbing here is used to indicate whether there is a scrub task pending/running for + // this pg. + bool expected = false; + if (!hs_pg->in_scrubbing.compare_exchange_strong(expected, true)) { + LOGWARNMOD(scrubmgr, "pg={} scrub submission already in-flight, skip!", pg_id); + return folly::makeSemiFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr)); + } + + const auto& pg_scrub_sb = *(ps_scrub_super_blk_it->second); + const auto last_scrub_time = + is_deep ? pg_scrub_sb->last_deep_scrub_timestamp : pg_scrub_sb->last_shallow_scrub_timestamp; + + auto [promise, future] = folly::makePromiseContract< std::shared_ptr< ShallowScrubReport > >(); + ScrubManager::scrub_task task(last_scrub_time, pg_id, is_deep, trigger_type, std::move(promise)); + if (!m_scrub_task_queue.push(std::move(task))) { + // Queue is closed (scrub manager is stopped); roll back in_scrubbing so future submissions are not blocked. 
+ hs_pg->in_scrubbing.store(false); + LOGWARNMOD(scrubmgr, "pg={} scrub task queue is closed/stopped, skip!", pg_id); + return folly::makeSemiFuture(std::shared_ptr< ScrubManager::ShallowScrubReport >(nullptr)); + } + return std::move(future); +} + +void ScrubManager::cancel_scrub_task(const pg_id_t& pg_id) { + auto it = m_pg_scrub_ctx_map.find(pg_id); + if (it == m_pg_scrub_ctx_map.end()) { + LOGWARNMOD(scrubmgr, "no running scrub task for pg={}, no need to cancel!", pg_id); + return; + } + it->second->cancel(); + LOGINFOMOD(scrubmgr, "cancel scrub task for pg={}", pg_id); +} + +void ScrubManager::handle_pg_scrub_task(scrub_task task) { + const auto& pg_id = task.pg_id; + const auto& task_id = task.task_id; + const auto& is_deep_scrub = task.is_deep_scrub; + + SCRUBLOGD(pg_id, task_id, + "Starting handling {} scrub task, last_scrub_time={} =====", is_deep_scrub ? "deep" : "shallow", + task.last_scrub_time); + + std::shared_ptr< ShallowScrubReport > pg_scrub_report = + is_deep_scrub ? std::make_shared< DeepScrubReport >(pg_id) : std::make_shared< ShallowScrubReport >(pg_id); + + struct scrub_task_guard { + HSHomeObject* home_obj; + folly::ConcurrentHashMap< pg_id_t, std::shared_ptr< PGScrubContext > >& pg_scrub_ctx_map; + scrub_task& task; + std::shared_ptr< ShallowScrubReport >& scrub_report; + const pg_id_t& pg_id; + + ~scrub_task_guard() { + pg_scrub_ctx_map.erase(pg_id); + task.scrub_report_promise->setValue(scrub_report); + auto hs_pg = home_obj->get_hs_pg(pg_id); + if (hs_pg) { + hs_pg->in_scrubbing.store(false); + LOGINFOMOD(scrubmgr, "cleared SCRUBBING state for pg={}", pg_id); + } else { + // pg destroyed during scrubbing + LOGWARNMOD(scrubmgr, "cannot find hs_pg to clear SCRUBBING state for pg={}!", pg_id); + } + } + } guard{m_hs_home_object, m_pg_scrub_ctx_map, task, pg_scrub_report, pg_id}; + + const auto hs_pg = m_hs_home_object->get_hs_pg(pg_id); + if (!hs_pg) { + SCRUBLOGE(pg_id, task_id, "cannot find hs_pg for this pg, fail this scrub task!"); + return; + } + + auto [ctx_it, happened] = m_pg_scrub_ctx_map.try_emplace(pg_id, std::make_shared< PGScrubContext >(task_id, hs_pg)); + RELEASE_ASSERT(happened, + "pg={} should not have a running scrub task since we set in_scrubbing in submit_scrub_task", pg_id); + auto& scrub_ctx = ctx_it->second; + + // this is the last committed shard_id. we cannot get shard_sequence_num here since some of the shard might be + // not committed yet. note that, this depends on the fact that the last committed shard is always at the end of + // the shard list. + const auto last_committed_shard_id = m_hs_home_object->get_last_shard_id_in_pg(pg_id); + const auto last_committed_blob_id = hs_pg->get_last_committed_blob_id(); + + // we get scrub_lsn after we get last_committed_shard_id and last_committed_blob_id, so we can guarantee for any + // replica, if it has committed to scrub_lsn , it can at least see last_committed_shard_id and + // last_committed_blob_id. 
+ // Now, the scrub range is finalized to [{0,0}, {last_committed_shard_id, last_committed_blob_id}] + const int64_t scrub_lsn = hs_pg->repl_dev_->get_last_commit_lsn(); + + // Step 1: Scrub META + SCRUBLOGD(pg_id, task_id, "Starting META scrubbing"); + std::map< shard_id_t, uint32_t > shard_blob_count; + for (shard_id_t start_shard_id = 0; start_shard_id <= last_committed_shard_id;) { + if (scrub_ctx->cancelled.load()) { + SCRUBLOGD(pg_id, task_id, "scrub task cancelled after meta scrub, skip blob scrub"); + return; + } + + std::map< shard_id_t, uint32_t > shard_blob_count_in_batch; + if (!scrub_ctx->scrub_meta_batch(pg_scrub_report, start_shard_id, last_committed_shard_id, + last_committed_blob_id, scrub_lsn, shard_blob_count_in_batch)) { + SCRUBLOGE(pg_id, task_id, "meta scrub failed for batch in range: {} to {}, scrub_lsn={}", start_shard_id, + last_committed_shard_id, scrub_lsn); + return; + } + + SCRUBLOGD(pg_id, task_id, "meta scrub batch completed in range: {} to {}, scrub_lsn={}", start_shard_id, + last_committed_shard_id, scrub_lsn); + + if (shard_blob_count_in_batch.empty()) { + SCRUBLOGD(pg_id, task_id, "no more shard to scrub, end meta scrub"); + break; + } + + // next shard_id after the last shard_id in this batch + start_shard_id = shard_blob_count_in_batch.rbegin()->first + 1; + shard_blob_count.merge(shard_blob_count_in_batch); + } + + // Step 2: Scrub BLOB + if (!shard_blob_count.empty()) { + SCRUBLOGD(pg_id, task_id, "Starting {} blob scrubbing", is_deep_scrub ? "deep" : "shallow"); + auto it = shard_blob_count.begin(); + shard_id_t start_shard_id = it->first; + shard_id_t end_shard_id = start_shard_id; + uint64_t total_blob_count_in_batch = it->second; + + // start from the second shard + for (++it; it != shard_blob_count.end(); ++it) { + if (scrub_ctx->cancelled.load()) { + SCRUBLOGD(pg_id, task_id, "scrub task cancelled during blob batch accumulation, stop"); + return; + } + auto blob_count = it->second; + if (total_blob_count_in_batch + blob_count >= max_scrub_batch_size) { + // scrub current batch + if (!scrub_ctx->scrub_blob_batch(pg_scrub_report, start_shard_id, end_shard_id, last_committed_blob_id, + scrub_lsn, is_deep_scrub)) { + SCRUBLOGE(pg_id, task_id, "{} blob scrub failed for shard range: {} to {}, scrub_lsn={}", + is_deep_scrub ? "deep" : "shallow", start_shard_id, end_shard_id, scrub_lsn); + return; + } + + // start a new batch + start_shard_id = it->first; + total_blob_count_in_batch = 0; + } + + total_blob_count_in_batch += blob_count; + end_shard_id = it->first; + } + + // scrub last batch + if (!scrub_ctx->scrub_blob_batch(pg_scrub_report, start_shard_id, end_shard_id, last_committed_blob_id, + scrub_lsn, is_deep_scrub)) { + SCRUBLOGE(pg_id, task_id, "{} blob scrub batch failed for shard range: {} to {}, scrub_lsn={}", + is_deep_scrub ? "deep" : "shallow", start_shard_id, end_shard_id, scrub_lsn); + return; + } + } + +#ifdef _PRERELEASE + // Trigger the callback flip to delete missing blob during scrub if enabled + iomgr_flip::instance()->callback_flip("delete_missing_blob_through_raft"); +#endif + + // when scrubbing is on going, blob or shard deletion probably happens and lead to false-positive missing blobs(and + // shards after we have delete shard). we reconcile the missing blobs in scrub report after all the scrubbing is + // completed to reduce the false-positive item. + + // if we reach here, we can make sure the other replicas have committed to scrub_lsn. 
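Step 2 above greedily groups consecutive shards into batches whose accumulated blob count stays under the batch limit before issuing each blob-scrub round. A stripped-down sketch of that grouping; the counts and kMaxBatch stand in for max_scrub_batch_size:

#include <cstdint>
#include <cstdio>
#include <map>

int main() {
    // shard id -> blob count, as produced by the meta scrub phase
    const std::map< uint64_t, uint32_t > shard_blob_count{{1, 40}, {2, 10}, {3, 70}, {4, 5}, {5, 90}};
    constexpr uint32_t kMaxBatch = 100; // stand-in for max_scrub_batch_size

    auto it = shard_blob_count.begin();
    uint64_t start = it->first, end = start;
    uint64_t total = it->second;
    for (++it; it != shard_blob_count.end(); ++it) {
        if (total + it->second >= kMaxBatch) {
            // flush the current batch, then start a new one at this shard
            std::printf("scrub blob batch: shards [%llu, %llu], ~%llu blobs\n", (unsigned long long)start,
                        (unsigned long long)end, (unsigned long long)total);
            start = it->first;
            total = 0;
        }
        total += it->second;
        end = it->first;
    }
    std::printf("scrub blob batch: shards [%llu, %llu], ~%llu blobs\n", (unsigned long long)start,
                (unsigned long long)end, (unsigned long long)total);
}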
+ scrub_ctx->reconcile_scrub_report(pg_scrub_report); + + // only if pg is successfully scrubbed, we persist scrub metablk. + + // FIXME:: spread this to all followers, so that if leader changes, the new leader konws the last scrub time and can + // trigger the next scrub in time. + save_scrub_superblk(pg_id, is_deep_scrub, true); + SCRUBLOGD(pg_id, task_id, "successfully complete {} scrub task!", is_deep_scrub ? "deep" : "shallow"); +} + +void ScrubManager::add_pg(const pg_id_t pg_id) { + // TODO:: make this thread safe. + LOGINFOMOD(scrubmgr, "added new scrub superblock for pg={}", pg_id); + if (nullptr == m_hs_home_object->get_hs_pg(pg_id)) { + LOGINFOMOD(scrubmgr, "cannot find pg={}!", pg_id); + return; + } + + // to avoid create-pg log replay overriding existing scrub superblock, we only create new superblock when there is + // no existing one + save_scrub_superblk(pg_id, false, false); +} + +void ScrubManager::remove_pg(const pg_id_t pg_id) { + cancel_scrub_task(pg_id); + m_pg_scrub_ctx_map.erase(pg_id); + + auto it = m_pg_scrub_sb_map.find(pg_id); + if (it == m_pg_scrub_sb_map.end()) { + LOGINFOMOD(scrubmgr, "no scrub superblock found for pg={}, no need to remove", pg_id); + return; + } + + LOGINFOMOD(scrubmgr, "removed pg={} in scrub manager!", pg_id); + it->second->destroy(); + m_pg_scrub_sb_map.erase(it); +} + +// this function is called in meta_service thread context +void ScrubManager::on_pg_scrub_meta_blk_found( + sisl::byte_view const& buf, void* meta_cookie, + std::vector< homestore::superblk< pg_scrub_superblk > >& stale_pg_scrub_sbs) { + auto sb = std::make_shared< homestore::superblk< pg_scrub_superblk > >(); + (*sb).load(buf, meta_cookie); + const auto pg_id = (*sb)->pg_id; + + auto hs_pg = m_hs_home_object->get_hs_pg(pg_id); + if (!hs_pg) { + // this is a stale pg scrub superblock, we just log and destroy it. 
+ LOGINFOMOD(scrubmgr, "cannot find pg={}, destroy stale scrub superblock", pg_id); + stale_pg_scrub_sbs.emplace_back(std::move(*sb)); + return; + } + const auto last_deep_scrub_time = (*sb)->last_deep_scrub_timestamp; + const auto last_shallow_scrub_time = (*sb)->last_shallow_scrub_timestamp; + + m_pg_scrub_sb_map.emplace(pg_id, std::move(sb)); + LOGINFOMOD(scrubmgr, "loaded scrub superblock for pg={}, last_deep_scrub_time={}, last_shallow_scrub_time={}", + pg_id, last_deep_scrub_time, last_shallow_scrub_time); +} + +void ScrubManager::save_scrub_superblk(const pg_id_t pg_id, const bool is_deep_scrub, bool force_update) { + const auto current_time = + std::chrono::duration_cast< std::chrono::seconds >(std::chrono::system_clock::now().time_since_epoch()).count(); + + auto it = m_pg_scrub_sb_map.find(pg_id); + if (it == m_pg_scrub_sb_map.end()) { + // Create new superblock for this PG + auto sb = std::make_shared< homestore::superblk< pg_scrub_superblk > >(pg_scrub_meta_name); + (*sb).create(sizeof(pg_scrub_superblk)); + (*sb)->pg_id = pg_id; + (*sb)->last_deep_scrub_timestamp = current_time; + (*sb)->last_shallow_scrub_timestamp = current_time; + (*sb).write(); + m_pg_scrub_sb_map.emplace(pg_id, std::move(sb)); + return; + } + + if (force_update) { + // Update existing superblock + if (is_deep_scrub) { + (*(it->second))->last_deep_scrub_timestamp = current_time; + } else { + (*(it->second))->last_shallow_scrub_timestamp = current_time; + } + (*(it->second)).write(); + } else { + LOGINFOMOD(scrubmgr, "skip updating scrub superblock for pg={} since there is no scrub progress update", pg_id); + } +} + +std::optional< ScrubManager::pg_scrub_superblk > ScrubManager::get_scrub_superblk(const pg_id_t pg_id) const { + auto it = m_pg_scrub_sb_map.find(pg_id); + if (it == m_pg_scrub_sb_map.end()) { + LOGWARNMOD(scrubmgr, "scrub superblk not found for pg {}", pg_id); + return std::nullopt; + } + + return *(*(it->second)); +} + +ScrubManager::PGScrubContext::PGScrubContext(uint64_t task_id, const HSHomeObject::HS_PG* hs_pg) : + task_id(task_id), hs_pg(hs_pg) { + + const auto& members = (hs_pg->pg_info_).members; + for (const auto& member : members) { + // TODO::make the queue size configurable + peer_scrub_result_queue_map_.emplace( + member.id, std::make_shared< folly::MPMCQueue< std::shared_ptr< scrub_result > > >(10)); + } + + m_scrub_executor = std::make_shared< folly::IOThreadPoolExecutor >(peer_scrub_result_queue_map_.size()); + // TODO: handle the following cases: + // 1 the node is removed from the raft group? handle this case later + // 2 pg is destroyed during scrubbing? 
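The timestamps persisted in pg_scrub_superblk above are what the is_eligible_for_*_scrub() TODOs would naturally consume. A hypothetical shape of that check; the intervals are illustrative and not values defined by this patch:

#include <chrono>
#include <cstdint>
#include <cstdio>

// Hypothetical eligibility check built on the persisted last-scrub timestamps (epoch seconds).
bool due_for_scrub(uint64_t last_scrub_epoch_secs, std::chrono::seconds interval) {
    const auto now =
        std::chrono::duration_cast< std::chrono::seconds >(std::chrono::system_clock::now().time_since_epoch())
            .count();
    return now - static_cast< int64_t >(last_scrub_epoch_secs) >= interval.count();
}

int main() {
    const uint64_t last_shallow = 0, last_deep = 0; // e.g. loaded from pg_scrub_superblk
    std::printf("shallow due=%d deep due=%d\n", due_for_scrub(last_shallow, std::chrono::hours(24 * 7)),
                due_for_scrub(last_deep, std::chrono::hours(24 * 30)));
}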
+} + +ScrubManager::PGScrubContext::~PGScrubContext() { + m_scrub_executor->stop(); + m_scrub_executor.reset(); +} + +void ScrubManager::PGScrubContext::add_scrub_result(std::shared_ptr< ScrubManager::scrub_result > result) { + auto it = peer_scrub_result_queue_map_.find(result->issuer_peer_id); + if (it != peer_scrub_result_queue_map_.end()) { + SCRUBLOGD(hs_pg->pg_id(), task_id, "add scrub result from peer {}, req_id={}, result entries count={}", + result->issuer_peer_id, result->req_id, result->entries.size()); + it->second->blockingWrite(result); + } else { + SCRUBLOGW(hs_pg->pg_id(), task_id, "received scrub result from unknown peer {}, req_id={}, dropping", + result->issuer_peer_id, result->req_id); + } +} + +void ScrubManager::PGScrubContext::send_req_to_peer(const ScrubManager::scrub_req& req, const peer_id_t& peer_id) { + const auto pg_id = hs_pg->pg_id(); + auto& repl_dev = hs_pg->repl_dev_; + if (!repl_dev) { + cancel(); + SCRUBLOGE(pg_id, task_id, + "replication device is not available, cannot send scrub req to peer {}, req_id={}, " + "scrub_type={}", + peer_id, req.req_id, req.scrub_type); + return; + } + + auto flatbuffer = req.build_flat_buffer(); + sisl::io_blob_list_t blob_list; + blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false); + + repl_dev->data_request_unidirectional(peer_id, HSHomeObject::PUSH_SCRUB_REQ, blob_list) + .via(&folly::InlineExecutor::instance()) + .thenValue([pg_id, peer_id, task_id = this->task_id, flatbuffer = std::move(flatbuffer), req_id = req.req_id, + scrub_type = req.scrub_type](auto&& response) { + if (response.hasError()) { + SCRUBLOGE(pg_id, task_id, "failed to send scrub req to peer {}, req_id={}, error={}, scrub_type={}", + peer_id, req_id, response.error(), scrub_type); + } else { + SCRUBLOGD(pg_id, task_id, "successfully sent scrub req to peer {}, req_id={}, scrub_type={}", peer_id, + req_id, scrub_type); + } + }); +} + +bool ScrubManager::PGScrubContext::scrub_meta_batch(std::shared_ptr< ScrubManager::MetaScrubReport > scrub_report, + shard_id_t start_shard_id, shard_id_t end_shard_id, + blob_id_t last_blob_id, int64_t scrub_lsn, + std::map< shard_id_t, uint32_t >& shard_blob_count_in_batch) { + const auto pg_id = hs_pg->pg_id(); + SCRUBLOGD(pg_id, task_id, "start scrubbing meta for shard range: {} to {}, last_blob_id={}, scrub_lsn={}", + start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + + std::vector< folly::Future< std::shared_ptr< ScrubManager::range_scrub_result > > > futs; + for (const auto& [peer_id, scrub_result_queue] : peer_scrub_result_queue_map_) { + auto [promise, future] = folly::makePromiseContract< std::shared_ptr< ScrubManager::range_scrub_result > >(); + futs.emplace_back(std::move(future).via(&folly::InlineExecutor::instance())); + m_scrub_executor->add([this, pg_id, peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn, + promise = std::move(promise), scrub_result_queue]() mutable { + std::shared_ptr< ScrubManager::scrub_result > scrub_result; + ScrubManager::scrub_req current_req(pg_id, random_req_id(), scrub_lsn, start_shard_id, 0, end_shard_id, + last_blob_id, SCRUB_TYPE::META, hs_pg->home_obj_.our_uuid()); + auto range_scrub_result = std::make_shared< ScrubManager::range_scrub_result >( + start_shard_id, 0, end_shard_id, last_blob_id, SCRUB_TYPE::META, peer_id); + + for (uint8_t retry_count = 0;;) { + if (cancelled.load()) { + SCRUBLOGD(pg_id, task_id, + "scrub task is cancelled, stop waiting for scrub result from peer {}, " + "shard range: {} to {}, last_blob_id={}, scrub_lsn={}", + 
peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + break; + } + + send_req_to_peer(current_req, peer_id); + + // Drain stale results until we get the expected req_id or time out. + // Do NOT re-send on stale results — the peer already has the outstanding request. + bool got_expected = false; + while (!got_expected) { + // TODO::make the timeout here configurable + if (!scrub_result_queue->tryReadUntil(std::chrono::steady_clock::now() + std::chrono::seconds{10}, + scrub_result)) { + // timeout + SCRUBLOGD(pg_id, task_id, + "did not receive scrub result from peer {} for shard range: {} to {}, " + "last_blob_id={}, scrub_lsn={}, try again", + peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + retry_count++; + + // TODO::make the max retry count configurable + if (retry_count > 5) { + SCRUBLOGE(pg_id, task_id, + "did not receive scrub result from peer {} after {} retries, " + "shard range: {} to {}, last_blob_id={}, scrub_lsn={}, " + "cancel this scrub task", + peer_id, retry_count, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + // this will cancel the entire task + cancel(); + } + break; // timed out — outer loop will re-send or detect cancellation + } + + // TODO:: add more logic to check if the scrub result is the expected one, for example, we can add + // the shard range and scrub_lsn in the scrub req and check if they are consistent with the received + // scrub result + if (scrub_result->req_id != current_req.req_id) { + SCRUBLOGD(pg_id, task_id, + "received scrub result with unexpected req_id from peer {}, expected req_id={}, " + "actual req_id={}, drain and wait again", + peer_id, current_req.req_id, scrub_result->req_id); + continue; // drain stale result and wait again — do NOT re-send + } + + got_expected = true; + } + + if (!got_expected) { + continue; // timed out — outer loop re-sends or cancels + } + + // Got the expected result. Reset retry count since the peer is responsive. + retry_count = 0; + SCRUBLOGD(pg_id, task_id, "meta scrub: received scrub result from peer {}, result entries count={}", + peer_id, scrub_result->entries.size()); + + if (scrub_result->entries.empty()) { + SCRUBLOGD(pg_id, task_id, + "received empty scrub result from peer {}, shard range: {} to {}, " + "last_blob_id={}, scrub_lsn={}, we consider this range scrub is completed and break " + "the loop!", + peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + break; + } + + const auto received_batch_size = scrub_result->entries.size(); + SCRUBLOGD(pg_id, task_id, + "received expected meta scrub result from peer {}, req_id={}, start_shard_id={}, " + "start_blob_id={}, end_shard_id={}, end_blob_id={}, scrub_lsn={}, details={}", + peer_id, current_req.req_id, current_req.start_shard_id, current_req.start_blob_id, + current_req.end_shard_id, current_req.end_blob_id, scrub_lsn, scrub_result->to_string()); + + const auto last_shard_id_in_result = (scrub_result->entries).rbegin()->first.shard; + range_scrub_result->add_scrub_result(*scrub_result); + SCRUBLOGD(pg_id, task_id, "after adding new scrub_result for meta scrub, {}", + range_scrub_result->to_string()); + + if (received_batch_size < max_scrub_batch_size) { + // if the received batch size is smaller than the max batch size, it means the peer has no more data + // to scrub in this range, we can consider the range scrub is completed for this peer and break the + // loop to avoid unnecessary scrub req sending and waiting. 
+ SCRUBLOGD(pg_id, task_id,
+ "received meta scrub result with batch size {} smaller than max batch size {} from peer "
+ "{}, shard range: {} to {}, last_blob_id={}, scrub_lsn={}, we consider this range scrub "
+ "is completed and break the loop!",
+ received_batch_size, max_scrub_batch_size, peer_id, start_shard_id, end_shard_id,
+ last_blob_id, scrub_lsn);
+ break;
+ }
+
+ RELEASE_ASSERT(received_batch_size == max_scrub_batch_size,
+ "the received batch size {} should be equal to max scrub batch size {} for meta scrub, "
+ "but it is not, peer {}, shard range: {} to {}, last_blob_id={}, scrub_lsn={}",
+ received_batch_size, max_scrub_batch_size, peer_id, start_shard_id, end_shard_id,
+ last_blob_id, scrub_lsn);
+
+ current_req.start_shard_id = last_shard_id_in_result + 1;
+ if (current_req.start_shard_id > end_shard_id) {
+ // the range scrub is completed for this batch, we can break the loop and return the result.
+ SCRUBLOGD(pg_id, task_id,
+ "completed scrubbing meta for shard range: {} to {}, last_blob_id={}, scrub_lsn={}, "
+ "peer {}",
+ start_shard_id, end_shard_id, last_blob_id, scrub_lsn, peer_id);
+ break;
+ }
+
+ // new req_id for the next scrub req
+ current_req.req_id = random_req_id();
+ }
+
+ if (cancelled.load()) {
+ promise.setException(folly::make_exception_wrapper< std::runtime_error >("cancelled"));
+ } else {
+ promise.setValue(range_scrub_result);
+ }
+ });
+ }
+
+ return folly::collectAllUnsafe(futs)
+ .thenValue([this, pg_id, &scrub_report, &shard_blob_count_in_batch](auto&& results) {
+ std::map< peer_id_t, std::shared_ptr< range_scrub_result > > peer_scrub_result_map;
+ for (auto& r : results) {
+ if (r.hasException()) {
+ SCRUBLOGE(pg_id, task_id, "scrub meta batch failed, error={}", r.exception().what());
+ return false;
+ }
+ auto range_scrub_result = r.value();
+ if (!range_scrub_result) {
+ SCRUBLOGE(pg_id, task_id, "scrub meta batch failed, received nullptr scrub result");
+ return false;
+ }
+ peer_scrub_result_map[range_scrub_result->peer_id] = range_scrub_result;
+
+ SCRUBLOGD(pg_id, task_id, "completed meta range scrub: {}", range_scrub_result->to_string());
+ }
+
+ // 1 consolidate peer_scrub_result_map into shard_blob_count_in_batch
+ std::map< shard_id_t, std::pair< size_t /*occurrence count*/, uint32_t /*max blob count*/ > >
+ shard_count_map;
+
+ for (const auto& [_, range_scrub_result] : peer_scrub_result_map) {
+ for (const auto& [route, _] : range_scrub_result->results) {
+ // shard occurrence count
+ shard_count_map[route.shard].first++;
+
+ // blob count. in a meta scrub result, route.blob stands for the blob count in this shard; we need to
+ // take the max blob count across all peers since some peers might be missing some blobs
+ const auto blob_count = shard_count_map[route.shard].second;
+ shard_count_map[route.shard].second = std::max(blob_count, static_cast< uint32_t >(route.blob));
+ }
+ }
+
+ for (const auto& [shard_id, count_pair] : shard_count_map) {
+ // if shard_id is 0, this is a pg meta scrub result, which does not represent a real shard, so we can
+ // skip it in the blob scrub phase.
+ if (!shard_id) continue;
+
+ if (count_pair.first == peer_scrub_result_map.size()) {
+ // all peers have this shard, so we consider it successfully scrubbed and update the
+ // blob count. for an empty shard, we set the blob count to 1. Actually, an empty shard should not
+ // appear here since it is not in pg_index_table.
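+ // e.g. blob counts {120, 118, 120} reported by three peers for the same shard resolve to 120, so the
+ // blob phase will not under-scan the peer that holds the most blobs.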
+ shard_blob_count_in_batch[shard_id] = std::max(count_pair.second, uint32_t{1}); + } else { + // missing shard + RELEASE_ASSERT( + count_pair.first < peer_scrub_result_map.size(), + "the occurrence count of shard_id {} should not be larger than peer count, but it is {}", + shard_id, count_pair.first); + // not all peers have this shard, we consider this shard is missing, and set blob count to max value + // to make sure it will be scrubbed in a single batch in blob scrub phase + shard_blob_count_in_batch[shard_id] = UINT32_MAX; + } + } + + // 2 consolidate peer_scrub_result_map into scrub_report + scrub_report->merge(peer_scrub_result_map); + + return true; + }) + .get(); +} + +bool ScrubManager::PGScrubContext::scrub_blob_batch(std::shared_ptr< ScrubManager::ShallowScrubReport > scrub_report, + shard_id_t start_shard_id, shard_id_t end_shard_id, + blob_id_t last_blob_id, int64_t scrub_lsn, bool is_deep_scrub) { + const auto pg_id = hs_pg->pg_id(); + const auto scrub_type = is_deep_scrub ? SCRUB_TYPE::DEEP_BLOB : SCRUB_TYPE::SHALLOW_BLOB; + + SCRUBLOGD(pg_id, task_id, + "start scrubbing blob for shard range: {} to {}, last_blob_id={}, scrub_lsn={}, scrub_type={}", + start_shard_id, end_shard_id, last_blob_id, scrub_lsn, scrub_type); + + std::vector< folly::Future< std::shared_ptr< ScrubManager::range_scrub_result > > > futs; + for (const auto& [peer_id, scrub_result_queue] : peer_scrub_result_queue_map_) { + auto [promise, future] = folly::makePromiseContract< std::shared_ptr< ScrubManager::range_scrub_result > >(); + futs.emplace_back(std::move(future).via(&folly::InlineExecutor::instance())); + m_scrub_executor->add([this, pg_id, peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn, scrub_type, + promise = std::move(promise), scrub_result_queue]() mutable { + std::shared_ptr< ScrubManager::scrub_result > scrub_result; + ScrubManager::scrub_req current_req(pg_id, random_req_id(), scrub_lsn, start_shard_id, 0, end_shard_id, + last_blob_id, scrub_type, hs_pg->home_obj_.our_uuid()); + auto range_scrub_result = std::make_shared< ScrubManager::range_scrub_result >( + start_shard_id, 0, end_shard_id, last_blob_id, scrub_type, peer_id); + + for (uint8_t retry_count = 0;;) { + if (cancelled.load()) { + SCRUBLOGD(pg_id, task_id, + "scrub task is cancelled, stop waiting for scrub result from peer {}, " + "shard range: {} to {}, last_blob_id={}, scrub_lsn={}", + peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + break; + } + + send_req_to_peer(current_req, peer_id); + + // Drain stale results until we get the expected req_id or time out. + // Do NOT re-send on stale results — the peer already has the outstanding request. 
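+ // (stale results can appear when an earlier request timed out locally but the peer still replied to it later.)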
+ // TODO::make the timeout here configurable + bool got_expected = false; + while (!got_expected) { + if (!scrub_result_queue->tryReadUntil(std::chrono::steady_clock::now() + std::chrono::seconds{10}, + scrub_result)) { + // timeout + SCRUBLOGD(pg_id, task_id, + "did not receive scrub result from peer {} for shard range: {} to {}, " + "last_blob_id={}, scrub_lsn={}, try again", + peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + retry_count++; + + // TODO::make the max retry count configurable + if (retry_count > 5) { + SCRUBLOGE(pg_id, task_id, + "did not receive scrub result from peer {} after {} retries, " + "shard range: {} to {}, last_blob_id={}, scrub_lsn={}, " + "cancel this scrub task", + peer_id, retry_count, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + // this will cancel the entire task + cancel(); + } + break; // timed out — outer loop will re-send or detect cancellation + } + + // TODO:: add more logic to check if the scrub result is the expected one, for example, we can + // add the shard range and scrub_lsn in the scrub req and check if they are consistent with the + // received scrub result + if (scrub_result->req_id != current_req.req_id) { + SCRUBLOGD(pg_id, task_id, + "received scrub result with unexpected req_id from peer {}, expected req_id={}, " + "actual req_id={}, scrub_type={}, drain and wait again", + peer_id, current_req.req_id, scrub_result->req_id, scrub_type); + continue; // drain stale result and wait again — do NOT re-send + } + + got_expected = true; + } + + if (!got_expected) { + continue; // timed out — outer loop re-sends or cancels + } + + // Got the expected result. Reset retry count since the peer is responsive. + retry_count = 0; + SCRUBLOGD(pg_id, task_id, "blob scrub: received scrub result from peer {}, result entries count={}", + peer_id, scrub_result->entries.size()); + + if (scrub_result->entries.empty()) { + SCRUBLOGD(pg_id, task_id, + "received empty scrub result from peer {}, shard range: {} to {}, " + "last_blob_id={}, scrub_lsn={}, we consider this range scrub is completed and break " + "the loop!", + peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn); + break; + } + + const auto received_batch_size = scrub_result->entries.size(); + SCRUBLOGD(pg_id, task_id, + "received expected blob scrub result from peer {}, req_id={}, start_shard_id={}, " + "start_blob_id={}, end_shard_id={}, end_blob_id={}, scrub_lsn={}, details={}", + peer_id, current_req.req_id, current_req.start_shard_id, current_req.start_blob_id, + current_req.end_shard_id, current_req.end_blob_id, scrub_lsn, scrub_result->to_string()); + + const auto last_shard_id_in_result = (scrub_result->entries).rbegin()->first.shard; + const auto last_blob_id_in_result = (scrub_result->entries).rbegin()->first.blob; + + range_scrub_result->add_scrub_result(*scrub_result); + SCRUBLOGD(pg_id, task_id, "after adding new scrub_result for blob scrub, {}", + range_scrub_result->to_string()); + + uint32_t batch_capacity = static_cast< uint32_t >( + scrub_type == SCRUB_TYPE::SHALLOW_BLOB ? max_scrub_batch_size : deep_blob_scrub_batch_size); + if (received_batch_size < batch_capacity) { + // if the received batch size is smaller than the batch capacity, it means the peer has no more data + // to scrub in this range, we can consider the range scrub is completed for this peer and break the + // loop to avoid unnecessary scrub req sending and waiting. 
+ SCRUBLOGD(pg_id, task_id, + "received blob scrub result with batch size {} smaller than batch capacity {} from peer " + "{}, shard range: {} to {}, last_blob_id={}, scrub_lsn={}, we consider this range scrub " + "is completed and break the loop!", + received_batch_size, batch_capacity, peer_id, start_shard_id, end_shard_id, last_blob_id, + scrub_lsn); + break; + } + + RELEASE_ASSERT( + received_batch_size == batch_capacity, + "the received batch size {} should be equal to batch capacity {} for blob scrub, but it is not, " + "peer {}, shard range: {} to {}, last_blob_id={}, scrub_lsn={}, scrub_type={}", + received_batch_size, batch_capacity, peer_id, start_shard_id, end_shard_id, last_blob_id, scrub_lsn, + scrub_type); + + RELEASE_ASSERT(last_blob_id_in_result <= last_blob_id, + "the last_blob_id_in_result {} should not be larger than last_blob_id {}, but it is, " + "peer {}, shard range: {} to {}, scrub_lsn={}, scrub_type={}", + last_blob_id_in_result, last_blob_id, peer_id, start_shard_id, end_shard_id, scrub_lsn, + scrub_type); + + if (last_blob_id_in_result == last_blob_id) { + current_req.start_shard_id = last_shard_id_in_result + 1; + current_req.start_blob_id = 0; + } else { + current_req.start_shard_id = last_shard_id_in_result; + current_req.start_blob_id = last_blob_id_in_result + 1; + } + + // Completed when we've passed the last blob of the last shard in this batch. + if (current_req.start_shard_id > end_shard_id) { + SCRUBLOGD(pg_id, task_id, + "completed scrubbing blob for shard range: {} to {}, last_blob_id={}, scrub_lsn={}, " + "peer {}", + start_shard_id, end_shard_id, last_blob_id, scrub_lsn, peer_id); + break; + } + + // new req_id for the next scrub req + current_req.req_id = random_req_id(); + } + + if (cancelled.load()) { + promise.setException(folly::make_exception_wrapper< std::runtime_error >("cancelled")); + } else { + promise.setValue(range_scrub_result); + } + }); + } + + return folly::collectAllUnsafe(futs) + .thenValue([this, pg_id, &scrub_report](auto&& results) { + std::map< peer_id_t, std::shared_ptr< range_scrub_result > > peer_scrub_result_map; + for (auto& r : results) { + if (r.hasException()) { + SCRUBLOGE(pg_id, task_id, "scrub blob batch is failed, error={}", r.exception().what()); + return false; + } + auto range_scrub_result = r.value(); + if (!range_scrub_result) { + SCRUBLOGE(pg_id, task_id, "scrub blob batch is failed, receive nullptr scrub result"); + return false; + } + peer_scrub_result_map.emplace(range_scrub_result->peer_id, range_scrub_result); + + SCRUBLOGD(pg_id, task_id, "complete blob range scrub: {}", range_scrub_result->to_string()); + } + + // consolidate peer_scrub_result_map into scrub_report + scrub_report->merge(peer_scrub_result_map); + + return true; + }) + .get(); +} + +void ScrubManager::PGScrubContext::reconcile_scrub_report(std::shared_ptr< ShallowScrubReport > scrub_report) { + // A shard/blob reported missing may be a false positive from a concurrent deletion during scrubbing. + // For each peer that is tracked as HAVING the shard/blob (in the existence-tracking set), we re-check + // whether it still holds the item. If it no longer does (concurrent deletion), we remove that peer from + // the existence-tracking set; when the set empties the entry is dropped, eliminating the false positive. 
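+ // e.g. if blob B was reported only by peer P1, we re-check P1; when P1 confirms B was deleted in the
+ // meantime, the tracking set for B becomes empty and B disappears from missing_blobs.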
+ const auto pg_id = hs_pg->pg_id(); + + // TODO::make this configurable + const uint8_t max_reconcile_retry_count = 3; + for (uint8_t retry_count = 0; retry_count < max_reconcile_retry_count; ++retry_count) { + if (cancelled.load()) { + SCRUBLOGD(pg_id, task_id, "scrub task cancelled, skip reconciliation"); + return; + } + + const auto missing_shards = scrub_report->get_missing_shard_ids(); + const auto missing_blobs = scrub_report->get_missing_blobs(); + + if (missing_shards.empty() && missing_blobs.empty()) { + SCRUBLOGD(pg_id, task_id, "no missing shard/blob in scrub report, no need to reconcile"); + return; + } + + std::vector< folly::Future< folly::Unit > > reconcile_futs; + + for (const auto& [shard_id, peer_set] : missing_shards) { + for (const auto& peer_id : peer_set) { + reconcile_futs.emplace_back(std::move( + check_existence_in_peer(peer_id, {shard_id, 0}, false /* check blob */) + .thenTry([this, pg_id, peer_id, shard_id, &scrub_report](folly::Try< bool > result) { + if (result.hasException()) { + SCRUBLOGE(pg_id, task_id, + "failed to check shard existence in peer {}, shard {}, error: {}", peer_id, + shard_id, result.exception().what()); + return; + } + + const auto& exists = result.value(); + if (!exists) { + SCRUBLOGD(pg_id, task_id, + "reconcile check: shard {} confirmed absent on peer {}, removing from " + "existence-tracking set", + shard_id, peer_id); + scrub_report->remove_shard_existence_from_peer(shard_id, peer_id); + } else { + SCRUBLOGD(pg_id, task_id, + "reconcile check: shard {} still present on peer {}, no change", shard_id, + peer_id); + } + }))); + } + } + + for (const auto& [blob_route, peer_set] : missing_blobs) { + for (const auto& peer_id : peer_set) { + reconcile_futs.emplace_back(std::move( + check_existence_in_peer(peer_id, blob_route, true /* check blob */) + .thenTry([this, pg_id, peer_id, blob_route, &scrub_report](folly::Try< bool > result) { + if (result.hasException()) { + SCRUBLOGE( + pg_id, task_id, + "failed to check blob existence in peer {}, shard_id={}, blob_id={}, error: {}", + peer_id, blob_route.shard, blob_route.blob, result.exception().what()); + return; + } + + const auto& exists = result.value(); + if (!exists) { + SCRUBLOGD(pg_id, task_id, + "reconcile check: shard_id={}, blob_id={} confirmed absent on peer {}, " + "removing from existence-tracking set", + blob_route.shard, blob_route.blob, peer_id); + scrub_report->remove_blob_existence_from_peer(blob_route, peer_id); + } else { + SCRUBLOGD( + pg_id, task_id, + "reconcile check: shard_id={}, blob_id={} still present on peer {}, no change", + blob_route.shard, blob_route.blob, peer_id); + } + }))); + } + } + + folly::collectAllUnsafe(reconcile_futs).wait(); + } + + const auto remaining_missing_shards = scrub_report->get_missing_shard_ids().size(); + const auto remaining_missing_blobs = scrub_report->get_missing_blobs().size(); + if (remaining_missing_shards || remaining_missing_blobs) { + SCRUBLOGW(pg_id, task_id, + "reconciliation finished after {} retries but {} missing shards and {} missing blobs remain", + max_reconcile_retry_count, remaining_missing_shards, remaining_missing_blobs); + } else { + SCRUBLOGD(pg_id, task_id, "reconciliation cleared all missing items after {} retries", + max_reconcile_retry_count); + } +} + +folly::Future< bool > ScrubManager::PGScrubContext::check_existence_in_peer(peer_id_t peer_id, BlobRoute blob, + bool check_blob) { + auto [promise, future] = folly::makePromiseContract< bool >(); + const auto pg_id = hs_pg->pg_id(); + + auto repl_dev = 
hs_pg->repl_dev_; + if (!repl_dev) { + promise.setException(folly::make_exception_wrapper< std::runtime_error >("repl dev is not available")); + return std::move(future).via(&folly::InlineExecutor::instance()); + } + + ScrubManager::scrub_req check_blob_req; + check_blob_req.start_shard_id = blob.shard; + check_blob_req.start_blob_id = blob.blob; + check_blob_req.scrub_type = check_blob ? SCRUB_TYPE::CHECK_BLOB_EXISTENCE : SCRUB_TYPE::CHECK_SHARD_EXISTENCE; + + auto flatbuffer = check_blob_req.build_flat_buffer(); + sisl::io_blob_list_t blob_list; + blob_list.emplace_back(flatbuffer.data(), flatbuffer.size(), false); + + const auto check_type_str = check_blob ? "blob" : "shard"; + + // this is a bidirectional request, no need to add a req_id. + repl_dev->data_request_bidirectional(peer_id, HSHomeObject::PUSH_SCRUB_REQ, blob_list) + .via(&folly::InlineExecutor::instance()) + .thenValue([pg_id, peer_id, task_id = this->task_id, blob, check_type_str, flatbuffer = std::move(flatbuffer), + promise = std::move(promise)](auto&& response) mutable { + if (response.hasError()) { + SCRUBLOGE(pg_id, task_id, "failed to check {} existence in peer {}, blob {}, error code: {}", + check_type_str, peer_id, blob, static_cast< int >(response.error())); + promise.setException( + folly::make_exception_wrapper< std::runtime_error >("rpc bidirectional request failed")); + } else { + const auto& resp_blob = response.value().response_blob(); + if (resp_blob.size() != sizeof(bool)) { + SCRUBLOGE(pg_id, task_id, + "invalid response for {} existence check from peer {}, blob {}, response size={}", + check_type_str, peer_id, blob, resp_blob.size()); + promise.setException( + folly::make_exception_wrapper< std::runtime_error >("invalid response for existence check")); + } else { + const bool exists = *reinterpret_cast< const bool* >(resp_blob.cbytes()); + SCRUBLOGD(pg_id, task_id, + "successfully checked {} existence in peer {}, shard_id={}, blob_id={}, exists={}", + check_type_str, peer_id, blob.shard, blob.blob, exists); + promise.setValue(exists); + } + } + }); + + return std::move(future).via(&folly::InlineExecutor::instance()); +} + +uint64_t ScrubManager::PGScrubContext::random_req_id() const { + static std::atomic< uint64_t > ctr{0}; + + static const uint64_t seed = []() -> uint64_t { + std::random_device rd; + uint64_t s = (uint64_t(rd()) << 32) ^ uint64_t(rd()); + return s ? s : 0x123456789abcdef0ULL; + }(); + + auto splitmix64 = [](uint64_t x) -> uint64_t { + x += 0x9e3779b97f4a7c15ULL; + x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; + x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; + return x ^ (x >> 31); + }; + + uint64_t x = ctr.fetch_add(1, std::memory_order_relaxed) ^ seed; + return splitmix64(x); +} + +flatbuffers::DetachedBuffer ScrubManager::scrub_req::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder builder; + + // Convert peer_id_t (boost::uuids::uuid, 16 bytes) to a byte vector for the issuer_uuid field. + // issuer_uuid is [ubyte:16] in the schema; if issuer_peer_id is nil (all-zero), it is still + // serialized so the receiver can always read a consistent value. + std::vector< uint8_t > uuid_bytes(issuer_peer_id.begin(), issuer_peer_id.end()); + auto issuer_uuid_offset = builder.CreateVector(uuid_bytes); + + // Build the ScrubReq table. All scalar fields are written unconditionally; absent values on + // the reader side will fall back to FlatBuffers defaults (0 / ScrubType::META). 
+ auto req_offset = + CreateScrubReq(builder, static_cast< uint16_t >(pg_id), req_id, scrub_lsn, start_shard_id, start_blob_id, + end_shard_id, end_blob_id, issuer_uuid_offset, static_cast< ScrubType >(scrub_type)); + + builder.FinishSizePrefixed(req_offset); + return builder.Release(); +} + +bool ScrubManager::scrub_req::load(uint8_t const* buf_ptr, const uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERRORMOD(scrubmgr, "scrub_req::load called with null or empty buffer"); + return false; + } + + auto scrub_req_fb = GetSizePrefixedScrubReq(buf_ptr); + if (!scrub_req_fb) { + LOGERRORMOD(scrubmgr, "scrub_req::load: GetSizePrefixedScrubReq returned null"); + return false; + } + + // Scalar fields always carry a value (FlatBuffers default = 0 when absent in the wire format). + pg_id = static_cast< pg_id_t >(scrub_req_fb->pg_id()); + req_id = scrub_req_fb->req_id(); + scrub_lsn = scrub_req_fb->scrub_lsn(); + start_shard_id = scrub_req_fb->start_shard_id(); + start_blob_id = scrub_req_fb->start_blob_id(); + end_shard_id = scrub_req_fb->end_shard_id(); + end_blob_id = scrub_req_fb->end_blob_id(); + scrub_type = static_cast< SCRUB_TYPE >(scrub_req_fb->scrub_type()); + + // issuer_uuid is a vector field and may be absent (nullptr) when the sender did not set it. + // In that case issuer_peer_id keeps its default-initialized nil UUID. + if (auto uuid_vec = scrub_req_fb->issuer_uuid(); uuid_vec != nullptr) { + const auto copy_len = std::min(static_cast< size_t >(uuid_vec->size()), issuer_peer_id.size()); + std::copy_n(uuid_vec->data(), copy_len, issuer_peer_id.begin()); + } + + return true; +} + +flatbuffers::DetachedBuffer ScrubManager::scrub_result::build_flat_buffer() const { + flatbuffers::FlatBufferBuilder builder; + + std::vector< flatbuffers::Offset< ScrubResultEntry > > entry_offsets; + entry_offsets.reserve(entries.size()); + + { + std::lock_guard lock(mutex_); + for (const auto& [route, result_variant] : entries) { + ScrubStatus status; + uint64_t hash = 0; + if (std::holds_alternative< uint64_t >(result_variant)) { + status = ScrubStatus::NONE; + hash = std::get< uint64_t >(result_variant); + } else { + status = std::get< ScrubStatus >(result_variant); + } + entry_offsets.push_back(CreateScrubResultEntry(builder, route.shard, route.blob, status, hash)); + } + } + + auto entries_offset = builder.CreateVector(entry_offsets); + + std::vector< uint8_t > uuid_bytes(issuer_peer_id.begin(), issuer_peer_id.end()); + auto uuid_offset = builder.CreateVector(uuid_bytes); + + auto result_offset = CreateScrubResult(builder, req_id, uuid_offset, entries_offset); + builder.FinishSizePrefixed(result_offset); + return builder.Release(); +} + +void ScrubManager::scrub_result::add_entry(const scrub_result_entry& entry) { + BlobRoute route{entry.shard_id, entry.blob_id}; + std::lock_guard lock(mutex_); + RELEASE_ASSERT(entries.find(route) == entries.end(), "duplicate scrub result entry for {}", route); + entries[route] = entry.status_or_hash; +} + +bool ScrubManager::scrub_result::load(uint8_t const* buf_ptr, uint32_t buf_size) { + if (!buf_ptr || buf_size == 0) { + LOGERRORMOD(scrubmgr, "scrub_result::load called with null or empty buffer"); + return false; + } + + auto result_fb = GetSizePrefixedScrubResult(buf_ptr); + if (!result_fb) { + LOGERRORMOD(scrubmgr, "scrub_result::load: GetSizePrefixedScrubResult returned null"); + return false; + } + + req_id = result_fb->req_id(); + + // issuer_uuid is a vector field — absent (nullptr) when the sender omitted it; + // in that case issuer_peer_id 
keeps its default-initialized nil UUID. + if (auto uuid_vec = result_fb->issuer_uuid(); uuid_vec != nullptr) { + const auto copy_len = std::min(static_cast< size_t >(uuid_vec->size()), issuer_peer_id.size()); + std::copy_n(uuid_vec->data(), copy_len, issuer_peer_id.begin()); + } + + { + std::lock_guard lock(mutex_); + entries.clear(); + if (auto results_vec = result_fb->scrub_results(); results_vec != nullptr) { + for (const auto* entry_fb : *results_vec) { + if (!entry_fb) { continue; } + BlobRoute route{entry_fb->shard_id(), entry_fb->blob_id()}; + std::variant< ScrubStatus, uint64_t > result_variant; + if (entry_fb->scrub_result() == ScrubStatus::NONE) { + result_variant = entry_fb->hash(); + } else { + result_variant = static_cast< ScrubStatus >(entry_fb->scrub_result()); + } + entries.emplace(route, std::move(result_variant)); + } + } + } + + return true; +} + +std::string ScrubManager::scrub_result::to_string() const { + std::lock_guard lock(mutex_); + std::stringstream ss; + ss << "scrub_result[req_id=" << req_id << ",issuer=" << issuer_peer_id << ",entries_count=" << entries.size() + << ",entries={"; + bool first = true; + for (const auto& [route, result_variant] : entries) { + if (!first) ss << ","; + ss << "shard=" << static_cast< uint64_t >(route.shard) << ",blob=" << static_cast< uint64_t >(route.blob) + << ":"; + if (std::holds_alternative< uint64_t >(result_variant)) { + ss << fmt::format("{:016x}", std::get< uint64_t >(result_variant)); + } else { + ss << scrub_result_to_string(std::get< ScrubStatus >(result_variant)); + } + first = false; + } + ss << "}]"; + return ss.str(); +} + +std::string ScrubManager::range_scrub_result::to_string() const { + std::stringstream ss; + ss << "range_scrub_result[peer=" << peer_id << ",scrub_type=" << scrub_type << ",range=[shard=" << start_shard_id + << ",blob=" << start_blob_id << "]->[shard=" << end_shard_id << ",blob=" << end_blob_id + << "],results_count=" << results.size() << ",results={"; + bool first = true; + for (const auto& [route, result_variant] : results) { + if (!first) ss << ","; + ss << "shard=" << static_cast< uint64_t >(route.shard) << ",blob=" << static_cast< uint64_t >(route.blob) + << ":"; + if (std::holds_alternative< uint64_t >(result_variant)) { + ss << fmt::format("{:016x}", std::get< uint64_t >(result_variant)); + } else { + ss << scrub_result_to_string(std::get< ScrubStatus >(result_variant)); + } + first = false; + } + ss << "}]"; + return ss.str(); +} + +//=========================== Scrub Report Merge Functions ===========================// + +void ScrubManager::MetaScrubReport::print() const { + std::stringstream ss; + ss << "MetaScrubReport for pg=" << pg_id_ << " | "; + + ss << "CorruptedPgMeta={"; + bool first = true; + for (const auto& [peer_id, scrub_status] : corrupted_pg_metas) { + if (!first) ss << ","; + ss << "peer=" << peer_id << "(" << scrub_result_to_string(scrub_status) << ")"; + first = false; + } + ss << "} | "; + + ss << "CorruptedShardMeta={"; + first = true; + for (const auto& [peer_id, shard_map] : corrupted_shard_metas) { + if (!first) ss << ","; + ss << "peer=" << peer_id << ":["; + bool inner_first = true; + for (const auto& [shard_id, scrub_status] : shard_map) { + if (!inner_first) ss << ","; + ss << shard_id << "(" << scrub_result_to_string(scrub_status) << ")"; + inner_first = false; + } + ss << "]"; + first = false; + } + ss << "} | "; + + ss << "InconsistentShardMeta={"; + first = true; + for (const auto& [shard_id, peer_hash_map] : inconsistent_shard_metas) { + if (!first) ss << 
","; + ss << "shard=" << shard_id << ":["; + bool inner_first = true; + for (const auto& [peer_id, hash] : peer_hash_map) { + if (!inner_first) ss << ","; + ss << "peer=" << peer_id << fmt::format("(hash={:016x})", hash); + inner_first = false; + } + ss << "]"; + first = false; + } + ss << "} | "; + + ss << "MissingShards={"; + first = true; + for (const auto& [shard_id, peer_set] : missing_shard_ids) { + if (!first) ss << ","; + ss << "shard=" << shard_id << ":["; + bool inner_first = true; + for (const auto& peer_id : peer_set) { + if (!inner_first) ss << ","; + ss << peer_id; + inner_first = false; + } + ss << "]"; + first = false; + } + ss << "}"; + + LOGINFOMOD(scrubmgr, "{}", ss.str()); +} + +bool ScrubManager::MetaScrubReport::merge( + const std::map< peer_id_t, std::shared_ptr< range_scrub_result > >& peer_scrub_result_map) { + if (peer_scrub_result_map.empty()) { + LOGWARNMOD(scrubmgr, "[pg={}] MetaScrubReport::merge: empty map, skip", pg_id_); + return false; + } + + // All entries must have scrub_type == META and identical range fields. + const auto& ref = peer_scrub_result_map.begin()->second; + if (!ref || ref->scrub_type != SCRUB_TYPE::META) { + LOGWARNMOD(scrubmgr, "[pg={}] MetaScrubReport::merge: first entry is null or not META, skip", pg_id_); + return false; + } + for (auto it = std::next(peer_scrub_result_map.cbegin()); it != peer_scrub_result_map.cend(); ++it) { + const auto& [peer_id, rsr] = *it; + if (!ref->match(rsr)) { + LOGWARNMOD(scrubmgr, + "[pg={}] MetaScrubReport::merge: null, wrong type, or mismatched range from peer {}, skip", + pg_id_, peer_id); + return false; + } + } + + std::map< shard_id_t, std::map< peer_id_t, uint64_t > > shard_hash_map; + std::set< peer_id_t > shard_reporting_peers; + std::map< shard_id_t, std::set< peer_id_t > > shard_present_map; + + for (const auto& [peer_id, range_result] : peer_scrub_result_map) { + shard_reporting_peers.insert(peer_id); + for (const auto& [route, result_variant] : range_result->results) { + const auto shard_id = route.shard; + shard_present_map[shard_id].insert(peer_id); + + // For META results, healthy shards always carry uint64_t (the active-blob count used + // as a placeholder hash); only a non-NONE ScrubStatus indicates corruption. + // ScrubStatus::NONE does not appear in META results: local_scrub_meta stores uint64_t(0) + // directly, and scrub_result::load converts any on-wire NONE to uint64_t via the hash field. + if (std::holds_alternative< ScrubStatus >(result_variant)) { + auto status = std::get< ScrubStatus >(result_variant); + if (status != ScrubStatus::NONE) { + if (shard_id == 0) { + corrupted_pg_metas[peer_id] = status; + } else { + corrupted_shard_metas[peer_id][shard_id] = status; + } + LOGWARNMOD(scrubmgr, "[pg={}] find corruption for META shard={} peer={}", pg_id_, shard_id, + peer_id); + } + continue; + } + + // uint64_t: shard is healthy; use its hash value for cross-peer consistency comparison. + shard_hash_map[shard_id][peer_id] = std::get< uint64_t >(result_variant); + } + } + + // Detect shard meta inconsistency across peers. + // Note: peers that reported corruption (IO_ERROR/MISMATCH) are excluded from shard_hash_map and therefore + // from this check. Corruption and inconsistency are tracked separately; callers must correlate + // corrupted_shard_metas with inconsistent_shard_metas to get the full picture. 
+ for (const auto& [shard_id, peer_hash_map] : shard_hash_map) { + if (peer_hash_map.size() > 1) { + const uint64_t ref_hash = peer_hash_map.begin()->second; + bool consistent = std::all_of(peer_hash_map.begin(), peer_hash_map.end(), + [ref_hash](const auto& kv) { return kv.second == ref_hash; }); + if (!consistent) { + for (const auto& [peer_id, hash] : peer_hash_map) { + inconsistent_shard_metas[shard_id][peer_id] = hash; + } + } + } + } + + // Detect missing shards: shards seen by some peers but absent on others. + // Record which peers HAVE the shard (existence-tracking set; see missing_shard_ids semantics in hpp). + // shard_id=0 (pg_meta) is intentionally excluded: its corruption is captured in + // corrupted_pg_metas, and reconcile_scrub_report uses {shard_id, 0} as the existence- + // check route, which would be wrong for pg_meta whose route is {0, total_shards}. + for (const auto& [shard_id, peer_set] : shard_present_map) { + if (shard_id) { + if (peer_set.size() < shard_reporting_peers.size()) { + RELEASE_ASSERT(missing_shard_ids.find(shard_id) == missing_shard_ids.end(), + "shard_id {} should not already exist in missing_shard_ids", shard_id); + + missing_shard_ids[shard_id] = peer_set; + } + } + + // shard_id == 0 represents pg_meta, which is not a real shard and should not be treated as missing even if some + // peers don't report it. + } + + LOGINFOMOD(scrubmgr, + "[pg={}] Meta scrub merge completed: {} corrupted shard metas, {} inconsistent shard metas, " + "{} peers with missing shards", + pg_id_, corrupted_shard_metas.size(), inconsistent_shard_metas.size(), missing_shard_ids.size()); + return true; +} + +void ScrubManager::MetaScrubReport::remove_shard_existence_from_peer(shard_id_t shard_id, peer_id_t peer) { + std::lock_guard lock(mutex_); + auto it = missing_shard_ids.find(shard_id); + if (it != missing_shard_ids.end()) { + it->second.erase(peer); + if (it->second.empty()) { missing_shard_ids.erase(it); } + } +} + +void ScrubManager::ShallowScrubReport::print() const { + MetaScrubReport::print(); + std::stringstream ss; + ss << "ShallowScrubReport for pg=" << pg_id_ << " | MissingBlobs={"; + bool first = true; + for (const auto& [blob_route, peer_set] : missing_blobs) { + if (!first) ss << ","; + ss << fmt::format("{}", blob_route) << ":["; + bool inner = true; + for (const auto& peer_id : peer_set) { + if (!inner) ss << ","; + ss << peer_id; + inner = false; + } + ss << "]"; + first = false; + } + ss << "}"; + LOGINFOMOD(scrubmgr, "{}", ss.str()); +} + +bool ScrubManager::ShallowScrubReport::merge( + const std::map< peer_id_t, std::shared_ptr< range_scrub_result > >& peer_scrub_result_map) { + if (peer_scrub_result_map.empty()) { + LOGWARNMOD(scrubmgr, "[pg={}] ShallowScrubReport::merge: empty map, skip", pg_id_); + return false; + } + + const auto& ref = peer_scrub_result_map.begin()->second; + if (!ref) { + LOGWARNMOD(scrubmgr, "[pg={}] ShallowScrubReport::merge: first entry is null, skip", pg_id_); + return false; + } + + // META results are fully handled by the base class. 
+ if (ref->scrub_type == SCRUB_TYPE::META) { return MetaScrubReport::merge(peer_scrub_result_map); } + + RELEASE_ASSERT(ref->scrub_type == SCRUB_TYPE::SHALLOW_BLOB || ref->scrub_type == SCRUB_TYPE::DEEP_BLOB, + "unexpected scrub_type {} in ShallowScrubReport::merge", (int)ref->scrub_type); + + for (auto it = std::next(peer_scrub_result_map.cbegin()); it != peer_scrub_result_map.cend(); ++it) { + const auto& [peer_id, rsr] = *it; + if (!ref->match(rsr)) { + LOGWARNMOD(scrubmgr, + "[pg={}] ShallowScrubReport::merge: null, wrong type, or mismatched range from peer {}, skip", + pg_id_, peer_id); + return false; + } + } + + // Detect missing blobs: track which peers reported each blob, find absent ones + std::map< BlobRoute, std::set< peer_id_t > > blob_peers_map; + for (const auto& [peer_id, range_result] : peer_scrub_result_map) { + if (!range_result) continue; + for (const auto& [route, result_variant] : range_result->results) { + blob_peers_map[route].insert(peer_id); + } + } + + for (const auto& [blob_route, peer_set] : blob_peers_map) { + if (peer_set.size() < peer_scrub_result_map.size()) { + RELEASE_ASSERT(missing_blobs.find(blob_route) == missing_blobs.end(), + "blob_route {} should not already exist in missing_blobs", blob_route); + missing_blobs[blob_route] = peer_set; + } + } + + LOGDEBUGMOD(scrubmgr, "[pg={}] Shallow scrub merge completed!", pg_id_); + return true; +} + +void ScrubManager::ShallowScrubReport::remove_blob_existence_from_peer(BlobRoute blob_route, peer_id_t peer) { + std::lock_guard lock(mutex_); + auto it = missing_blobs.find(blob_route); + if (it != missing_blobs.end()) { + it->second.erase(peer); + if (it->second.empty()) { missing_blobs.erase(it); } + } +} + +void ScrubManager::DeepScrubReport::print() const { + ShallowScrubReport::print(); + + std::stringstream ss; + ss << "DeepScrubReport for pg=" << pg_id_ << " | CorruptedBlobs={"; + bool first = true; + for (const auto& [peer_id, blob_map] : corrupted_blobs) { + if (!first) ss << ","; + ss << "peer=" << peer_id << ":["; + bool inner = true; + for (const auto& [blob_route, scrub_result] : blob_map) { + if (!inner) ss << ","; + ss << fmt::format("{}", blob_route) << "(" << scrub_result_to_string(scrub_result) << ")"; + inner = false; + } + ss << "]"; + first = false; + } + ss << "} | InconsistentBlobs={"; + first = true; + for (const auto& [blob_route, peer_hash_map] : inconsistent_blobs) { + if (!first) ss << ","; + ss << fmt::format("{}", blob_route) << ":["; + bool inner = true; + for (const auto& [peer_id, hash] : peer_hash_map) { + if (!inner) ss << ","; + ss << "peer=" << peer_id << fmt::format("(hash={:016x})", hash); + inner = false; + } + ss << "]"; + first = false; + } + ss << "}"; + LOGINFOMOD(scrubmgr, "{}", ss.str()); +} + +bool ScrubManager::DeepScrubReport::merge( + const std::map< peer_id_t, std::shared_ptr< range_scrub_result > >& peer_scrub_result_map) { + if (peer_scrub_result_map.empty()) { + LOGWARNMOD(scrubmgr, "[pg={}] DeepScrubReport::merge: empty map, skip", pg_id_); + return false; + } + + const auto& ref = peer_scrub_result_map.begin()->second; + if (!ref) { + LOGWARNMOD(scrubmgr, "[pg={}] DeepScrubReport::merge: first entry is null, skip", pg_id_); + return false; + } + + const auto scrub_type = ref->scrub_type; + + // META and SHALLOW_BLOB results are fully handled by parent classes; no deep-specific work needed. 
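+ // (ShallowScrubReport::merge in turn forwards META results to MetaScrubReport::merge, so this single
+ // dispatch covers all scrub types.)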
+ if (scrub_type != SCRUB_TYPE::DEEP_BLOB) { return ShallowScrubReport::merge(peer_scrub_result_map); } + + // DEEP_BLOB: first detect missing blobs via ShallowScrubReport (which also validates the range), + // then add corrupted-blob and hash-inconsistency detection. + if (!ShallowScrubReport::merge(peer_scrub_result_map)) { return false; } + + // Detect corrupted blobs (non-NONE scrub_result) reported by any peer. + // Detect hash inconsistencies among healthy blobs in the same pass. + std::map< BlobRoute, std::map< peer_id_t, uint64_t > > hash_map_per_blob; + for (const auto& [peer_id, range_result] : peer_scrub_result_map) { + if (!range_result) continue; + for (const auto& [blob_route, result_variant] : range_result->results) { + if (std::holds_alternative< ScrubStatus >(result_variant)) { + auto status = std::get< ScrubStatus >(result_variant); + if (status != ScrubStatus::NONE) { + corrupted_blobs[peer_id][blob_route] = status; + LOGWARNMOD(scrubmgr, "[pg={}] find corruption for blob shard_id={}, blob_id={}, peer={}", pg_id_, + blob_route.shard, blob_route.blob, peer_id); + } + } else { + hash_map_per_blob[blob_route][peer_id] = std::get< uint64_t >(result_variant); + } + } + } + + // Note: peers that reported corruption (IO_ERROR/MISMATCH) have no hash entry in hash_map_per_blob. + // Hash inconsistency therefore requires ≥2 healthy peers for detection. Corruption is captured + // separately in corrupted_blobs. + for (const auto& [blob_route, hash_map] : hash_map_per_blob) { + if (hash_map.size() > 1) { + uint64_t ref_hash = hash_map.begin()->second; + bool consistent = std::all_of(hash_map.begin(), hash_map.end(), + [ref_hash](const auto& kv) { return kv.second == ref_hash; }); + if (!consistent) { + for (const auto& [peer_id, hash_val] : hash_map) { + inconsistent_blobs[blob_route][peer_id] = hash_val; + } + } + } + } + + LOGINFOMOD(scrubmgr, + "[pg={}] Deep blob scrub merge completed: {} missing blobs, {} corrupted blobs, {} inconsistent blobs", + pg_id_, missing_blobs.size(), corrupted_blobs.size(), inconsistent_blobs.size()); + return true; +} + +} // namespace homeobject \ No newline at end of file diff --git a/src/lib/homestore_backend/scrub_manager.hpp b/src/lib/homestore_backend/scrub_manager.hpp new file mode 100644 index 000000000..0dd651645 --- /dev/null +++ b/src/lib/homestore_backend/scrub_manager.hpp @@ -0,0 +1,366 @@ +#pragma once + +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#include +#include +#include +#include +#pragma GCC diagnostic pop + +#include +#include "homeobject/common.hpp" +#include +#include +#include "lib/blob_route.hpp" +#include "MPMCPriorityQueue.hpp" +#include "generated/scrub_common_generated.h" +#include "generated/scrub_req_generated.h" +#include "generated/scrub_result_generated.h" + +namespace homeobject { + +class HSHomeObject; + +ENUM(SCRUB_TRIGGER_TYPE, uint8_t, PERIODICALLY = 0, MANUALLY); +ENUM(SCRUB_TYPE, uint8_t, META = 0, DEEP_BLOB, SHALLOW_BLOB, CHECK_BLOB_EXISTENCE, CHECK_SHARD_EXISTENCE); + +inline const char* scrub_result_to_string(ScrubStatus type) { + switch (type) { + case ScrubStatus::NONE: + return "NONE"; + case ScrubStatus::IO_ERROR: + return "IO_ERROR"; + case ScrubStatus::MISMATCH: + return "MISMATCH"; + case ScrubStatus::NOT_FOUND: + return "NOT_FOUND"; + default: + return "UNKNOWN"; + } +} + +class ScrubManager { +public: + explicit ScrubManager(HSHomeObject* homeobject); + ~ScrubManager(); + + // Disallow 
copy and move
+ ScrubManager(const ScrubManager&) = delete;
+ ScrubManager(ScrubManager&&) = delete;
+ ScrubManager& operator=(const ScrubManager&) = delete;
+ ScrubManager& operator=(ScrubManager&&) = delete;
+
+ // pg scrub superblk
+ // TODO:: move this into pg_super_blk with a separate PR, since this is a backward incompatible change.
+ // other backward incompatible meta changes will be:
+ // 1 shard sealed lsn.
+
+#pragma pack(1)
+ struct pg_scrub_superblk {
+ uint64_t last_deep_scrub_timestamp{0};
+ uint64_t last_shallow_scrub_timestamp{0};
+ pg_id_t pg_id{0};
+ static std::string name() { return pg_scrub_meta_name; }
+ };
+#pragma pack()
+
+ // scrub req
+ struct scrub_req {
+ scrub_req() = default;
+ ~scrub_req() = default;
+
+ scrub_req(pg_id_t pg_id, uint64_t req_id, int64_t scrub_lsn, uint64_t start_shard_id, uint64_t start_blob_id,
+ uint64_t end_shard_id, uint64_t end_blob_id, SCRUB_TYPE scrub_type, peer_id_t issuer_peer_id) :
+ pg_id{pg_id},
+ req_id{req_id},
+ scrub_lsn{scrub_lsn},
+ start_shard_id{start_shard_id},
+ start_blob_id{start_blob_id},
+ end_shard_id{end_shard_id},
+ end_blob_id{end_blob_id},
+ scrub_type{scrub_type},
+ issuer_peer_id{issuer_peer_id} {}
+
+ flatbuffers::DetachedBuffer build_flat_buffer() const;
+ bool load(uint8_t const* buf_ptr, uint32_t buf_size);
+
+ pg_id_t pg_id{0};
+ uint64_t req_id{0};
+ int64_t scrub_lsn{0};
+ uint64_t start_shard_id{0};
+ uint64_t start_blob_id{0};
+ uint64_t end_shard_id{0};
+ uint64_t end_blob_id{0};
+ SCRUB_TYPE scrub_type{SCRUB_TYPE::META};
+ peer_id_t issuer_peer_id{};
+ };
+
+ class range_scrub_result;
+
+ // Maps to the ScrubResult / ScrubResultEntry tables in scrub_result.fbs.
+ // One entry inside a ScrubResult — maps to the ScrubResultEntry FlatBuffers table.
+ struct scrub_result_entry {
+ shard_id_t shard_id{0};
+ blob_id_t blob_id{0};
+
+ // only when ScrubStatus is NONE do we store the CRC64 hash
+ std::variant< ScrubStatus, uint64_t > status_or_hash{ScrubStatus::NONE};
+ };
+
+ // Full result for one scrub request — maps to the ScrubResult FlatBuffers table.
+ class scrub_result {
+ public:
+ scrub_result() = default;
+ scrub_result(uint64_t req_id, peer_id_t issuer_peer_id) : req_id{req_id}, issuer_peer_id{issuer_peer_id} {}
+ ~scrub_result() = default;
+
+ void add_entry(const scrub_result_entry& entry);
+ flatbuffers::DetachedBuffer build_flat_buffer() const;
+ bool load(uint8_t const* buf_ptr, uint32_t buf_size);
+ std::string to_string() const;
+
+ uint64_t req_id{0};
+ peer_id_t issuer_peer_id{};
+
+ private:
+ friend class ScrubManager;
+ friend class PGScrubContext;
+ friend class range_scrub_result;
+ std::map< BlobRoute, std::variant< ScrubStatus, uint64_t > > entries;
+ mutable std::mutex mutex_;
+ };
+
+ // Aggregated results across a contiguous range of shards/blobs, which collects multiple scrub_result objects (one
+ // per scrub req) that together cover [start_shard_id, start_blob_id] to [end_shard_id, end_blob_id].
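+ // e.g. a SHALLOW_BLOB range over shards [5, 20] is typically assembled from several scrub_result batches of
+ // up to max_scrub_batch_size entries each, added one by one via add_scrub_result().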
+ class range_scrub_result { + public: + range_scrub_result(uint64_t start_shard_id, uint64_t start_blob_id, uint64_t end_shard_id, uint64_t end_blob_id, + SCRUB_TYPE scrub_type, peer_id_t peer_id) : + start_shard_id{start_shard_id}, + start_blob_id{start_blob_id}, + end_shard_id{end_shard_id}, + end_blob_id{end_blob_id}, + scrub_type{scrub_type}, + peer_id{peer_id} {} + ~range_scrub_result() = default; + + bool match(const std::shared_ptr< range_scrub_result >& other) const { + return other && scrub_type == other->scrub_type && start_shard_id == other->start_shard_id && + start_blob_id == other->start_blob_id && end_shard_id == other->end_shard_id && + end_blob_id == other->end_blob_id; + } + + std::string to_string() const; + + // this will only be called in a single thread, add a lock if we want to make it thread safe. + // TODO:: add a lock if we want to make it thread safe, but currently the design is one range_scrub_result per + // scrub req, so it should be only accessed by one thread. + void add_scrub_result(scrub_result& result) { + results.merge(result.entries); + + // the newly added result should not overlap with existing results in the range_scrub_result, otherwise it + // means there are duplicate scrub reqs covering the same shard/blob range, which should not happen. + RELEASE_ASSERT(result.entries.empty(), + "should not have duplicate blob route in range_scrub_result, scrub_type={}, " + "start_shard_id={}, start_blob_id={}, end_shard_id={}, end_blob_id={}, req_id={}", + (int)scrub_type, start_shard_id, start_blob_id, end_shard_id, end_blob_id, result.req_id); + } + + uint64_t start_shard_id{0}; + uint64_t start_blob_id{0}; + uint64_t end_shard_id{0}; + uint64_t end_blob_id{0}; + SCRUB_TYPE scrub_type{SCRUB_TYPE::META}; + peer_id_t peer_id{}; + + private: + friend class ScrubManager; + friend class PGScrubContext; + friend class MetaScrubReport; + friend class ShallowScrubReport; + friend class DeepScrubReport; + std::map< BlobRoute, std::variant< ScrubStatus, uint64_t > > results; + }; + + // scrub report + // base class for all scrub reports — handles PG and shard scrub results (SCRUB_TYPE::META) + class MetaScrubReport { + public: + MetaScrubReport(pg_id_t pg_id) : pg_id_(pg_id) {} + virtual ~MetaScrubReport() = default; + + pg_id_t get_pg_id() const { return pg_id_; } + // shard_id starts from 1, so we use shard_id=0 for pg meta + const auto& get_corrupted_shards() const { return corrupted_shard_metas; } + const auto& get_corrupted_pg_metas() const { return corrupted_pg_metas; } + const auto& get_inconsistent_shard_metas() const { return inconsistent_shard_metas; } + const auto& get_missing_shard_ids() const { return missing_shard_ids; } + + virtual bool + merge(const std::map< peer_id_t, std::shared_ptr< range_scrub_result > >& peer_scrub_result_map) = 0; + virtual void print() const = 0; + + protected: + void remove_shard_existence_from_peer(shard_id_t shard_id, peer_id_t peer); + + private: + friend class ScrubManager; + std::map< peer_id_t, std::map< shard_id_t, ScrubStatus > > corrupted_shard_metas; + std::map< peer_id_t, ScrubStatus > corrupted_pg_metas; + std::map< shard_id_t, std::map< peer_id_t, uint64_t > > inconsistent_shard_metas; + // peer set means shard only exists on these peers. 
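+ // e.g. missing_shard_ids[7] = {A, B} means shard 7 was reported by peers A and B but is absent on at
+ // least one other peer of the pg.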
+ std::map< shard_id_t, std::set< peer_id_t > > missing_shard_ids; + pg_id_t pg_id_; + std::mutex mutex_; + }; + + // shallow scrub report for a pg — extends MetaScrubReport with missing blob tracking (SHALLOW_BLOB) + class ShallowScrubReport : public MetaScrubReport { + public: + ShallowScrubReport(pg_id_t pg_id) : MetaScrubReport(pg_id) {} + ~ShallowScrubReport() override = default; + + const auto& get_missing_blobs() const { return missing_blobs; } + bool merge(const std::map< peer_id_t, std::shared_ptr< range_scrub_result > >& peer_scrub_result_map) override; + void print() const override; + + protected: + void remove_blob_existence_from_peer(BlobRoute blob_route, peer_id_t peer); + + private: + friend class ScrubManager; + // peer set means blob only exists on these peers. + std::map< BlobRoute, std::set< peer_id_t > > missing_blobs; + }; + + // deep scrub report for a pg — extends ShallowScrubReport with blob corruption and inconsistency (DEEP_BLOB) + class DeepScrubReport : public ShallowScrubReport { + public: + DeepScrubReport(pg_id_t pg_id) : ShallowScrubReport(pg_id) {} + ~DeepScrubReport() override = default; + + const auto& get_corrupted_blobs() const { return corrupted_blobs; } + const auto& get_inconsistent_blobs() const { return inconsistent_blobs; } + bool merge(const std::map< peer_id_t, std::shared_ptr< range_scrub_result > >& peer_scrub_result_map) override; + void print() const override; + + private: + std::map< peer_id_t, std::map< BlobRoute, ScrubStatus > > corrupted_blobs; + std::map< BlobRoute, std::map< peer_id_t, uint64_t > > inconsistent_blobs; + }; + + // PG Scrub Context — full definition lives in scrub_manager.cpp to avoid referencing + // HSHomeObject::HS_PG while HSHomeObject is still an incomplete type in this header. +private: + class PGScrubContext; + + // scrub scheduler +public: + void start(); + void stop(); + + folly::SemiFuture< std::shared_ptr< ShallowScrubReport > > + submit_scrub_task(const pg_id_t& pg_id, const bool is_deep, + SCRUB_TRIGGER_TYPE trigger_type = SCRUB_TRIGGER_TYPE::PERIODICALLY); + + // cancel will only cancel a running scrub task. for those submitted but not running tasks in the queue, cancel will + // not remove them from the queue. + void cancel_scrub_task(const pg_id_t& pg_id); + + void add_scrub_result(const pg_id_t pg_id, std::shared_ptr< scrub_result > scrub_result); + // new pg is created + void add_pg(const pg_id_t pg_id); + // new pg permanently removed + void remove_pg(const pg_id_t pg_id); + std::optional< pg_scrub_superblk > get_scrub_superblk(const pg_id_t pg_id) const; + void save_scrub_superblk(const pg_id_t pg_id, const bool is_deep_scrub, bool force_update = true); + void add_scrub_req(std::shared_ptr< scrub_req > req); + + // local scrub + std::shared_ptr< scrub_result > local_scrub_blob(std::shared_ptr< scrub_req > req); + std::shared_ptr< scrub_result > local_scrub_meta(std::shared_ptr< scrub_req > req); + +private: + inline static auto const pg_scrub_meta_name = std::string("PG_SCRUB"); + // TODO: persist this into metablk. 
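+ // until then, task ids restart from 1 on every process start and are only unique within one run.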
+ inline static atomic_uint64_t scrub_task_id{1}; + + // refer to docs/adr/scrub-blob-range-coverage.md + static constexpr uint64_t max_scrub_batch_size = 100'000; + static constexpr uint64_t deep_blob_scrub_batch_size = 10; + + struct scrub_task { + scrub_task(uint64_t last_scrub_time, pg_id_t pg_id, bool is_deep_scrub, SCRUB_TRIGGER_TYPE trigger_type, + folly::Promise< std::shared_ptr< ShallowScrubReport > > promise) : + task_id{scrub_task_id.fetch_add(1)}, + last_scrub_time{last_scrub_time}, + pg_id{pg_id}, + is_deep_scrub{is_deep_scrub}, + triggered{trigger_type}, + scrub_report_promise{ + std::make_shared< folly::Promise< std::shared_ptr< ShallowScrubReport > > >(std::move(promise))} {} + + ~scrub_task() { + if (scrub_report_promise && !scrub_report_promise->isFulfilled()) { + scrub_report_promise->setValue(nullptr); + } + } + + scrub_task(scrub_task&&) = default; + scrub_task& operator=(scrub_task&&) = default; + scrub_task(const scrub_task&) = delete; + scrub_task& operator=(const scrub_task&) = delete; + + uint64_t task_id; + uint64_t last_scrub_time; + pg_id_t pg_id; + bool is_deep_scrub; + SCRUB_TRIGGER_TYPE triggered; + std::shared_ptr< folly::Promise< std::shared_ptr< ShallowScrubReport > > > scrub_report_promise; + + bool operator==(const scrub_task& other) const noexcept { return task_id == other.task_id; } + + // manually > periodically; among equal triggers, earlier task_id wins. + bool operator<(const scrub_task& other) const noexcept { + using U = std::underlying_type_t< SCRUB_TRIGGER_TYPE >; + if (static_cast< U >(triggered) != static_cast< U >(other.triggered)) { + return static_cast< U >(triggered) < static_cast< U >(other.triggered); + } + return task_id > other.task_id; + } + }; + + void scan_pg_for_scrub(); + void handle_pg_scrub_task(scrub_task task); + bool is_eligible_for_deep_scrub(const pg_id_t& pg_id); + bool is_eligible_for_shallow_scrub(const pg_id_t& pg_id); + void on_pg_scrub_meta_blk_found(sisl::byte_view const& buf, void* meta_cookie, + std::vector< homestore::superblk< pg_scrub_superblk > >& stale_pg_scrub_sbs); + void handle_deep_pg_scrub_report(std::shared_ptr< DeepScrubReport > report); + void handle_shallow_pg_scrub_report(std::shared_ptr< ShallowScrubReport > report); + void handle_scrub_req(std::shared_ptr< scrub_req > req); + bool wait_for_scrub_lsn_commit(shared< homestore::ReplDev > repl_dev, int64_t scrub_lsn); + uint64_t compute_crc64(const void* data, size_t len, uint64_t crc = 0) const; + + iomgr::timer_handle_t m_scrub_timer_hdl{iomgr::null_timer_handle}; + iomgr::io_fiber_t m_scrub_timer_fiber{nullptr}; + HSHomeObject* m_hs_home_object{nullptr}; + MPMCPriorityQueue< scrub_task > m_scrub_task_queue; + std::shared_ptr< folly::IOThreadPoolExecutor > m_scrub_executor; + folly::ConcurrentHashMap< pg_id_t, std::shared_ptr< PGScrubContext > > m_pg_scrub_ctx_map; + folly::ConcurrentHashMap< pg_id_t, std::shared_ptr< homestore::superblk< pg_scrub_superblk > > > m_pg_scrub_sb_map; + + std::shared_ptr< folly::IOThreadPoolExecutor > m_scrub_req_executor; +}; +} // namespace homeobject + +// TODO:: consider the following scenarios and decide how we want to handle them in scrub manager +// 1 baseline resync +// 2 replace member +// 3 permanently destroy pg +// 4 GC \ No newline at end of file diff --git a/src/lib/homestore_backend/tests/CMakeLists.txt b/src/lib/homestore_backend/tests/CMakeLists.txt index a40812ab3..8eceb3d1f 100644 --- a/src/lib/homestore_backend/tests/CMakeLists.txt +++ b/src/lib/homestore_backend/tests/CMakeLists.txt @@ -30,3 +30,12 
@@ add_test(NAME HeapChunkSelectorTest COMMAND test_heap_chunk_selector) add_library(homestore_tests_gc OBJECT) target_sources(homestore_tests_gc PRIVATE test_homestore_backend.cpp hs_gc_tests.cpp) target_link_libraries(homestore_tests_gc homeobject_homestore ${COMMON_TEST_DEPS}) + +add_library(homestore_tests_scrubber OBJECT) +target_sources(homestore_tests_scrubber PRIVATE test_homestore_backend.cpp hs_scrubber_tests.cpp) +target_link_libraries(homestore_tests_scrubber homeobject_homestore ${COMMON_TEST_DEPS}) + +add_executable(test_mpmc_priority_queue) +target_sources(test_mpmc_priority_queue PRIVATE test_mpmc_priority_queue.cpp) +target_link_libraries(test_mpmc_priority_queue homeobject_homestore ${COMMON_TEST_DEPS}) +add_test(NAME MPMCPriorityQueueTest COMMAND test_mpmc_priority_queue) diff --git a/src/lib/homestore_backend/tests/homeobj_fixture.hpp b/src/lib/homestore_backend/tests/homeobj_fixture.hpp index 499968ab3..955884847 100644 --- a/src/lib/homestore_backend/tests/homeobj_fixture.hpp +++ b/src/lib/homestore_backend/tests/homeobj_fixture.hpp @@ -49,7 +49,7 @@ class HomeObjectFixture : public ::testing::Test { HSHomeObject::_hs_chunk_size = SISL_OPTIONS["chunk_size"].as< uint64_t >() * Mi; _obj_inst = std::dynamic_pointer_cast< HSHomeObject >(g_helper->build_new_homeobject()); - + // Used to export metrics, it should be called after init_homeobject if (SISL_OPTIONS["enable_http"].as< bool >()) { g_helper->app->start_http_server(); } if (!g_helper->is_current_testcase_restarted()) { @@ -906,6 +906,27 @@ class HomeObjectFixture : public ::testing::Test { LOGINFO("Flip {} set", flip_name); } + void set_callback_flip(const std::string flip_name, std::function< void() > callback, uint32_t count = 1, + uint32_t percent = 100) { + flip::FlipCondition null_cond; + flip::FlipFrequency freq; + freq.set_count(count); + freq.set_percent(percent); + m_fc.inject_callback_flip(flip_name, {null_cond}, freq, callback); + LOGINFO("Flip {} with callback set", flip_name); + } + + template < typename T > + void set_callback_retval_flip(const std::string flip_name, std::function< T() > callback, uint32_t count = 1, + uint32_t percent = 100) { + flip::FlipCondition null_cond; + flip::FlipFrequency freq; + freq.set_count(count); + freq.set_percent(percent); + ASSERT_TRUE(m_fc.inject_callback_retval_flip(flip_name, {null_cond}, freq, callback)); + LOGINFO("Flip {} with callback retval set", flip_name); + } + void remove_flip(const std::string flip_name) { m_fc.remove_flip(flip_name); LOGINFO("Flip {} removed", flip_name); diff --git a/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp b/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp new file mode 100644 index 000000000..4ca4f9797 --- /dev/null +++ b/src/lib/homestore_backend/tests/hs_scrubber_tests.cpp @@ -0,0 +1,1251 @@ +#include "homeobj_fixture.hpp" +#include +#include +#include +#include +#include "lib/homestore_backend/hs_homeobject.hpp" + +using namespace homeobject; +using BlobHeader = HSHomeObject::BlobHeader; + +// Helper function to delete a blob from index table +static void delete_blob_from_index(shared< homestore::IndexTable< BlobRouteKey, BlobRouteValue > > pg_index_table, + shard_id_t shard_id, blob_id_t blob_id) { + LOGINFO("Deleting blob from index, shard_id={}, blob_id={}", shard_id, blob_id); + BlobRouteKey blob_key{BlobRoute{shard_id, blob_id}}; + BlobRouteValue out_value; + homestore::BtreeSingleRemoveRequest remove_req{&blob_key, &out_value}; + auto status = pg_index_table->remove(remove_req); + ASSERT_TRUE(status == 
homestore::btree_status_t::success) + << "Failed to remove blob key from index table, status=" << status; +} + +static void delete_shard_from_index(shared< homestore::IndexTable< BlobRouteKey, BlobRouteValue > > pg_index_table, + shard_id_t missing_shard_id) { + LOGINFO("Deleting shard from index, shard_id={}", missing_shard_id); + auto start_key = BlobRouteKey{BlobRoute{missing_shard_id, 0}}; + auto end_key = BlobRouteKey{BlobRoute{missing_shard_id, std::numeric_limits< uint64_t >::max()}}; + homestore::BtreeRangeRemoveRequest< BlobRouteKey > range_remove_req{ + homestore::BtreeKeyRange< BlobRouteKey >{ + std::move(start_key), true /* inclusive */, std::move(end_key), true /* inclusive */ + }, + nullptr, std::numeric_limits< uint32_t >::max(), + [](homestore::BtreeKey const& key, homestore::BtreeValue const& value) -> bool { return true; }}; + + auto status = pg_index_table->remove(range_remove_req); + ASSERT_TRUE(status == homestore::btree_status_t::success || status == homestore::btree_status_t::not_found) + << "Failed to remove shard keys from index table, status=" << status; +} + +// Helper function to corrupt a blob's data +static void corrupt_blob_data(shared< homestore::IndexTable< BlobRouteKey, BlobRouteValue > > pg_index_table, + shard_id_t shard_id, blob_id_t blob_id) { + auto& data_service = homestore::data_service(); + const auto blk_size = data_service.get_blk_size(); + + BlobRouteKey blob_key{BlobRoute{shard_id, blob_id}}; + BlobRouteValue out_value; + homestore::BtreeSingleGetRequest blob_get_req{&blob_key, &out_value}; + + auto status = pg_index_table->get(blob_get_req); + ASSERT_TRUE(status == homestore::btree_status_t::success) + << "Failed to get blob key from index table, status=" << status; + + auto pbas = out_value.pbas(); + auto total_size = pbas.blk_count() * blk_size; + sisl::sg_list data_sgs; + data_sgs.size = total_size; + data_sgs.iovs.emplace_back(iovec{.iov_base = iomanager.iobuf_alloc(blk_size, total_size), .iov_len = total_size}); + + data_service.async_read(pbas, data_sgs, total_size) + .thenValue([&](auto&& err) { + if (err) { + LOGE("Failed to read blob data, blob_id={}, err={}", blob_id, err.message()); + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + throw std::runtime_error(fmt::format("Failed to read blob data: {}", err.message())); + } + + auto* data_ptr = reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base); + for (size_t i = 0; i < data_sgs.iovs[0].iov_len / 2; i++) { + data_ptr[i] ^= 0xFF; // Flip first half of data + } + + return data_service.async_write(data_sgs, pbas).thenValue([data_sgs = std::move(data_sgs)](auto&& err) { + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + ASSERT_FALSE(err) << "Failed to write corrupted blob data"; + }); + }) + .get(); +} + +// Helper function to make a blob inconsistent (valid but different hash) +static void make_blob_inconsistent(shared< homestore::IndexTable< BlobRouteKey, BlobRouteValue > > pg_index_table, + shard_id_t shard_id, blob_id_t blob_id, HSHomeObject* obj_inst) { + auto& data_service = homestore::data_service(); + const auto blk_size = data_service.get_blk_size(); + + BlobRouteKey blob_key{BlobRoute{shard_id, blob_id}}; + BlobRouteValue out_value; + homestore::BtreeSingleGetRequest blob_get_req{&blob_key, &out_value}; + + auto status = pg_index_table->get(blob_get_req); + ASSERT_TRUE(status == homestore::btree_status_t::success) << "Failed to get blob key from index table"; + + auto pbas = out_value.pbas(); + auto total_size = 
pbas.blk_count() * blk_size; + sisl::sg_list data_sgs; + data_sgs.size = total_size; + data_sgs.iovs.emplace_back(iovec{.iov_base = iomanager.iobuf_alloc(blk_size, total_size), .iov_len = total_size}); + + data_service.async_read(pbas, data_sgs, total_size) + .thenValue([&](auto&& err) { + if (err) { + LOGE("Failed to read blob data, blob_id={}, err={}", blob_id, err.message()); + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + throw std::runtime_error(fmt::format("Failed to read blob data: {}", err.message())); + } + + // Modify blob data and recompute valid hash + uint8_t* read_buf = r_cast< uint8_t* >(data_sgs.iovs[0].iov_base); + auto header = r_cast< BlobHeader* >(read_buf); + uint8_t* blob_bytes = read_buf + header->data_offset; + + std::mt19937 rng{std::random_device{}()}; + std::uniform_int_distribution< int > dist(0, 255); + + for (size_t i = 0; i < header->blob_size / 2; i++) { + blob_bytes[i] ^= static_cast< uint8_t >(dist(rng)); + } + + uint8_t computed_hash[BlobHeader::blob_max_hash_len]{}; + obj_inst->compute_blob_payload_hash(header->hash_algorithm, blob_bytes, header->blob_size, computed_hash, + BlobHeader::blob_max_hash_len); + + std::memcpy(header->hash, computed_hash, BlobHeader::blob_max_hash_len); + std::memset(header->header_hash, 0, BlobHeader::blob_max_hash_len); + uint32_t computed_header_hash = crc32_ieee(0, (uint8_t*)header, sizeof(BlobHeader)); + std::memcpy(header->header_hash, &computed_header_hash, sizeof(uint32_t)); + + if (!obj_inst->verify_blob(data_sgs.iovs[0].iov_base, header->shard_id, header->blob_id)) { + LOGE("Blob verification failed after modification, blob_id={}", blob_id); + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + throw std::runtime_error(fmt::format("Blob verification failed for blob_id={}", blob_id)); + } + + return data_service.async_write(data_sgs, pbas).thenValue([data_sgs = std::move(data_sgs)](auto&& err) { + iomanager.iobuf_free(reinterpret_cast< uint8_t* >(data_sgs.iovs[0].iov_base)); + ASSERT_FALSE(err) << "Failed to write inconsistent blob data"; + }); + }) + .get(); +} + +// Helper function to verify missing blobs in scrub report +// missing_blobs[blob_route] = set of peers that HAVE the blob; peer_id is missing it iff not in that set. 
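+// Example (hypothetical ids): in a 3-member PG {A, B, C}, missing_blobs[{shard=1, blob=7}] == {A, B}
+// reports the blob as present on A and B and therefore missing only on C.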
+static void verify_missing_blobs(std::shared_ptr< ScrubManager::DeepScrubReport > report, const peer_id_t& peer_id, + const BlobRoute& expected_blob) { + const auto& missing_blobs = report->get_missing_blobs(); + auto it = missing_blobs.find(expected_blob); + EXPECT_TRUE(it != missing_blobs.end()) + << "Missing blob should be reported for shard_id=" << expected_blob.shard << ", blob_id=" << expected_blob.blob; + if (it != missing_blobs.end()) { + EXPECT_TRUE(it->second.count(peer_id) == 0) + << "peer_id=" << peer_id << " should not have the blob (it is missing on this peer)"; + } +} + +// Helper function to verify corrupted blobs in scrub report +static void verify_corrupted_blobs(std::shared_ptr< ScrubManager::DeepScrubReport > report, const peer_id_t& peer_id, + const BlobRoute& expected_blob) { + const auto& corrupted_blobs = report->get_corrupted_blobs(); + auto it = corrupted_blobs.find(peer_id); + EXPECT_TRUE(it != corrupted_blobs.end()) << "Corrupted blob should be reported for peer_id=" << peer_id; + if (it != corrupted_blobs.end()) { + EXPECT_TRUE(it->second.count(expected_blob) == 1) << "Expected corrupted blob should be in the report"; + } +} + +// Helper function to verify missing shards in scrub report +// missing_shard_ids[shard_id] = set of peers that HAVE the shard; peer_id is missing it if not in that set. +static void verify_missing_shards(std::shared_ptr< ScrubManager::DeepScrubReport > report, const peer_id_t& peer_id, + shard_id_t expected_shard) { + const auto& missing_shards = report->get_missing_shard_ids(); + auto it = missing_shards.find(expected_shard); + EXPECT_TRUE(it != missing_shards.end()) << "Missing shard should be reported for shard_id=" << expected_shard; + if (it != missing_shards.end()) { + EXPECT_TRUE(it->second.count(peer_id) == 0) + << "peer_id=" << peer_id << " should not have the shard (it is missing on this peer)"; + } +} + +TEST_F(HomeObjectFixture, BasicScrubTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + // empty pg scrub should report no issues + run_on_pg_leader(pg_id, [&]() { + // Deep scrub on empty PG should complete without errors + auto scrub_report = scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null for empty PG"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + // Empty PG should have no issues + EXPECT_TRUE(deep_scrub_report->get_corrupted_shards().empty()) << "Empty PG should have no corrupted shards"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_pg_metas().empty()) + << "No PG metas should be corrupted in normal case"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_shard_metas().empty()) + << "No shard metas should be inconsistent in normal case"; + EXPECT_TRUE(deep_scrub_report->get_missing_shard_ids().empty()) << "Empty PG should have no missing shards"; + EXPECT_TRUE(deep_scrub_report->get_missing_blobs().empty()) << "Empty PG should have no missing blobs"; + + EXPECT_TRUE(deep_scrub_report->get_corrupted_blobs().empty()) << "Empty PG should have no corrupted blobs"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_blobs().empty()) + << "Empty PG should have no inconsistent blobs"; + + // Shallow scrub on empty PG + scrub_report = scrub_mgr->submit_scrub_task(pg_id, false /* is_deep */, 
SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + EXPECT_TRUE(scrub_report->get_corrupted_shards().empty()) << "Empty PG should have no corrupted shards"; + EXPECT_TRUE(scrub_report->get_corrupted_pg_metas().empty()) << "No PG metas should be corrupted in normal case"; + EXPECT_TRUE(scrub_report->get_inconsistent_shard_metas().empty()) + << "No shard metas should be inconsistent in normal case"; + EXPECT_TRUE(scrub_report->get_missing_shard_ids().empty()) << "Empty PG should have no missing shards"; + EXPECT_TRUE(scrub_report->get_missing_blobs().empty()) << "Empty PG should have no missing blobs"; + }); + + const uint64_t num_shards = SISL_OPTIONS["num_shards"].as< uint64_t >(); + // follower test uses indices 0-2, leader test uses indices 3-5 in blob_op_shard_id + const uint64_t num_blobs_per_shard = std::max(SISL_OPTIONS["num_blobs"].as< uint64_t >(), uint64_t{6}); + const uint64_t shard_size = 64 * Mi; + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_blob_id[pg_id] = 0; + + std::map< shard_id_t, std::map< blob_id_t, uint64_t > > shard_blob_ids_map; + + // Create multiple shards + for (uint64_t i = 0; i < num_shards; i++) { + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + pg_shard_id_vec[pg_id].push_back(shard_info.id); + LOGINFO("Created pg={} shard={} (shard {}/{})", pg_id, shard_info.id, i + 1, num_shards); + } + + // pg with empty shard scrub should report no issues + run_on_pg_leader(pg_id, [&]() { + // Deep scrub on PG with empty shards should complete without errors + auto scrub_report = scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null for PG with empty shards"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + // PG with empty shards should have no issues + EXPECT_TRUE(deep_scrub_report->get_corrupted_shards().empty()) + << "PG with empty shards should have no corrupted shards"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_pg_metas().empty()) + << "No PG metas should be corrupted in normal case"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_shard_metas().empty()) + << "No shard metas should be inconsistent in normal case"; + EXPECT_TRUE(deep_scrub_report->get_missing_shard_ids().empty()) + << "PG with empty shards should have no missing shards"; + EXPECT_TRUE(deep_scrub_report->get_missing_blobs().empty()) + << "PG with empty shards should have no missing blobs"; + + EXPECT_TRUE(deep_scrub_report->get_corrupted_blobs().empty()) + << "PG with empty shards should have no corrupted blobs"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_blobs().empty()) + << "PG with empty shards should have no inconsistent blobs"; + + // Shallow scrub on PG with empty shards should complete without errors + scrub_report = scrub_mgr->submit_scrub_task(pg_id, false /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + EXPECT_TRUE(scrub_report->get_corrupted_shards().empty()) + << "PG with empty shards should have no corrupted shards"; + EXPECT_TRUE(scrub_report->get_corrupted_pg_metas().empty()) << "No PG metas should be corrupted in normal case"; + EXPECT_TRUE(scrub_report->get_inconsistent_shard_metas().empty()) + << "No shard metas should be inconsistent in normal case"; + EXPECT_TRUE(scrub_report->get_missing_shard_ids().empty()) + << "PG with empty shards 
should have no missing shards"; + EXPECT_TRUE(scrub_report->get_missing_blobs().empty()) << "PG with empty shards should have no missing blobs"; + }); + + // Create blobs in all shards + shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs_per_shard, pg_blob_id); + LOGINFO("Created {} blobs per shard, total {} blobs", num_blobs_per_shard, num_shards * num_blobs_per_shard); + + // Verify blobs were created + verify_get_blob(pg_shard_id_vec, num_blobs_per_shard); + + // everything is healthy, deep scrub should report no issues. + run_on_pg_leader(pg_id, [&]() { + // Deep scrub on healthy PG should complete without errors + auto scrub_report = scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null for healthy PG"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + // Healthy PG with blobs should have no issues + EXPECT_TRUE(deep_scrub_report->get_corrupted_shards().empty()) << "Healthy PG should have no corrupted shards"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_pg_metas().empty()) + << "No PG metas should be corrupted in normal case"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_shard_metas().empty()) + << "No shard metas should be inconsistent in normal case"; + EXPECT_TRUE(deep_scrub_report->get_missing_shard_ids().empty()) << "Healthy PG should have no missing shards"; + EXPECT_TRUE(deep_scrub_report->get_missing_blobs().empty()) << "Healthy PG should have no missing blobs"; + EXPECT_TRUE(deep_scrub_report->get_corrupted_blobs().empty()) << "Healthy PG should have no corrupted blobs"; + EXPECT_TRUE(deep_scrub_report->get_inconsistent_blobs().empty()) + << "Healthy PG should have no inconsistent blobs"; + + // Shallow scrub on healthy PG should complete without errors + scrub_report = scrub_mgr->submit_scrub_task(pg_id, false /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + EXPECT_TRUE(scrub_report->get_corrupted_shards().empty()) << "Healthy PG should have no corrupted shards"; + EXPECT_TRUE(scrub_report->get_corrupted_pg_metas().empty()) << "No PG metas should be corrupted in normal case"; + EXPECT_TRUE(scrub_report->get_inconsistent_shard_metas().empty()) + << "No shard metas should be inconsistent in normal case"; + EXPECT_TRUE(scrub_report->get_missing_shard_ids().empty()) << "Healthy PG should have no missing shards"; + EXPECT_TRUE(scrub_report->get_missing_blobs().empty()) << "Healthy PG should have no missing blobs"; + }); + + g_helper->sync(); + + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; + + ASSERT_GE(num_shards, 2u) << "BasicScrubTest requires at least 2 shards"; + // First shard: simulate a missing shard (entire shard deleted from followers) + const auto missing_shard_id = shard_blob_ids_map.begin()->first; + // Second shard: simulate blob-level issues; this shard still exists on followers + const auto blob_op_shard_id = std::next(shard_blob_ids_map.begin())->first; + auto it = shard_blob_ids_map[blob_op_shard_id].begin(); + const auto missing_blob_id = it->first; + const auto corrupted_blob_id = (++it)->first; + const auto inconsistent_blob_id = (++it)->first; + + // TODO:: add corrupted shard and corrupted pg meta after we have the implementation for corrupting them.
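+
+ // Fault-injection plan for the follower side, checked against the scrub reports below:
+ //   - missing_shard_id: entire shard removed from the follower index, expected in missing_shard_ids
+ //     (and all of its blobs in missing_blobs for the shallow scrub)
+ //   - blob_op_shard_id / missing_blob_id: single blob removed from the follower index, expected in missing_blobs
+ //   - blob_op_shard_id / corrupted_blob_id: on-disk data flipped so the hash check fails, expected in corrupted_blobs
+ //   - blob_op_shard_id / inconsistent_blob_id: data changed with a recomputed valid hash, so the copy is
+ //     self-consistent yet differs across replicas, expected in inconsistent_blobs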
+ + // Corrupt data on followers + run_on_pg_follower(pg_id, [&]() { + auto& pg_index_table = hs_pg->index_table_; + + // 1. Remove missing_shard_id to simulate missing shard + delete_shard_from_index(pg_index_table, missing_shard_id); + + // 2. Delete missing_blob_id from blob_op_shard_id (different shard, still exists on follower) + delete_blob_from_index(pg_index_table, blob_op_shard_id, missing_blob_id); + + // 3. Make corrupted_blob_id corrupted + corrupt_blob_data(pg_index_table, blob_op_shard_id, corrupted_blob_id); + + // 4. Make inconsistent_blob_id inconsistent (valid but different hash) + make_blob_inconsistent(pg_index_table, blob_op_shard_id, inconsistent_blob_id, _obj_inst.get()); + }); + + g_helper->sync(); + + run_on_pg_leader(pg_id, [&]() { + // do deep scrub and check the scrub report + auto scrub_report = scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + const auto& members = (hs_pg->pg_info_).members; + std::set< peer_id_t > follower_peer_ids; + const auto& leader_uuid = _obj_inst->our_uuid(); + for (const auto& member : members) { + if (member.id == leader_uuid) { continue; } + follower_peer_ids.insert(member.id); + } + + // Verify missing blobs, missing shards, and corrupted blobs for all followers + for (const auto& peer_id : follower_peer_ids) { + verify_missing_blobs(deep_scrub_report, peer_id, BlobRoute{blob_op_shard_id, missing_blob_id}); + verify_missing_shards(deep_scrub_report, peer_id, missing_shard_id); + verify_corrupted_blobs(deep_scrub_report, peer_id, BlobRoute{blob_op_shard_id, corrupted_blob_id}); + } + + // False positive guard: exactly one shard should be missing, no healthy shard must leak in. + EXPECT_EQ(deep_scrub_report->get_missing_shard_ids().size(), 1u) + << "Exactly one shard should be reported missing after follower corruption"; + + // False positive guard: each follower must have exactly the one blob we corrupted. + { + const auto& all_corrupted = deep_scrub_report->get_corrupted_blobs(); + for (const auto& peer_id : follower_peer_ids) { + auto cit = all_corrupted.find(peer_id); + if (cit != all_corrupted.end()) { + EXPECT_EQ(cit->second.size(), 1u) + << "Follower peer_id=" << peer_id << " should have exactly 1 corrupted blob"; + } + } + } + + const auto inconsistent_blobs = deep_scrub_report->get_inconsistent_blobs(); + EXPECT_TRUE(inconsistent_blobs.size() == 1) + << "Inconsistent blob should be reported in deep scrub report for one of the followers"; + const auto it = inconsistent_blobs.find(BlobRoute{blob_op_shard_id, inconsistent_blob_id}); + EXPECT_TRUE(it != inconsistent_blobs.end()) + << "The inconsistent blob should be reported in deep scrub report for blob_id=" << inconsistent_blob_id; + auto& inconsistent_blob_peers = it->second; + + // inconsistent_blob_peers should contain all the peers.
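+ // Every member, leader included, is expected in the peer set: each replica's copy still passes its
+ // own hash check, so the scrubber only sees that the copies disagree and reports all of them.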
+ EXPECT_TRUE(inconsistent_blob_peers.size() == follower_peer_ids.size() + 1) + << "Inconsistent blob should be reported in deep scrub report for all followers"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(inconsistent_blob_peers.count(peer_id) == 1) + << "The inconsistent blob should be reported in deep scrub report for peer_id=" << peer_id; + } + EXPECT_TRUE(inconsistent_blob_peers.count(leader_uuid) == 1) + << "The inconsistent blob should be reported in deep scrub report for leader peer_id=" << leader_uuid; + + // do shallow scrub, shallow scrub can only find missing blob/shard + auto shallow_scrub_report = scrub_mgr->submit_scrub_task(pg_id, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_NE(shallow_scrub_report, nullptr) << "Shallow scrub report should not be null"; + + auto miss_blob_in_shallow_report = shallow_scrub_report->get_missing_blobs(); + EXPECT_TRUE(miss_blob_in_shallow_report.size() == num_blobs_per_shard + 1) + << "Should report all blobs from missing_shard_id (" << num_blobs_per_shard + << ") plus 1 from blob_op_shard_id, got " << miss_blob_in_shallow_report.size(); + { + auto it = miss_blob_in_shallow_report.find(BlobRoute{blob_op_shard_id, missing_blob_id}); + EXPECT_TRUE(it != miss_blob_in_shallow_report.end()) + << "The missing blob should be reported in shallow scrub report"; + if (it != miss_blob_in_shallow_report.end()) { + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 0) + << "Follower peer_id=" << peer_id << " should not have the missing blob"; + } + } + } + + // Verify each individual blob from missing_shard_id appears in the report. + // Checking only the total count is insufficient: if the reported blob_ids were wrong + // but the count matched, the test would still pass. + for (const auto& [blob_id, _] : shard_blob_ids_map[missing_shard_id]) { + auto it = miss_blob_in_shallow_report.find(BlobRoute{missing_shard_id, blob_id}); + EXPECT_TRUE(it != miss_blob_in_shallow_report.end()) + << "Blob " << blob_id << " from missing_shard_id=" << missing_shard_id + << " should be reported in shallow scrub report"; + if (it != miss_blob_in_shallow_report.end()) { + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 0) + << "Follower peer_id=" << peer_id << " should not have blob " << blob_id + << " from missing_shard_id"; + } + } + } + + // missing_shard_ids[shard_id] = peers that have the shard; followers are absent from that set. + const auto missing_shards_in_shallow_report = shallow_scrub_report->get_missing_shard_ids(); + EXPECT_TRUE(missing_shards_in_shallow_report.size() == 1) + << "One missing shard should be reported in shallow scrub report"; + { + auto it = missing_shards_in_shallow_report.find(missing_shard_id); + EXPECT_TRUE(it != missing_shards_in_shallow_report.end()) + << "The missing shard should be reported in shallow scrub report"; + if (it != missing_shards_in_shallow_report.end()) { + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 0) + << "Follower peer_id=" << peer_id << " should not have the missing shard"; + } + } + } + }); + + g_helper->sync(); + + // Test case for leader missing/corrupted + LOGINFO("Starting leader missing/corrupted test case"); + + // Get new blob ids for leader corruption test. + // Must use blob_op_shard_id (not missing_shard_id) because followers deleted the entire + // missing_shard_id — no hash comparison is possible for blobs in that shard. 
+ // blob_op_shard_id exists on both leader and followers; skip the first 3 blobs already + // used by the follower test (missing/corrupted/inconsistent at indices 0/1/2). + auto& leader_shard_blobs = shard_blob_ids_map[blob_op_shard_id]; + auto leader_it = leader_shard_blobs.begin(); + std::advance(leader_it, 3); + const auto leader_missing_blob_id = leader_it->first; + const auto leader_corrupted_blob_id = (++leader_it)->first; + const auto leader_inconsistent_blob_id = (++leader_it)->first; + + // Corrupt data on leader + run_on_pg_leader(pg_id, [&]() { + auto& pg_index_table = hs_pg->index_table_; + + // 1. Delete leader_missing_blob_id from pg_index table on leader + delete_blob_from_index(pg_index_table, blob_op_shard_id, leader_missing_blob_id); + LOGINFO("Deleted blob {} from leader index table", leader_missing_blob_id); + + // 2. Make leader_corrupted_blob_id corrupted on leader + corrupt_blob_data(pg_index_table, blob_op_shard_id, leader_corrupted_blob_id); + LOGINFO("Corrupted blob {} on leader", leader_corrupted_blob_id); + + // 3. Make leader_inconsistent_blob_id inconsistent on leader + make_blob_inconsistent(pg_index_table, blob_op_shard_id, leader_inconsistent_blob_id, _obj_inst.get()); + LOGINFO("Made blob {} inconsistent on leader", leader_inconsistent_blob_id); + }); + + g_helper->sync(); + + // Run scrub and verify both leader and follower corruptions are detected + run_on_pg_leader(pg_id, [&]() { + LOGINFO("Running deep scrub to detect both leader and follower corruptions"); + auto scrub_report = scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + const auto& leader_uuid = _obj_inst->our_uuid(); + const auto& members = (hs_pg->pg_info_).members; + std::set< peer_id_t > follower_peer_ids; + for (const auto& member : members) { + if (member.id != leader_uuid) { follower_peer_ids.insert(member.id); } + } + + // ========== Verify Missing Blobs ========== + LOGINFO("Verifying missing blobs detection"); + verify_missing_blobs(deep_scrub_report, leader_uuid, BlobRoute{blob_op_shard_id, leader_missing_blob_id}); + for (const auto& peer_id : follower_peer_ids) { + verify_missing_blobs(deep_scrub_report, peer_id, BlobRoute{blob_op_shard_id, missing_blob_id}); + } + + // ========== Verify Missing Shards ========== + LOGINFO("Verifying missing shards detection"); + for (const auto& peer_id : follower_peer_ids) { + verify_missing_shards(deep_scrub_report, peer_id, missing_shard_id); + } + + // ========== Verify Corrupted Blobs ========== + LOGINFO("Verifying corrupted blobs detection"); + verify_corrupted_blobs(deep_scrub_report, leader_uuid, BlobRoute{blob_op_shard_id, leader_corrupted_blob_id}); + for (const auto& peer_id : follower_peer_ids) { + verify_corrupted_blobs(deep_scrub_report, peer_id, BlobRoute{blob_op_shard_id, corrupted_blob_id}); + } + + // False positive guard: still only one missing shard (follower's missing_shard_id); + // leader corruption was blob-level only, no shard should be newly added. + EXPECT_EQ(deep_scrub_report->get_missing_shard_ids().size(), 1u) + << "Exactly one shard should be reported missing in combined leader+follower test"; + + // False positive guard: each node should have exactly the one blob we corrupted. 
+ { + const auto& all_corrupted = deep_scrub_report->get_corrupted_blobs(); + auto lit = all_corrupted.find(leader_uuid); + if (lit != all_corrupted.end()) { + EXPECT_EQ(lit->second.size(), 1u) << "Leader should have exactly 1 corrupted blob"; + } + for (const auto& peer_id : follower_peer_ids) { + auto fit = all_corrupted.find(peer_id); + if (fit != all_corrupted.end()) { + EXPECT_EQ(fit->second.size(), 1u) + << "Follower peer_id=" << peer_id << " should have exactly 1 corrupted blob"; + } + } + } + + // ========== Verify Inconsistent Blobs ========== + const auto inconsistent_blobs = deep_scrub_report->get_inconsistent_blobs(); + LOGINFO("Verifying inconsistent blobs detection, inconsistent_blobs.size()={}", inconsistent_blobs.size()); + + // Should have 2 inconsistent blobs: one from follower test, one from leader test + EXPECT_TRUE(inconsistent_blobs.size() == 2) + << "Should have 2 inconsistent blobs (1 from follower, 1 from leader)"; + + // Verify leader's inconsistent blob + auto leader_inconsistent_it = inconsistent_blobs.find(BlobRoute{blob_op_shard_id, leader_inconsistent_blob_id}); + EXPECT_TRUE(leader_inconsistent_it != inconsistent_blobs.end()) + << "The leader's inconsistent blob should be reported in deep scrub report"; + if (leader_inconsistent_it != inconsistent_blobs.end()) { + auto& inconsistent_blob_peers = leader_inconsistent_it->second; + // All peers including leader should be in the inconsistent blob report + EXPECT_TRUE(inconsistent_blob_peers.size() == follower_peer_ids.size() + 1) + << "Leader's inconsistent blob should be reported for all peers including leader"; + EXPECT_TRUE(inconsistent_blob_peers.count(leader_uuid) == 1) + << "Leader should be in the inconsistent blob peers"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(inconsistent_blob_peers.count(peer_id) == 1) + << "Follower peer_id=" << peer_id << " should be in leader's inconsistent blob peers"; + } + } + + // Verify follower's inconsistent blob (from earlier test) + auto follower_inconsistent_it = inconsistent_blobs.find(BlobRoute{blob_op_shard_id, inconsistent_blob_id}); + EXPECT_TRUE(follower_inconsistent_it != inconsistent_blobs.end()) + << "The follower's inconsistent blob should be reported in deep scrub report"; + if (follower_inconsistent_it != inconsistent_blobs.end()) { + auto& inconsistent_blob_peers = follower_inconsistent_it->second; + // All peers should be in the inconsistent blob report + EXPECT_TRUE(inconsistent_blob_peers.size() == follower_peer_ids.size() + 1) + << "Follower's inconsistent blob should be reported for all peers"; + EXPECT_TRUE(inconsistent_blob_peers.count(leader_uuid) == 1) + << "Leader should be in follower's inconsistent blob peers"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(inconsistent_blob_peers.count(peer_id) == 1) + << "Follower peer_id=" << peer_id << " should be in follower's inconsistent blob peers"; + } + } + }); + + g_helper->sync(); +} + +// Test leader missing an entire shard: followers have the shard but leader's index doesn't. +// Verifies that deep and shallow scrub both detect the missing shard and all its blobs. 
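+// The shard is removed only from the leader's local B-tree index (its data blocks are untouched),
+// using the same delete_shard_from_index helper exercised against the followers in BasicScrubTest.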
+TEST_F(HomeObjectFixture, LeaderMissingShardTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const uint64_t num_blobs_per_shard = SISL_OPTIONS["num_blobs"].as< uint64_t >(); + const uint64_t shard_size = 64 * Mi; + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_blob_id[pg_id] = 0; + + // Two shards: one will be deleted from the leader's index, one stays healthy on all peers. + auto missing_shard_info = create_shard(pg_id, shard_size, "leader missing shard"); + auto healthy_shard_info = create_shard(pg_id, shard_size, "healthy shard"); + pg_shard_id_vec[pg_id] = {missing_shard_info.id, healthy_shard_info.id}; + + const auto leader_missing_shard_id = missing_shard_info.id; + const auto healthy_shard_id = healthy_shard_info.id; + + auto shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs_per_shard, pg_blob_id); + verify_get_blob(pg_shard_id_vec, num_blobs_per_shard); + + g_helper->sync(); + + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; + + // Simulate leader losing the shard by removing it from the local B-tree index only. + // Followers still have the shard, so scrub should detect the leader as missing it. + run_on_pg_leader(pg_id, [&]() { + delete_shard_from_index(hs_pg->index_table_, leader_missing_shard_id); + LOGINFO("Deleted shard {} from leader index to simulate leader missing shard", leader_missing_shard_id); + }); + + g_helper->sync(); + + run_on_pg_leader(pg_id, [&]() { + const auto& leader_uuid = _obj_inst->our_uuid(); + std::set< peer_id_t > follower_peer_ids; + for (const auto& member : hs_pg->pg_info_.members) { + if (member.id != leader_uuid) { follower_peer_ids.insert(member.id); } + } + + // ===== Deep scrub ===== + auto scrub_report = scrub_mgr->submit_scrub_task(pg_id, true, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_NE(scrub_report, nullptr) << "Deep scrub report should not be null"; + auto deep_scrub_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(scrub_report); + ASSERT_NE(deep_scrub_report, nullptr) << "Should be DeepScrubReport"; + + // The missing shard must appear in missing_shard_ids. + // missing_shard_ids[shard_id] = peers that HAVE the shard; leader absent from this set. + verify_missing_shards(deep_scrub_report, leader_uuid, leader_missing_shard_id); + { + const auto& missing_shards = deep_scrub_report->get_missing_shard_ids(); + auto it = missing_shards.find(leader_missing_shard_id); + ASSERT_TRUE(it != missing_shards.end()) << "leader_missing_shard_id must be in missing_shard_ids"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 1) + << "Follower peer_id=" << peer_id << " should be in peer set (it has the shard)"; + } + // Healthy shard must not appear as missing. + EXPECT_TRUE(missing_shards.find(healthy_shard_id) == missing_shards.end()) + << "Healthy shard should not be reported as missing"; + } + + // Every blob in the missing shard must be reported as missing on the leader. + for (const auto& [blob_id, _] : shard_blob_ids_map[leader_missing_shard_id]) { + verify_missing_blobs(deep_scrub_report, leader_uuid, BlobRoute{leader_missing_shard_id, blob_id}); + } + + // Blobs from the healthy shard must not appear in missing_blobs. 
+ { + const auto& missing_blobs = deep_scrub_report->get_missing_blobs(); + for (const auto& [blob_id, _] : shard_blob_ids_map[healthy_shard_id]) { + EXPECT_TRUE(missing_blobs.find(BlobRoute{healthy_shard_id, blob_id}) == missing_blobs.end()) + << "Healthy blob " << blob_id << " should not be reported as missing"; + } + } + + // ===== Shallow scrub ===== + auto shallow_scrub_report = scrub_mgr->submit_scrub_task(pg_id, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_NE(shallow_scrub_report, nullptr) << "Shallow scrub report should not be null"; + + // Missing shard must be detected in shallow scrub as well. + { + const auto& shallow_missing_shards = shallow_scrub_report->get_missing_shard_ids(); + auto it = shallow_missing_shards.find(leader_missing_shard_id); + EXPECT_TRUE(it != shallow_missing_shards.end()) << "Missing shard should appear in shallow scrub report"; + if (it != shallow_missing_shards.end()) { + EXPECT_TRUE(it->second.count(leader_uuid) == 0) << "Leader should not have the missing shard"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 1) + << "Follower peer_id=" << peer_id << " should have the shard"; + } + } + } + + // Shallow scrub missing_blobs should contain exactly the blobs from the missing shard. + EXPECT_EQ(shallow_scrub_report->get_missing_blobs().size(), num_blobs_per_shard) + << "Missing blob count should equal num_blobs_per_shard"; + { + const auto& shallow_missing_blobs = shallow_scrub_report->get_missing_blobs(); + for (const auto& [blob_id, _] : shard_blob_ids_map[leader_missing_shard_id]) { + auto it = shallow_missing_blobs.find(BlobRoute{leader_missing_shard_id, blob_id}); + EXPECT_TRUE(it != shallow_missing_blobs.end()) + << "Blob " << blob_id << " from missing shard should be in shallow scrub report"; + if (it != shallow_missing_blobs.end()) { + EXPECT_TRUE(it->second.count(leader_uuid) == 0) << "Leader should not have blob " << blob_id; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 1) + << "Follower peer_id=" << peer_id << " should have blob " << blob_id; + } + } + } + } + }); + + g_helper->sync(); +} + +// Test scrub superblock persistence across deep and shallow scrubs +TEST_F(HomeObjectFixture, ScrubSuperblockPersistenceTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + + const uint64_t shard_size = 64 * Mi; + create_shard(pg_id, shard_size, "shard_meta"); + auto scrub_mgr = _obj_inst->scrub_manager(); + + run_on_pg_leader(pg_id, [&]() { + // Get initial scrub superblock (should be newly created) + auto initial_sb = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(initial_sb.has_value()) << "Should have scrub superblock"; + + auto initial_deep_scrub_time = initial_sb->last_deep_scrub_timestamp; + auto initial_shallow_scrub_time = initial_sb->last_shallow_scrub_timestamp; + + // Give some time to ensure timestamps will be different + std::this_thread::sleep_for(std::chrono::seconds(2)); + + // Run a deep scrub + scrub_mgr->submit_scrub_task(pg_id, true /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + // Check that deep scrub timestamp updated + auto after_deep_sb = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(after_deep_sb.has_value()); + EXPECT_GT(after_deep_sb->last_deep_scrub_timestamp, initial_deep_scrub_time) + << "Deep scrub timestamp should be updated"; + EXPECT_EQ(after_deep_sb->last_shallow_scrub_timestamp, initial_shallow_scrub_time) + << "Shallow scrub timestamp should not change after deep scrub"; + + 
std::this_thread::sleep_for(std::chrono::seconds(2)); + + // Run a shallow scrub + scrub_mgr->submit_scrub_task(pg_id, false /* is_deep */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + // Check that shallow scrub timestamp updated + auto after_shallow_sb = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(after_shallow_sb.has_value()); + EXPECT_EQ(after_shallow_sb->last_deep_scrub_timestamp, after_deep_sb->last_deep_scrub_timestamp) + << "Deep scrub timestamp should not change after shallow scrub"; + EXPECT_GT(after_shallow_sb->last_shallow_scrub_timestamp, after_deep_sb->last_shallow_scrub_timestamp) + << "Shallow scrub timestamp should be updated"; + }); + + g_helper->sync(); +} + +// Test cancel scrub task +TEST_F(HomeObjectFixture, CancelScrubTaskTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const uint64_t shard_size = 64 * Mi; + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + + const uint64_t num_blobs = 10; + put_blobs(pg_shard_id_vec, num_blobs, pg_blob_id); + g_helper->sync(); + + // Submit a scrub task and then cancel it + run_on_pg_leader(pg_id, [&]() { + auto scrub_future = scrub_mgr->submit_scrub_task(pg_id, true, SCRUB_TRIGGER_TYPE::MANUALLY); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + scrub_mgr->cancel_scrub_task(pg_id); + LOGINFO("Cancelled scrub task for pg={}", pg_id); + auto scrub_report = std::move(scrub_future).get(); + LOGINFO("Scrub task cancelled, report: {}", scrub_report ? "present" : "null"); + + // The critical invariant: cancel must clear in_scrubbing so that a subsequent + // submit_scrub_task is accepted. A null return here means the state was not cleaned up. 
+ auto followup_report = scrub_mgr->submit_scrub_task(pg_id, true, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + EXPECT_NE(followup_report, nullptr) << "A new scrub task should be accepted after cancellation; " + "null means in_scrubbing was not cleared"; + scrub_mgr->cancel_scrub_task(pg_id); + LOGINFO("Cancel non-existent scrub task for pg={} - should not crash", pg_id); + }); + + g_helper->sync(); +} + +// Test concurrent scrubs on multiple PGs +TEST_F(HomeObjectFixture, ConcurrentScrubsOnMultiplePGsTest) { + const uint64_t num_pgs = 3; + const uint64_t shard_size = 64 * Mi; + + std::vector< pg_id_t > pg_ids; + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + + // Create multiple PGs with shards and blobs + for (uint64_t i = 1; i <= num_pgs; ++i) { + pg_id_t pg_id = i; + pg_ids.push_back(pg_id); + create_pg(pg_id); + auto shard_info = create_shard(pg_id, shard_size, "shard meta " + std::to_string(pg_id)); + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + put_blobs(pg_shard_id_vec, 5, pg_blob_id); + } + + auto scrub_mgr = _obj_inst->scrub_manager(); + + // Submit scrub tasks for all PGs concurrently + std::vector< std::pair< pg_id_t, folly::SemiFuture< std::shared_ptr< ScrubManager::ShallowScrubReport > > > > + scrub_futures; + + for (const auto& pg_id : pg_ids) { + run_on_pg_leader(pg_id, [&]() { + auto future = scrub_mgr->submit_scrub_task(pg_id, true, SCRUB_TRIGGER_TYPE::MANUALLY); + scrub_futures.emplace_back(pg_id, std::move(future)); + LOGINFO("Submitted deep scrub for pg={}", pg_id); + }); + } + + // Wait for all scrub tasks to complete and verify each report is clean + for (auto& [pg_id, future] : scrub_futures) { + auto report = std::move(future).get(); + ASSERT_NE(report, nullptr) << "Scrub report should not be null for pg=" << pg_id; + + auto deep_report = std::dynamic_pointer_cast< ScrubManager::DeepScrubReport >(report); + ASSERT_NE(deep_report, nullptr) << "Should be DeepScrubReport for pg=" << pg_id; + + EXPECT_TRUE(deep_report->get_missing_shard_ids().empty()) << "pg=" << pg_id << " should have no missing shards"; + EXPECT_TRUE(deep_report->get_missing_blobs().empty()) << "pg=" << pg_id << " should have no missing blobs"; + EXPECT_TRUE(deep_report->get_corrupted_blobs().empty()) << "pg=" << pg_id << " should have no corrupted blobs"; + EXPECT_TRUE(deep_report->get_inconsistent_blobs().empty()) + << "pg=" << pg_id << " should have no inconsistent blobs"; + LOGINFO("PG {} concurrent scrub completed cleanly", pg_id); + } + + g_helper->sync(); +} + +// Test deleted blob filter in scrub report +TEST_F(HomeObjectFixture, ReconcileScrubReportTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const uint64_t shard_size = 64 * Mi; + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + + std::map< shard_id_t, std::map< blob_id_t, uint64_t > > shard_blob_ids_map; + + // Create some blobs + const uint64_t num_blobs = 10; + shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs, pg_blob_id); + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; + + const auto shard_id = shard_info.id; + auto& shard_blobs = shard_blob_ids_map[shard_id]; + + // Select blobs to test: + // - 
missing_blob_to_delete: will be missing from leader index AND deleted via blob delete + // - missing_blob_not_deleted: will be missing from leader index but NOT deleted + auto it = shard_blobs.begin(); + const auto missing_blob_to_delete = it->first; // First blob: will be deleted via blob delete + const auto missing_blob_not_deleted = (++it)->first; // Second blob: will NOT be deleted + + // Delete both blobs from index table to simulate missing blobs on followers + run_on_pg_follower(pg_id, [&]() { + auto& pg_index_table = hs_pg->index_table_; + delete_blob_from_index(pg_index_table, shard_id, missing_blob_to_delete); + delete_blob_from_index(pg_index_table, shard_id, missing_blob_not_deleted); + LOGINFO("Deleted blobs {} and {} from follower index table", missing_blob_to_delete, missing_blob_not_deleted); + }); + + g_helper->sync(); + + run_on_pg_leader(pg_id, [&]() { + // only the blob that was deleted via blob delete should be filtered out, the other missing blob should be + // reported in the scrub report + std::set< peer_id_t > follower_peer_ids; + const auto& leader_uuid = _obj_inst->our_uuid(); + const auto& members = (hs_pg->pg_info_).members; + for (const auto& member : members) { + if (member.id == leader_uuid) { continue; } + follower_peer_ids.insert(member.id); + } + + auto scrub_report = + scrub_mgr->submit_scrub_task(pg_id, false /* shallow */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + // missing_blobs[blob_route] = peers that have the blob; followers are absent from that set. + auto missing_blobs = scrub_report->get_missing_blobs(); + EXPECT_TRUE(missing_blobs.size() == 2) << "There should be two missing blobs in scrub report"; + for (const auto& blob_route : + {BlobRoute{shard_id, missing_blob_to_delete}, BlobRoute{shard_id, missing_blob_not_deleted}}) { + auto it = missing_blobs.find(blob_route); + ASSERT_TRUE(it != missing_blobs.end()) << "Missing blob should be reported in scrub report"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 0) + << "Follower peer_id=" << peer_id << " should not have the missing blob"; + } + } + +#ifdef _PRERELEASE + set_callback_flip( + "delete_missing_blob_through_raft", std::function< void() >([this, missing_blob_to_delete, shard_id]() { + auto ret = + _obj_inst->blob_manager()->del(shard_id, missing_blob_to_delete, generateRandomTraceId()).get(); + if (!ret) { + FAIL() << "Blob deletion via raft failed for shard=" << shard_id + << " blob=" << missing_blob_to_delete << ", error=" << fmt::format("{}", ret.error()); + } else { + LOGINFO("Successfully deleted blob {} in shard {} via raft", missing_blob_to_delete, shard_id); + } + // wait until all the pending gc tasks for this pg are completed + std::this_thread::sleep_for(std::chrono::seconds(2)); + })); + + scrub_report = scrub_mgr->submit_scrub_task(pg_id, false /* shallow */, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + + remove_flip("delete_missing_blob_through_raft"); + + // Verify the scrub report + ASSERT_NE(scrub_report, nullptr) << "Scrub report should not be null"; + + missing_blobs = scrub_report->get_missing_blobs(); + EXPECT_TRUE(missing_blobs.size() == 1) << "There should be one missing blob in scrub report after deletion"; + { + auto it = missing_blobs.find(BlobRoute{shard_id, missing_blob_not_deleted}); + ASSERT_TRUE(it != missing_blobs.end()) + << "The missing blob that was not deleted should be reported in scrub report"; + for (const auto& peer_id : follower_peer_ids) { + EXPECT_TRUE(it->second.count(peer_id) == 0) + << "Follower 
peer_id=" << peer_id << " should not have the missing blob"; + } + } +#endif + }); + + g_helper->sync(); +} + +// Test add and remove PG from scrub manager +TEST_F(HomeObjectFixture, AddRemovePGScrubTest) { + const pg_id_t pg_id = 1; + const uint64_t shard_size = 64 * Mi; + + // Create PG and verify scrub superblock is created + create_pg(pg_id); + create_shard(pg_id, shard_size, "shard meta"); + + auto scrub_mgr = _obj_inst->scrub_manager(); + + // Verify scrub superblock exists + run_on_pg_leader(pg_id, [&]() { + auto sb = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(sb.has_value()) << "Scrub superblock should exist after PG creation"; + LOGINFO("Scrub superblock created for pg={}", pg_id); + }); + + // Run a scrub to update timestamps + run_on_pg_leader(pg_id, [&]() { + // Get initial timestamp before scrub + auto sb_before = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(sb_before.has_value()) << "Scrub superblock should exist before scrub"; + uint64_t timestamp_before = sb_before->last_shallow_scrub_timestamp; + LOGINFO("Timestamp before scrub: {}", timestamp_before); + + // Wait a bit to ensure timestamp will be different + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + auto report = scrub_mgr->submit_scrub_task(pg_id, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_NE(report, nullptr) << "Scrub report should not be null"; + + // Verify timestamp was updated after scrub + auto sb_after = scrub_mgr->get_scrub_superblk(pg_id); + ASSERT_TRUE(sb_after.has_value()) << "Scrub superblock should exist after scrub"; + uint64_t timestamp_after = sb_after->last_shallow_scrub_timestamp; + EXPECT_GT(timestamp_after, timestamp_before) << "Shallow scrub timestamp should be updated after scrub"; + LOGINFO("Timestamp after scrub: {} (updated from {})", timestamp_after, timestamp_before); + }); + g_helper->sync(); + + // Now delete the PG - this should cancel any running scrub and remove superblock + _obj_inst->pg_manager()->destroy_pg(pg_id); + auto report = scrub_mgr->submit_scrub_task(pg_id, false, SCRUB_TRIGGER_TYPE::MANUALLY).get(); + ASSERT_EQ(report, nullptr) << "Scrub report should be null after PG deletion"; + LOGINFO("Scrub task for deleted pg={} returned null report as expected", pg_id); + + // Verify scrub superblock is cleaned up: remove_pg erases the entry from m_pg_scrub_sb_map, + // so get_scrub_superblk must return nullopt after the PG is fully deleted. 
+ auto cleaned_sb = scrub_mgr->get_scrub_superblk(pg_id); + EXPECT_FALSE(cleaned_sb.has_value()) << "Scrub superblock should be removed from scrub manager after PG deletion"; + LOGINFO("PG deleted, scrub superblock correctly cleaned up"); +} + +// Test local scrub methods +TEST_F(HomeObjectFixture, LocalScrubMethodsTest) { + const pg_id_t pg_id = 1; + create_pg(pg_id); + auto scrub_mgr = _obj_inst->scrub_manager(); + + const uint64_t shard_size = 64 * Mi; + auto shard_info = create_shard(pg_id, shard_size, "shard meta"); + + std::map< pg_id_t, std::vector< shard_id_t > > pg_shard_id_vec; + std::map< pg_id_t, blob_id_t > pg_blob_id; + pg_shard_id_vec[pg_id].push_back(shard_info.id); + pg_blob_id[pg_id] = 0; + + // Create blobs first + const uint64_t num_blobs = 10; + auto shard_blob_ids_map = put_blobs(pg_shard_id_vec, num_blobs, pg_blob_id); + LOGINFO("Created {} blobs for local scrub test", num_blobs); + const auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_TRUE(hs_pg) << "PG should exist for pg_id=" << pg_id; + + const auto shard_id = shard_info.id; + auto& shard_blobs = shard_blob_ids_map[shard_id]; + + auto it = shard_blobs.begin(); + const auto corrupted_blob_id = it->first; + + // corrupt one blob's data in the index table to simulate a blob-level corruption that should be detected by local + // blob scrub. + auto& pg_index_table = hs_pg->index_table_; + corrupt_blob_data(pg_index_table, shard_id, corrupted_blob_id); + + auto my_uuid = _obj_inst->our_uuid(); + const int64_t scrub_lsn = hs_pg->repl_dev_->get_last_commit_lsn(); + + // Test local_scrub_meta: covers range [0, shard_id] + auto meta_req = std::make_shared< ScrubManager::scrub_req >(pg_id, 1, scrub_lsn, 0, 0, shard_id, UINT64_MAX, + SCRUB_TYPE::META, my_uuid); + auto meta_result = scrub_mgr->local_scrub_meta(meta_req); + ASSERT_NE(meta_result, nullptr) << "local_scrub_meta should return a result"; + LOGINFO("Meta scrub completed, {} entries", meta_result->entries.size()); + + // Test local_scrub_blob (shallow): all entries should carry ScrubStatus::NONE + auto shallow_req = std::make_shared< ScrubManager::scrub_req >(pg_id, 2, scrub_lsn, shard_id, 0, shard_id, + UINT64_MAX, SCRUB_TYPE::SHALLOW_BLOB, my_uuid); + auto shallow_result = scrub_mgr->local_scrub_blob(shallow_req); + ASSERT_NE(shallow_result, nullptr) << "local_scrub_blob (shallow) should return a result"; + LOGINFO("Shallow blob scrub completed, {} entries", shallow_result->entries.size()); + for (const auto& [route, val] : shallow_result->entries) { + auto* status = std::get_if< ScrubStatus >(&val); + ASSERT_TRUE(status != nullptr) << "Shallow entry should carry ScrubStatus"; + EXPECT_EQ(*status, ScrubStatus::NONE) << "Shallow blob entry should be NONE"; + } + + // Test local_scrub_blob (deep): should detect the corrupted blob + auto deep_req = std::make_shared< ScrubManager::scrub_req >(pg_id, 3, scrub_lsn, shard_id, 0, shard_id, UINT64_MAX, + SCRUB_TYPE::DEEP_BLOB, my_uuid); + auto deep_result = scrub_mgr->local_scrub_blob(deep_req); + ASSERT_NE(deep_result, nullptr) << "local_scrub_blob (deep) should return a result"; + LOGINFO("Deep blob scrub completed, {} entries", deep_result->entries.size()); + + auto corrupted_it = deep_result->entries.find(BlobRoute{shard_id, corrupted_blob_id}); + EXPECT_TRUE(corrupted_it != deep_result->entries.end()) << "Corrupted blob should appear in deep scrub result"; + if (corrupted_it != deep_result->entries.end()) { + auto* status = std::get_if< ScrubStatus >(&corrupted_it->second); + ASSERT_TRUE(status != nullptr) << 
"Corrupted blob result should be ScrubStatus"; + EXPECT_EQ(*status, ScrubStatus::MISMATCH) << "Corrupted blob should have MISMATCH status"; + LOGINFO("Deep scrub correctly detected corrupted blob {}", corrupted_blob_id); + } +} + +// Test scrub request serialization and deserialization +TEST_F(HomeObjectFixture, ScrubRequestSerializationTest) { + const pg_id_t pg_id = 10; + auto my_uuid = _obj_inst->our_uuid(); + + // Test META scrub_req serialization + { + auto req = std::make_shared< ScrubManager::scrub_req >(pg_id, 1, 100, 0, 0, UINT64_MAX, UINT64_MAX, + SCRUB_TYPE::META, my_uuid); + auto buffer = req->build_flat_buffer(); + EXPECT_GT(buffer.size(), 0) << "Serialized buffer should not be empty"; + auto req_loaded = std::make_shared< ScrubManager::scrub_req >(); + bool load_success = req_loaded->load(buffer.data(), buffer.size()); + EXPECT_TRUE(load_success) << "Deserialization should succeed"; + + EXPECT_EQ(req_loaded->pg_id, pg_id); + EXPECT_EQ(req_loaded->req_id, 1u); + EXPECT_EQ(req_loaded->scrub_lsn, 100); + EXPECT_EQ(req_loaded->start_shard_id, 0u); + EXPECT_EQ(req_loaded->end_shard_id, UINT64_MAX); + EXPECT_EQ(req_loaded->scrub_type, SCRUB_TYPE::META); + EXPECT_EQ(req_loaded->issuer_peer_id, my_uuid); + } + + // Test DEEP_BLOB scrub_req serialization + { + auto req = std::make_shared< ScrubManager::scrub_req >(pg_id, 2, 200, 100, 0, 200, UINT64_MAX, + SCRUB_TYPE::DEEP_BLOB, my_uuid); + auto buffer = req->build_flat_buffer(); + EXPECT_GT(buffer.size(), 0); + auto req_loaded = std::make_shared< ScrubManager::scrub_req >(); + bool load_success = req_loaded->load(buffer.data(), buffer.size()); + EXPECT_TRUE(load_success); + + EXPECT_EQ(req_loaded->pg_id, pg_id); + EXPECT_EQ(req_loaded->req_id, 2u); + EXPECT_EQ(req_loaded->scrub_lsn, 200); + EXPECT_EQ(req_loaded->start_shard_id, 100u); + EXPECT_EQ(req_loaded->end_shard_id, 200u); + EXPECT_EQ(req_loaded->scrub_type, SCRUB_TYPE::DEEP_BLOB); + } + + // Test SHALLOW_BLOB scrub_req serialization + { + auto req = std::make_shared< ScrubManager::scrub_req >(pg_id, 3, 300, 0, 0, 100, UINT64_MAX, + SCRUB_TYPE::SHALLOW_BLOB, my_uuid); + auto buffer = req->build_flat_buffer(); + EXPECT_GT(buffer.size(), 0); + auto req_loaded = std::make_shared< ScrubManager::scrub_req >(); + bool load_success = req_loaded->load(buffer.data(), buffer.size()); + EXPECT_TRUE(load_success); + + EXPECT_EQ(req_loaded->pg_id, pg_id); + EXPECT_EQ(req_loaded->req_id, 3u); + EXPECT_EQ(req_loaded->scrub_lsn, 300); + EXPECT_EQ(req_loaded->start_shard_id, 0u); + EXPECT_EQ(req_loaded->end_shard_id, 100u); + EXPECT_EQ(req_loaded->scrub_type, SCRUB_TYPE::SHALLOW_BLOB); + } +} + +// Test scrub_result serialization and deserialization. 
+// scrub_result carries three distinct entry kinds that follow different wire/load paths: +// - uint64_t hash : written as (status=NONE, hash=value); loaded back as uint64_t +// - ScrubStatus::NONE : written as (status=NONE, hash=0); loaded back as uint64_t(0) +// - non-NONE ScrubStatus: written as (status=X, hash=0); loaded back as ScrubStatus +TEST_F(HomeObjectFixture, ScrubResultSerializationTest) { + const auto my_uuid = _obj_inst->our_uuid(); + const uint64_t req_id = 99; + + // ---- Case 1: hash entries (deep blob scrub) ---- + { + auto result = std::make_shared< ScrubManager::scrub_result >(req_id, my_uuid); + result->add_entry({10, 1, uint64_t{0xDEADBEEFCAFEBABEULL}}); + result->add_entry({10, 2, uint64_t{0x123456789ABCDEF0ULL}}); + + auto buf = result->build_flat_buffer(); + EXPECT_GT(buf.size(), 0u); + + auto loaded = std::make_shared< ScrubManager::scrub_result >(); + EXPECT_TRUE(loaded->load(buf.data(), buf.size())); + + EXPECT_EQ(loaded->req_id, req_id); + EXPECT_EQ(loaded->issuer_peer_id, my_uuid); + EXPECT_EQ(loaded->entries.size(), 2u); + + auto it1 = loaded->entries.find(BlobRoute{10, 1}); + ASSERT_NE(it1, loaded->entries.end()); + auto* h1 = std::get_if< uint64_t >(&it1->second); + ASSERT_NE(h1, nullptr) << "Hash entry should deserialize as uint64_t"; + EXPECT_EQ(*h1, 0xDEADBEEFCAFEBABEULL); + + auto it2 = loaded->entries.find(BlobRoute{10, 2}); + ASSERT_NE(it2, loaded->entries.end()); + auto* h2 = std::get_if< uint64_t >(&it2->second); + ASSERT_NE(h2, nullptr); + EXPECT_EQ(*h2, 0x123456789ABCDEF0ULL); + } + + // ---- Case 2: ScrubStatus::NONE (shallow blob scrub existence entries) ---- + // On the wire NONE maps to (status=NONE, hash=0) and loads back as uint64_t(0). + { + auto result = std::make_shared< ScrubManager::scrub_result >(req_id + 1, my_uuid); + result->add_entry({20, 1, ScrubStatus::NONE}); + result->add_entry({20, 2, ScrubStatus::NONE}); + + auto buf = result->build_flat_buffer(); + auto loaded = std::make_shared< ScrubManager::scrub_result >(); + EXPECT_TRUE(loaded->load(buf.data(), buf.size())); + + EXPECT_EQ(loaded->entries.size(), 2u); + for (const BlobRoute& route : {BlobRoute{20, 1}, BlobRoute{20, 2}}) { + auto it = loaded->entries.find(route); + ASSERT_NE(it, loaded->entries.end()); + auto* h = std::get_if< uint64_t >(&it->second); + ASSERT_NE(h, nullptr) << "NONE entry should deserialize as uint64_t(0)"; + EXPECT_EQ(*h, 0u); + } + } + + // ---- Case 3: error status entries (IO_ERROR, MISMATCH) ---- + { + auto result = std::make_shared< ScrubManager::scrub_result >(req_id + 2, my_uuid); + result->add_entry({30, 1, ScrubStatus::IO_ERROR}); + result->add_entry({30, 2, ScrubStatus::MISMATCH}); + + auto buf = result->build_flat_buffer(); + auto loaded = std::make_shared< ScrubManager::scrub_result >(); + EXPECT_TRUE(loaded->load(buf.data(), buf.size())); + + EXPECT_EQ(loaded->entries.size(), 2u); + + auto it1 = loaded->entries.find(BlobRoute{30, 1}); + ASSERT_NE(it1, loaded->entries.end()); + auto* s1 = std::get_if< ScrubStatus >(&it1->second); + ASSERT_NE(s1, nullptr) << "IO_ERROR entry should deserialize as ScrubStatus"; + EXPECT_EQ(*s1, ScrubStatus::IO_ERROR); + + auto it2 = loaded->entries.find(BlobRoute{30, 2}); + ASSERT_NE(it2, loaded->entries.end()); + auto* s2 = std::get_if< ScrubStatus >(&it2->second); + ASSERT_NE(s2, nullptr) << "MISMATCH entry should deserialize as ScrubStatus"; + EXPECT_EQ(*s2, ScrubStatus::MISMATCH); + } + + // ---- Case 4: empty result ---- + { + auto result = std::make_shared< ScrubManager::scrub_result >(req_id + 3, 
my_uuid);
+        auto buf = result->build_flat_buffer();
+        auto loaded = std::make_shared< ScrubManager::scrub_result >();
+        EXPECT_TRUE(loaded->load(buf.data(), buf.size()));
+        EXPECT_EQ(loaded->req_id, req_id + 3);
+        EXPECT_TRUE(loaded->entries.empty()) << "Empty result should round-trip with no entries";
+    }
+}
diff --git a/src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp b/src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp
new file mode 100644
index 000000000..c90ad7d1b
--- /dev/null
+++ b/src/lib/homestore_backend/tests/test_mpmc_priority_queue.cpp
@@ -0,0 +1,413 @@
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <thread>
+#include <vector>
+
+#include "../MPMCPriorityQueue.hpp"
+
+using namespace homeobject;
+using namespace std::chrono_literals;
+
+// ============================================================================
+// Basic Functionality Tests
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, BasicPushPop) {
+    MPMCPriorityQueue< int > queue;
+
+    // Push elements
+    queue.push(5);
+    queue.push(2);
+    queue.push(8);
+    queue.push(1);
+
+    EXPECT_EQ(queue.size(), 4);
+    EXPECT_FALSE(queue.empty());
+
+    // Pop in priority order (max heap by default)
+    auto r1 = queue.pop();
+    EXPECT_TRUE(r1.is_ok());
+    EXPECT_EQ(r1.value.value(), 8);
+
+    auto r2 = queue.pop();
+    EXPECT_TRUE(r2.is_ok());
+    EXPECT_EQ(r2.value.value(), 5);
+
+    auto r3 = queue.pop();
+    EXPECT_TRUE(r3.is_ok());
+    EXPECT_EQ(r3.value.value(), 2);
+
+    auto r4 = queue.pop();
+    EXPECT_TRUE(r4.is_ok());
+    EXPECT_EQ(r4.value.value(), 1);
+
+    EXPECT_EQ(queue.size(), 0);
+    EXPECT_TRUE(queue.empty());
+}
+
+TEST(MPMCPriorityQueueTest, CustomComparator) {
+    // Min-heap using std::greater
+    MPMCPriorityQueue< int, std::greater< int > > queue;
+
+    queue.push(5);
+    queue.push(2);
+    queue.push(8);
+    queue.push(1);
+
+    // Pop in ascending order
+    EXPECT_EQ(queue.pop().value.value(), 1);
+    EXPECT_EQ(queue.pop().value.value(), 2);
+    EXPECT_EQ(queue.pop().value.value(), 5);
+    EXPECT_EQ(queue.pop().value.value(), 8);
+}
+
+TEST(MPMCPriorityQueueTest, MoveSemantics) {
+    struct MoveOnly {
+        int value;
+
+        explicit MoveOnly(int v) : value(v) {}
+        MoveOnly(const MoveOnly&) = delete;
+        MoveOnly& operator=(const MoveOnly&) = delete;
+        MoveOnly(MoveOnly&&) = default;
+        MoveOnly& operator=(MoveOnly&&) = default;
+
+        bool operator<(const MoveOnly& other) const { return value < other.value; }
+    };
+
+    MPMCPriorityQueue< MoveOnly > queue;
+
+    queue.push(MoveOnly(5));
+    queue.push(MoveOnly(2));
+    queue.push(MoveOnly(8));
+
+    EXPECT_EQ(queue.pop().value.value().value, 8);
+    EXPECT_EQ(queue.pop().value.value().value, 5);
+    EXPECT_EQ(queue.pop().value.value().value, 2);
+}
+
+// ============================================================================
+// Close Operation Tests
+// ============================================================================
+
+TEST(MPMCPriorityQueueTest, Close) {
+    MPMCPriorityQueue< int > queue;
+
+    queue.push(1);
+    queue.push(2);
+    queue.push(3);
+
+    EXPECT_FALSE(queue.is_closed());
+    queue.close();
+    EXPECT_TRUE(queue.is_closed());
+
+    // Can still pop existing elements
+    EXPECT_EQ(queue.pop().value.value(), 3);
+    EXPECT_EQ(queue.pop().value.value(), 2);
+    EXPECT_EQ(queue.pop().value.value(), 1);
+
+    // Now should return Closed status
+    auto result = queue.pop();
+    EXPECT_TRUE(result.is_closed());
+    EXPECT_FALSE(result.value.has_value());
+}
+
+TEST(MPMCPriorityQueueTest, PushAfterClose) {
+    MPMCPriorityQueue< int > queue;
+
+    queue.push(1);
+    
queue.close(); + + // Pushes after close are ignored + queue.push(2); + queue.push(3); + + EXPECT_EQ(queue.size(), 1); + + auto r1 = queue.pop(); + EXPECT_TRUE(r1.is_ok()); + EXPECT_EQ(r1.value.value(), 1); + + auto r2 = queue.pop(); + EXPECT_TRUE(r2.is_closed()); +} + +TEST(MPMCPriorityQueueTest, CloseIdempotent) { + MPMCPriorityQueue< int > queue; + + queue.push(1); + queue.close(); + queue.close(); // Should be safe + queue.close(); + + EXPECT_TRUE(queue.is_closed()); + EXPECT_EQ(queue.size(), 1); +} + +// ============================================================================ +// Blocking Behavior Tests +// ============================================================================ + +TEST(MPMCPriorityQueueTest, BlockingPop) { + MPMCPriorityQueue< int > queue; + std::atomic< bool > pop_started{false}; + std::atomic< bool > pop_completed{false}; + + // Consumer thread that will block + std::thread consumer([&]() { + pop_started = true; + auto result = queue.pop(); + pop_completed = true; + + EXPECT_TRUE(result.is_ok()); + EXPECT_EQ(result.value.value(), 42); + }); + + // Wait for consumer to start + while (!pop_started) { + std::this_thread::yield(); + } + + std::this_thread::sleep_for(50ms); + EXPECT_FALSE(pop_completed); + + // Unblock consumer by pushing + queue.push(42); + + consumer.join(); + EXPECT_TRUE(pop_completed); +} + +TEST(MPMCPriorityQueueTest, CloseUnblocksWaiters) { + MPMCPriorityQueue< int > queue; + std::atomic< int > closed_count{0}; + + // Start multiple waiting consumers + std::vector< std::thread > consumers; + for (int i = 0; i < 5; ++i) { + consumers.emplace_back([&]() { + auto result = queue.pop(); + if (result.is_closed()) { closed_count.fetch_add(1, std::memory_order_relaxed); } + }); + } + + std::this_thread::sleep_for(100ms); + + // Close should wake all waiters + queue.close(); + + for (auto& t : consumers) { + t.join(); + } + + EXPECT_EQ(closed_count.load(), 5); +} + +// ============================================================================ +// Multi-threaded Producer Tests +// ============================================================================ + +TEST(MPMCPriorityQueueTest, MultipleProducers) { + MPMCPriorityQueue< int > queue; + constexpr int num_producers = 4; + constexpr int items_per_producer = 250; + + std::vector< std::thread > producers; + for (int i = 0; i < num_producers; ++i) { + producers.emplace_back([&, i]() { + for (int j = 0; j < items_per_producer; ++j) { + queue.push(i * items_per_producer + j); + } + }); + } + + for (auto& t : producers) { + t.join(); + } + + EXPECT_EQ(queue.size(), num_producers * items_per_producer); + + // Verify all elements come out in descending order + std::vector< int > popped; + for (int i = 0; i < num_producers * items_per_producer; ++i) { + auto result = queue.pop(); + ASSERT_TRUE(result.is_ok()); + popped.push_back(result.value.value()); + } + + EXPECT_TRUE(std::is_sorted(popped.rbegin(), popped.rend())); +} + +// ============================================================================ +// Multi-threaded Consumer Tests +// ============================================================================ + +TEST(MPMCPriorityQueueTest, MultipleConsumers) { + MPMCPriorityQueue< int > queue; + constexpr int num_items = 1000; + + // Fill queue + for (int i = 0; i < num_items; ++i) { + queue.push(i); + } + + constexpr int num_consumers = 4; + std::vector< std::thread > consumers; + std::atomic< int > total_consumed{0}; + + for (int i = 0; i < num_consumers; ++i) { + consumers.emplace_back([&]() 
{ + int count = 0; + while (true) { + auto result = queue.pop(); + if (result.is_closed()) { break; } + ++count; + } + total_consumed.fetch_add(count, std::memory_order_relaxed); + }); + } + + // Give consumers time to start + std::this_thread::sleep_for(50ms); + + // Close to signal completion + queue.close(); + + for (auto& t : consumers) { + t.join(); + } + + EXPECT_EQ(total_consumed.load(), num_items); +} + +// ============================================================================ +// Concurrent Producers and Consumers +// ============================================================================ + +TEST(MPMCPriorityQueueTest, ConcurrentProducersConsumers) { + MPMCPriorityQueue< int > queue; + constexpr int num_producers = 3; + constexpr int num_consumers = 3; + constexpr int items_per_producer = 200; + + std::atomic< int > total_consumed{0}; + std::vector< std::thread > threads; + + // Start consumers + for (int i = 0; i < num_consumers; ++i) { + threads.emplace_back([&]() { + int count = 0; + while (true) { + auto result = queue.pop(); + if (result.is_closed()) { break; } + ++count; + } + total_consumed.fetch_add(count, std::memory_order_relaxed); + }); + } + + // Start producers + for (int i = 0; i < num_producers; ++i) { + threads.emplace_back([&, i]() { + for (int j = 0; j < items_per_producer; ++j) { + queue.push(i * items_per_producer + j); + std::this_thread::sleep_for(10us); // Simulate work + } + }); + } + + // Wait for producers + for (int i = num_consumers; i < num_consumers + num_producers; ++i) { + threads[i].join(); + } + + // Close and wait for consumers + queue.close(); + for (int i = 0; i < num_consumers; ++i) { + threads[i].join(); + } + + EXPECT_EQ(total_consumed.load(), num_producers * items_per_producer); +} + +// ============================================================================ +// Stress Test +// ============================================================================ + +TEST(MPMCPriorityQueueTest, StressTest) { + MPMCPriorityQueue< int > queue; + constexpr int num_threads = 8; + constexpr int operations_per_thread = 1000; + + std::atomic< int > push_count{0}; + std::atomic< int > pop_count{0}; + std::vector< std::thread > threads; + + // Half producers, half consumers + for (int i = 0; i < num_threads / 2; ++i) { + threads.emplace_back([&]() { + for (int j = 0; j < operations_per_thread; ++j) { + queue.push(j); + push_count.fetch_add(1, std::memory_order_relaxed); + } + }); + } + + for (int i = 0; i < num_threads / 2; ++i) { + threads.emplace_back([&]() { + for (int j = 0; j < operations_per_thread; ++j) { + auto result = queue.pop(); + if (result.is_ok()) { pop_count.fetch_add(1, std::memory_order_relaxed); } + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + EXPECT_EQ(push_count.load(), (num_threads / 2) * operations_per_thread); + + // Pop remaining elements + while (!queue.empty()) { + auto result = queue.pop(); + if (result.is_ok()) { pop_count.fetch_add(1, std::memory_order_relaxed); } + } + + EXPECT_EQ(pop_count.load(), push_count.load()); +} + +// ============================================================================ +// Destructor Test +// ============================================================================ + +TEST(MPMCPriorityQueueTest, DestructorClosesQueue) { + std::atomic< bool > consumer_unblocked{false}; + + std::thread consumer([&]() { + auto queue = std::make_unique< MPMCPriorityQueue< int > >(); + queue->push(1); + + std::thread waiter([&, q = queue.get()]() { + auto first_result = q->pop(); // 
Pop the 1 + (void)first_result; // Explicitly ignore the result + auto result = q->pop(); // This will block until destructor closes queue + if (result.is_closed()) { consumer_unblocked = true; } + }); + + std::this_thread::sleep_for(100ms); + // Destructor will be called here + queue.reset(); + + waiter.join(); + }); + + consumer.join(); + EXPECT_TRUE(consumer_unblocked); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}
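
Note for reviewers reading the tests without the new header open: MPMCPriorityQueue.hpp itself (added by this patch) is not reproduced above, so the following is a minimal, self-contained sketch of the interface these tests exercise (push/pop/close/is_closed/size/empty, with pop() returning a status plus an optional value). It only illustrates the assumed contract; the result-type name (PopResult) and all internals here are assumptions, not the patch's actual implementation.

#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <optional>
#include <queue>
#include <utility>
#include <vector>

namespace sketch {

// What pop() hands back: either a value, or a signal that the queue was closed and drained.
template < typename T >
struct PopResult {
    enum class Status { Ok, Closed };
    Status status;
    std::optional< T > value;

    bool is_ok() const { return status == Status::Ok; }
    bool is_closed() const { return status == Status::Closed; }
};

template < typename T, typename Compare = std::less< T > >
class MPMCPriorityQueue {
public:
    // Pushes after close() are silently dropped; otherwise enqueue and wake one waiter.
    void push(T item) {
        {
            std::lock_guard< std::mutex > lk(mtx_);
            if (closed_) { return; }
            heap_.push(std::move(item));
        }
        cv_.notify_one();
    }

    // Blocks until an element is available or the queue is closed and empty.
    PopResult< T > pop() {
        std::unique_lock< std::mutex > lk(mtx_);
        cv_.wait(lk, [this] { return !heap_.empty() || closed_; });
        if (heap_.empty()) { return {PopResult< T >::Status::Closed, std::nullopt}; }
        // priority_queue::top() is const; move out via const_cast before popping.
        T item = std::move(const_cast< T& >(heap_.top()));
        heap_.pop();
        return {PopResult< T >::Status::Ok, std::move(item)};
    }

    // Idempotent; wakes every blocked consumer. Already-queued elements remain poppable.
    void close() {
        {
            std::lock_guard< std::mutex > lk(mtx_);
            closed_ = true;
        }
        cv_.notify_all();
    }

    bool is_closed() const {
        std::lock_guard< std::mutex > lk(mtx_);
        return closed_;
    }

    std::size_t size() const {
        std::lock_guard< std::mutex > lk(mtx_);
        return heap_.size();
    }

    bool empty() const { return size() == 0; }

    // Destructor closes the queue so blocked consumers are released (see DestructorClosesQueue).
    ~MPMCPriorityQueue() { close(); }

private:
    mutable std::mutex mtx_;
    std::condition_variable cv_;
    std::priority_queue< T, std::vector< T >, Compare > heap_;
    bool closed_{false};
};

} // namespace sketch

With std::less the default behaves as a max-heap (BasicPushPop pops 8, 5, 2, 1); passing std::greater turns it into a min-heap, which is what the CustomComparator test relies on.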