From 47a45721bc8fff2dabaeef2a508d8e48dca74d27 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 11 Nov 2025 10:57:47 -0500 Subject: [PATCH 01/48] Initial commit --- conanfile.py | 2 +- .../core/algo/compute_running_total.hpp | 4 +- src/turtle_kv/core/testing/generate.hpp | 14 +- src/turtle_kv/kv_store.cpp | 4 +- src/turtle_kv/kv_store_scanner.cpp | 9 +- src/turtle_kv/tree/algo/segmented_levels.hpp | 12 + src/turtle_kv/tree/algo/segments.hpp | 16 + src/turtle_kv/tree/batch_update.cpp | 124 +++- src/turtle_kv/tree/batch_update.hpp | 28 +- src/turtle_kv/tree/in_memory_leaf.cpp | 25 + src/turtle_kv/tree/in_memory_leaf.hpp | 6 +- src/turtle_kv/tree/in_memory_node.cpp | 600 +++++++++++++++++- src/turtle_kv/tree/in_memory_node.hpp | 36 ++ src/turtle_kv/tree/in_memory_node.test.cpp | 30 +- src/turtle_kv/tree/packed_leaf_page.hpp | 33 +- src/turtle_kv/tree/subtree.cpp | 163 ++++- src/turtle_kv/tree/subtree.hpp | 25 + .../tree/testing/random_leaf_generator.hpp | 2 +- 18 files changed, 1073 insertions(+), 60 deletions(-) diff --git a/conanfile.py b/conanfile.py index 9492d30..0d0b7ef 100644 --- a/conanfile.py +++ b/conanfile.py @@ -60,7 +60,7 @@ def requirements(self): self.requires("gperftools/2.16", **VISIBLE) self.requires("llfs/0.42.0", **VISIBLE) self.requires("pcg-cpp/cci.20220409", **VISIBLE) - self.requires("vqf/0.2.5", **VISIBLE) + self.requires("vqf/0.2.5-devel", **VISIBLE) self.requires("zlib/1.3.1", **OVERRIDE) if platform.system() == "Linux": diff --git a/src/turtle_kv/core/algo/compute_running_total.hpp b/src/turtle_kv/core/algo/compute_running_total.hpp index 33a0568..9ff8f73 100644 --- a/src/turtle_kv/core/algo/compute_running_total.hpp +++ b/src/turtle_kv/core/algo/compute_running_total.hpp @@ -14,10 +14,10 @@ namespace turtle_kv { //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -template +template inline batt::RunningTotal compute_running_total( batt::WorkerPool& worker_pool, - const MergeCompactor::ResultSet& result_set, + const MergeCompactor::ResultSet& result_set, DecayToItem decay_to_item [[maybe_unused]] = {}) { auto merged_edits = result_set.get(); diff --git a/src/turtle_kv/core/testing/generate.hpp b/src/turtle_kv/core/testing/generate.hpp index bafa95a..06b4c37 100644 --- a/src/turtle_kv/core/testing/generate.hpp +++ b/src/turtle_kv/core/testing/generate.hpp @@ -184,8 +184,11 @@ class RandomResultSetGenerator : public MinMaxSize } template - MergeCompactor::ResultSet - operator()(DecayToItem, Rng& rng, llfs::StableStringStore& store) + MergeCompactor::ResultSet operator()( + DecayToItem, + Rng& rng, + llfs::StableStringStore& store, + Optional> to_delete = None) { using ResultSet = MergeCompactor::ResultSet; using Item = typename ResultSet::value_type; @@ -199,6 +202,13 @@ class RandomResultSetGenerator : public MinMaxSize items.emplace_back(this->key_generator_(rng, store), ValueView::from_str(store.store(std::string(this->value_size_, ch)))); } + + if (to_delete) { + for (const KeyView& delete_key : *to_delete) { + items.emplace_back(delete_key, ValueView::deleted()); + } + } + std::sort(items.begin(), items.end(), KeyOrder{}); items.erase(std::unique(items.begin(), items.end(), diff --git a/src/turtle_kv/kv_store.cpp b/src/turtle_kv/kv_store.cpp index 727ac30..93b1e5f 100644 --- a/src/turtle_kv/kv_store.cpp +++ b/src/turtle_kv/kv_store.cpp @@ -766,9 +766,7 @@ StatusOr KVStore::scan_keys(const KeyView& min_key, // Status KVStore::remove(const KeyView& key) noexcept /*override*/ { - (void)key; - - return batt::StatusCode::kUnimplemented; + return this->put(key, ValueView::deleted()); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/kv_store_scanner.cpp b/src/turtle_kv/kv_store_scanner.cpp index 61f6d02..36b2d14 100644 --- a/src/turtle_kv/kv_store_scanner.cpp +++ b/src/turtle_kv/kv_store_scanner.cpp @@ -443,7 +443,14 @@ Status KVStoreScanner::set_next_item() } } else { - break; + // TODO [vsilai 11-10-2025]: need to fix key only scans to look at values. + // + if (!this->keys_only_ && this->next_item_->value == ValueView::deleted()) { + this->next_item_ = None; + continue; + } else { + break; + } } if (scan_level->advance()) { diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index c9864e1..822233e 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -305,6 +305,18 @@ struct SegmentedLevelAlgorithms { return OkStatus(); } + /** \brief Merges the two given pivots, effectively erasing `right_pivot`. + */ + void merge_pivots(i32 left_pivot, i32 right_pivot) + { + const usize segment_count = this->level_.segment_count(); + + for (usize segment_i = 0; segment_i < segment_count; ++segment_i) { + SegmentT& segment = this->level_.get_segment(segment_i); + in_segment(segment).merge_pivots(left_pivot, right_pivot, this->level_); + } + } + /** \brief Invokes `fn` for each SegmentT& selected by `pivot_selector`. * * `pivot_selector` can be: diff --git a/src/turtle_kv/tree/algo/segments.hpp b/src/turtle_kv/tree/algo/segments.hpp index 54f8016..0798dfb 100644 --- a/src/turtle_kv/tree/algo/segments.hpp +++ b/src/turtle_kv/tree/algo/segments.hpp @@ -116,6 +116,22 @@ struct SegmentAlgorithms { return true; } + /** \brief Merges the two given pivots, effectively erasing `right_pivot`. + */ + template + [[nodiscard]] void merge_pivots(i32 left_pivot, i32 right_pivot, const LevelT& level) + { + BATT_CHECK(!this->segment_.is_pivot_active(left_pivot)); + + u32 new_flushed_upper_bound = this->segment_.get_flushed_item_upper_bound(level, right_pivot); + bool new_is_active = this->segment_.is_pivot_active(right_pivot); + + this->segment_.set_pivot_active(left_pivot, new_is_active); + this->segment_.set_flushed_item_upper_bound(left_pivot, new_flushed_upper_bound); + + this->segment_.remove_pivot(right_pivot); + } + /** \brief Invokes the speficied `fn` for each active pivot in the specified range, passing a * reference to the segment and the pivot index (i32). */ diff --git a/src/turtle_kv/tree/batch_update.cpp b/src/turtle_kv/tree/batch_update.cpp index 94babdd..8e5675d 100644 --- a/src/turtle_kv/tree/batch_update.cpp +++ b/src/turtle_kv/tree/batch_update.cpp @@ -9,7 +9,129 @@ using TrimResult = BatchUpdate::TrimResult; // void BatchUpdate::update_edit_size_totals() { - this->edit_size_totals.emplace(this->context.compute_running_total(this->result_set)); + this->edit_size_totals.emplace( + this->context.compute_running_total(this->result_set)); +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +void BatchUpdate::update_edit_size_totals_decayed( + const MergeCompactor::ResultSet& decayed_result_set) +{ + this->edit_size_totals.emplace( + this->context.compute_running_total(decayed_result_set)); +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +void BatchUpdate::decay_batch_to_items( + MergeCompactor::ResultSet& output_result_set) +{ + const batt::TaskCount max_tasks{this->context.worker_pool.size() + 1}; + std::vector decayed_items; + + if (max_tasks == 1) { + for (const EditView& edit : this->result_set.get()) { + Optional maybe_item = to_item_view(edit); + if (maybe_item) { + decayed_items.emplace_back(EditView::from_item_view(*maybe_item)); + } + } + } else { + const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); + + auto actual_edits = result_set.get(); + const auto src_begin = actual_edits.begin(); + const auto src_end = actual_edits.end(); + + const batt::WorkSlicePlan plan{batt::WorkSliceParams{ + algo_defaults.copy_decayed_items.min_task_size, + max_tasks, + }, + src_begin, + src_end}; + + BATT_CHECK_GT(plan.n_tasks, 0); + + batt::SmallVec output_size_per_shard(plan.n_tasks); + BATT_CHECK_EQ(output_size_per_shard.size(), plan.n_tasks); + + // First count the number of non-decayed items in the output for each shard. + { + batt::ScopedWorkContext work_context{this->context.worker_pool}; + + BATT_CHECK_OK(batt::slice_work( + work_context, + plan, + /*gen_work_fn=*/ + [&](usize task_index, isize task_offset, isize task_size) { + return [src_begin, task_index, task_offset, task_size, &output_size_per_shard] { + BATT_CHECK_LT(task_index, output_size_per_shard.size()); + + auto task_src_begin = std::next(src_begin, task_offset); + const auto task_src_end = std::next(task_src_begin, task_size); + + usize output_size = 0; + + for (; task_src_begin != task_src_end; ++task_src_begin) { + if (decays_to_item(*task_src_begin)) { + output_size += 1; + } + } + output_size_per_shard[task_index] = output_size; + }; + })) + << "worker_pool must not be closed!"; + } + + // Change to a rolling sum and do the actual copy. + // + usize output_total_size = 0; + batt::SmallVec output_shard_offset; + for (usize output_shard_size : output_size_per_shard) { + output_shard_offset.emplace_back(output_total_size); + output_total_size += output_shard_size; + } + + decayed_items.resize(output_total_size); + { + this->context.worker_pool.reset(); + + batt::ScopedWorkContext work_context{this->context.worker_pool}; + + BATT_CHECK_OK( + batt::slice_work(work_context, + plan, + /*gen_work_fn=*/ + [&](usize task_index, isize task_offset, isize task_size) { + return [src_begin, + &output_shard_offset, + &output_size_per_shard, + task_index, + task_offset, + task_size, + &decayed_items] { + auto task_src_begin = std::next(src_begin, task_offset); + const auto task_src_end = std::next(task_src_begin, task_size); + + BATT_CHECK_LT(task_index, output_shard_offset.size()); + auto task_dst_begin = + std::next(decayed_items.data(), output_shard_offset[task_index]); + + for (; task_src_begin != task_src_end; ++task_src_begin) { + Optional maybe_item = to_item_view(*task_src_begin); + if (maybe_item) { + *task_dst_begin = EditView::from_item_view(*maybe_item); + ++task_dst_begin; + } + } + }; + })) + << "worker_pool must not be closed!"; + } + } + + output_result_set.append(std::move(decayed_items)); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/batch_update.hpp b/src/turtle_kv/tree/batch_update.hpp index 161c370..7eeab1d 100644 --- a/src/turtle_kv/tree/batch_update.hpp +++ b/src/turtle_kv/tree/batch_update.hpp @@ -29,17 +29,18 @@ struct BatchUpdateContext { /** \brief Uses the worker_pool to perform a parallel merge-compaction of the lines * produced by the passed `generator_fn`, up to and including (but stopping at) `max_key`. */ - template - StatusOr> merge_compact_edits( + template + StatusOr> merge_compact_edits( const KeyView& max_key, GeneratorFn&& generator_fn); /** \brief Computes and returns the running total (prefix sum) of the edit sizes in result_set. */ + template batt::RunningTotal compute_running_total( - const MergeCompactor::ResultSet& result_set) const + const MergeCompactor::ResultSet& result_set) const { - return ::turtle_kv::compute_running_total(this->worker_pool, result_set); + return ::turtle_kv::compute_running_total(this->worker_pool, result_set); } }; @@ -63,6 +64,16 @@ struct BatchUpdate { */ void update_edit_size_totals(); + /** \brief Resets `this->edit_size_totals` to reflect the decayed version of `this->result_set`. + */ + void update_edit_size_totals_decayed( + const MergeCompactor::ResultSet& decayed_result_set); + + /** \brief Fills the output buffer `ResultSet` passed into the function with only the + * edits from this batch that decay to base-level items (e.g., no tombstones). + */ + void decay_batch_to_items(MergeCompactor::ResultSet& output_result_set); + /** \brief Returns the inclusive (closed) interval of keys in this batch. */ CInterval get_key_crange() const @@ -90,9 +101,10 @@ std::ostream& operator<<(std::ostream& out, const BatchUpdate::TrimResult& t); //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -template -inline StatusOr> -BatchUpdateContext::merge_compact_edits(const KeyView& max_key, GeneratorFn&& generator_fn) +template +inline StatusOr> BatchUpdateContext::merge_compact_edits( + const KeyView& max_key, + GeneratorFn&& generator_fn) { MergeCompactor compactor{this->worker_pool}; @@ -100,7 +112,7 @@ BatchUpdateContext::merge_compact_edits(const KeyView& max_key, GeneratorFn&& ge BATT_REQUIRE_OK(BATT_FORWARD(generator_fn)(compactor)); compactor.finish_push_levels(); - MergeCompactor::EditBuffer edit_buffer; + MergeCompactor::OutputBuffer edit_buffer; this->worker_pool.reset(); return compactor.read(edit_buffer, max_key); diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index 9c02cf4..1708bf1 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -149,6 +149,31 @@ auto InMemoryLeaf::make_split_plan() const -> StatusOr return plan; } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +StatusOr> InMemoryLeaf::try_merge(BatchUpdateContext& context, + InMemoryLeaf& sibling) +{ + auto merged_leaf = + std::make_unique(batt::make_copy(this->pinned_leaf_page_), this->tree_options); + + // Concatenate the two leaves' result sets in the correct order. + // + bool right_sibling = this->get_max_key() < sibling.get_min_key(); + if (right_sibling) { + merged_leaf->result_set = + MergeCompactor::ResultSet::concat(std::move(this->result_set), + std::move(sibling.result_set)); + } else { + merged_leaf->result_set = MergeCompactor::ResultSet::concat(std::move(sibling.result_set), + std::move(this->result_set)); + } + + merged_leaf->set_edit_size_totals(context.compute_running_total(merged_leaf->result_set)); + + return {std::move(merged_leaf)}; +} + //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/in_memory_leaf.hpp b/src/turtle_kv/tree/in_memory_leaf.hpp index 9d71d93..879fee4 100644 --- a/src/turtle_kv/tree/in_memory_leaf.hpp +++ b/src/turtle_kv/tree/in_memory_leaf.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -32,7 +33,7 @@ struct InMemoryLeaf { llfs::PinnedPage pinned_leaf_page_; TreeOptions tree_options; - MergeCompactor::ResultSet result_set; + MergeCompactor::ResultSet result_set; std::shared_ptr shared_edit_size_totals_; Optional edit_size_totals; mutable std::atomic future_id_{~u64{0}}; @@ -91,6 +92,9 @@ struct InMemoryLeaf { StatusOr make_split_plan() const; + StatusOr> try_merge(BatchUpdateContext& context, + InMemoryLeaf& sibling); + Status start_serialize(TreeSerializeContext& context); StatusOr finish_serialize(TreeSerializeContext& context); diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 72fc976..032c566 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -288,7 +288,7 @@ Status InMemoryNode::update_buffer_insert(BatchUpdate& update) BATT_ASSIGN_OK_RESULT( // new_merged_level.result_set, - update.context.merge_compact_edits( // + update.context.merge_compact_edits( // global_max_key(), [&](MergeCompactor& compactor) -> Status { compactor.push_level(update.result_set.live_edit_slices()); @@ -402,7 +402,7 @@ Status InMemoryNode::compact_update_buffer_levels(BatchUpdateContext& update_con Status segment_load_status; BATT_ASSIGN_OK_RESULT(new_merged_level.result_set, - update_context.merge_compact_edits( + update_context.merge_compact_edits( global_max_key(), [&](MergeCompactor& compactor) -> Status { this->push_levels_to_merge(compactor, @@ -447,10 +447,10 @@ StatusOr InMemoryNode::collect_pivot_batch(BatchUpdateContext& upda // Merge/compact all pending edits for the specified pivot. // - BATT_ASSIGN_OK_RESULT( // - pivot_batch.result_set, // - update_context.merge_compact_edits( // - /*max_key=*/pivot_key_range.upper_bound, // + BATT_ASSIGN_OK_RESULT( // + pivot_batch.result_set, // + update_context.merge_compact_edits( // + /*max_key=*/pivot_key_range.upper_bound, // [&](MergeCompactor& compactor) -> Status { this->push_levels_to_merge(compactor, update_context.page_loader, @@ -589,8 +589,7 @@ Status InMemoryNode::make_child_viable(BatchUpdateContext& update_context, i32 p //----- --- -- - - - - [&](const NeedsMerge&) -> Status { - BATT_PANIC() << "TODO [tastolfi 2025-03-16] implement me!"; - return batt::StatusCode::kUnimplemented; + return this->merge_child(update_context, pivot_i); }); return status; @@ -691,6 +690,552 @@ Status InMemoryNode::split_child(BatchUpdateContext& update_context, i32 pivot_i return OkStatus(); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i) +{ + Subtree& child = this->children[pivot_i]; + + // Decide which sibling to merge with. Edge cases: child that needs merge is the leftmost or + // rightmost child in the node. + // + i32 sibling_i = pivot_i; + i32 right_sibling = pivot_i + 1; + i32 left_sibling = pivot_i - 1; + + bool need_compaction = false; + u64 active_segmented_levels = this->update_buffer.compute_active_segmented_levels(); + if (pivot_i == 0) { + sibling_i = right_sibling; + if (get_bit(active_segmented_levels, pivot_i)) { + need_compaction = true; + } + } else if ((usize)pivot_i == this->children.size() - 1) { + sibling_i = left_sibling; + if (get_bit(active_segmented_levels, left_sibling)) { + need_compaction = true; + } + } else { + if (!get_bit(active_segmented_levels, pivot_i)) { + sibling_i = right_sibling; + } else { + if (!get_bit(active_segmented_levels, left_sibling)) { + sibling_i = left_sibling; + } else { + sibling_i = right_sibling; + need_compaction = true; + } + } + } + + BATT_CHECK_NE(pivot_i, sibling_i); + BATT_REQUIRE_OK(this->children[sibling_i].to_in_memory_subtree(update_context.page_loader, + this->tree_options, + this->height - 1)); + + // Call child.try_merge(). + // + Subtree& sibling = this->children[sibling_i]; + StatusOr> status_or_merged = child.try_merge(update_context, sibling); + if (!status_or_merged.ok()) { + LOG(ERROR) << BATT_INSPECT(child.get_viability()); + } + BATT_REQUIRE_OK(status_or_merged); + + if (!*status_or_merged) { + if (!batt::is_case(child.get_viability())) { + BATT_ASSIGN_OK_RESULT(KeyView new_pivot_key, child.try_borrow(update_context, sibling)); + + this->pivot_keys_[std::max(pivot_i, sibling_i)] = new_pivot_key; + + BATT_REQUIRE_OK(this->compact_update_buffer_levels(update_context)); + } + BATT_CHECK(batt::is_case(child.get_viability())); + return OkStatus(); + } + Subtree& merged_subtree = **status_or_merged; + + // Erase rightmost of {child subtree, sibling} in this->child_pages, overwrite leftmost + // with new PinnedPage{}. + // + i32 pivot_to_erase = std::max(pivot_i, sibling_i); + i32 pivot_to_overwrite = std::min(pivot_i, sibling_i); + + this->child_pages[pivot_to_overwrite] = llfs::PinnedPage{}; + this->child_pages.erase(this->child_pages.begin() + pivot_to_erase); + + // Update the update_buffer levels. + // + if (need_compaction) { + BATT_REQUIRE_OK(this->compact_update_buffer_levels(update_context)); + } else { + for (Level& level : this->update_buffer.levels) { + if (batt::is_case(level)) { + SegmentedLevel& segmented_level = std::get(level); + in_segmented_level(*this, segmented_level, update_context.page_loader) + .merge_pivots(pivot_to_overwrite, pivot_to_erase); + } + } + } + + // Update this->children, following same update method as with this->child_pages. + // + this->children[pivot_to_overwrite] = std::move(merged_subtree); + this->children.erase(this->children.begin() + pivot_to_erase); + + // Update pending_bytes. The leftmost of {subtree, sibling} should be incremented by the removed + // subtree's pending bytes values. Erase the pending bytes of the removed subtree. + // + this->pending_bytes[pivot_to_overwrite] += this->pending_bytes[pivot_to_erase]; + this->pending_bytes.erase(this->pending_bytes.begin() + pivot_to_erase); + + bool is_pending_bytes_exact = get_bit(this->pending_bytes_is_exact, pivot_to_overwrite) & + get_bit(this->pending_bytes_is_exact, pivot_to_erase); + this->pending_bytes_is_exact = + set_bit(this->pending_bytes_is_exact, pivot_to_overwrite, is_pending_bytes_exact); + this->pending_bytes_is_exact = remove_bit(this->pending_bytes_is_exact, pivot_to_erase); + + // Remove the pivot key of the removed child subtree from this->pivot_keys_. + // + this->pivot_keys_.erase(this->pivot_keys_.begin() + pivot_to_erase); + + if ((usize)pivot_to_erase == this->children.size()) { + BATT_ASSIGN_OK_RESULT( + this->max_key_, + this->children.back().get_max_key(update_context.page_loader, this->child_pages.back())); + } + + // Finally, split the newly merged child if needed. + // + SubtreeViability merged_viability = merged_subtree.get_viability(); + if (batt::is_case(merged_viability)) { + BATT_REQUIRE_OK(this->split_child(update_context, pivot_to_overwrite)); + } else { + BATT_CHECK(batt::is_case(merged_viability)); + } + + return OkStatus(); +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +StatusOr> InMemoryNode::flush_and_shrink(BatchUpdateContext& context) +{ + // If more than one pivot exists, nothings needs to be done. + // + usize pivot_count = this->pivot_count(); + if (pivot_count > 1) { + return None; + } + + i32 single_pivot_i = 0; + BATT_CHECK_EQ(this->pending_bytes.size(), 1); + usize pending_bytes_count = this->pending_bytes[single_pivot_i]; + + // Flush until we have nothing left in the update buffer or until we gain more pivots. + // + while (pivot_count == 1 && pending_bytes_count > 0) { + BATT_REQUIRE_OK(this->flush_to_pivot(context, single_pivot_i)); + pivot_count = this->pivot_count(); + pending_bytes_count = this->pending_bytes[single_pivot_i]; + } + + // If still only one pivot remains, return the child. + // + if (pivot_count == 1) { + return std::move(this->children[single_pivot_i]); + } else { + return None; + } +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +StatusOr> InMemoryNode::try_merge(BatchUpdateContext& context, + InMemoryNode& sibling) +{ + // If merging both full nodes will cause the merged node's pivot count to exceed the max + // possible pivot count, return null so that we can try a borrow. + // + if (this->pivot_count() + sibling.pivot_count() > this->max_pivot_count()) { + return nullptr; + } + + auto new_node = std::make_unique(batt::make_copy(this->pinned_node_page_), + this->tree_options, + this->is_size_tiered()); + + const auto concat_metadata = [&](InMemoryNode& left, InMemoryNode& right) { + new_node->max_key_ = right.max_key_; + + new_node->latest_flush_pivot_i_ = None; + + new_node->pending_bytes.insert(new_node->pending_bytes.end(), + left.pending_bytes.begin(), + left.pending_bytes.end()); + new_node->pending_bytes.insert(new_node->pending_bytes.end(), + right.pending_bytes.begin(), + right.pending_bytes.end()); + + new_node->pending_bytes_is_exact = left.pending_bytes_is_exact & right.pending_bytes_is_exact; + + new_node->child_pages.insert(new_node->child_pages.end(), + std::make_move_iterator(left.child_pages.begin()), + std::make_move_iterator(left.child_pages.end())); + new_node->child_pages.insert(new_node->child_pages.end(), + std::make_move_iterator(right.child_pages.begin()), + std::make_move_iterator(right.child_pages.end())); + + new_node->children.insert(new_node->children.end(), + std::make_move_iterator(left.children.begin()), + std::make_move_iterator(left.children.end())); + new_node->children.insert(new_node->children.end(), + std::make_move_iterator(right.children.begin()), + std::make_move_iterator(right.children.end())); + + new_node->pivot_keys_.insert(new_node->pivot_keys_.end(), + left.pivot_keys_.begin(), + left.pivot_keys_.end() - 1); + new_node->pivot_keys_.insert(new_node->pivot_keys_.end(), + right.pivot_keys_.begin(), + right.pivot_keys_.end()); + }; + + const auto merge_update_buffers = [&](InMemoryNode& left, InMemoryNode& right) -> Status { + usize i = 0; + for (; i < left.update_buffer.levels.size(); ++i) { + Level& left_level = left.update_buffer.levels[i]; + BATT_REQUIRE_OK(batt::case_of( // + left_level, // + [&](EmptyLevel&) -> Status { + if (i < right.update_buffer.levels.size()) { + Level& right_level = right.update_buffer.levels[i]; + if (!batt::is_case(right_level)) { + new_node->update_buffer.levels.emplace_back(std::move(right_level)); + } + } + + return OkStatus(); + }, + [&](MergedLevel& left_merged_level) -> Status { + if (i < right.update_buffer.levels.size()) { + BATT_REQUIRE_OK(batt::case_of( + right.update_buffer.levels[i], + [&](EmptyLevel&) -> Status { + new_node->update_buffer.levels.emplace_back(std::move(left_merged_level)); + return OkStatus(); + }, + [&](MergedLevel& right_merged_level) -> Status { + new_node->update_buffer.levels.emplace_back( + left_merged_level.concat(right_merged_level)); + return OkStatus(); + }, + [&](SegmentedLevel& right_segmented_level) -> Status { + // When merging a MergedLevel and a SegmentedLevel, create a new MergedLevel. + // + MergedLevel new_merged_level; + HasPageRefs has_page_refs{false}; + Status segment_load_status; + + Slice levels_to_merge = + as_slice(right.update_buffer.levels.data() + i, usize{1}); + + BATT_ASSIGN_OK_RESULT( // + new_merged_level.result_set, + context.merge_compact_edits( // + global_max_key(), + [&](MergeCompactor& compactor) -> Status { + compactor.push_level(left_merged_level.result_set.live_edit_slices()); + right.push_levels_to_merge(compactor, + context.page_loader, + segment_load_status, + has_page_refs, + levels_to_merge, + /*min_pivot_i=*/0, + /*only_pivot=*/false); + return OkStatus(); + })); + + BATT_REQUIRE_OK(segment_load_status); + + new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); + + return OkStatus(); + })); + } else { + new_node->update_buffer.levels.emplace_back(std::move(left_merged_level)); + } + + return OkStatus(); + }, + [&](SegmentedLevel& left_segmented_level) -> Status { + if (i < right.update_buffer.levels.size()) { + BATT_REQUIRE_OK(batt::case_of( + right.update_buffer.levels[i], + [&](EmptyLevel&) -> Status { + new_node->update_buffer.levels.emplace_back(std::move(left_segmented_level)); + return OkStatus(); + }, + [&](MergedLevel& right_merged_level) -> Status { + MergedLevel new_merged_level; + HasPageRefs has_page_refs{false}; + Status segment_load_status; + + Slice levels_to_merge = + as_slice(left.update_buffer.levels.data() + i, usize{1}); + + BATT_ASSIGN_OK_RESULT( // + new_merged_level.result_set, + context.merge_compact_edits( // + global_max_key(), + [&](MergeCompactor& compactor) -> Status { + left.push_levels_to_merge(compactor, + context.page_loader, + segment_load_status, + has_page_refs, + levels_to_merge, + /*min_pivot_i=*/0, + /*only_pivot=*/false); + compactor.push_level( + right_merged_level.result_set.live_edit_slices()); + return OkStatus(); + })); + + BATT_REQUIRE_OK(segment_load_status); + + new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); + + return OkStatus(); + }, + [&](SegmentedLevel& right_segmented_level) -> Status { + // First shift the right level's bitsets to the left by the number of pivots + // in the left node. + // + usize left_node_pivot_count = left.pivot_count(); + for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); + ++segment_i) { + Segment& segment = right_segmented_level.get_segment(segment_i); + segment.flushed_pivots <<= left_node_pivot_count; + segment.active_pivots <<= left_node_pivot_count; + } + + new_node->update_buffer.levels.emplace_back(std::move(left_segmented_level)); + SegmentedLevel& new_segmented_level = + std::get(new_node->update_buffer.levels.back()); + new_segmented_level.segments.insert( + new_segmented_level.segments.end(), + std::make_move_iterator(right_segmented_level.segments.begin()), + std::make_move_iterator(right_segmented_level.segments.end())); + + return OkStatus(); + })); + } else { + new_node->update_buffer.levels.emplace_back(std::move(left_segmented_level)); + } + + return OkStatus(); + })); + } + + // Carry over any remaining levels from the right node's update buffer. + // + for (; i < right.update_buffer.levels.size(); ++i) { + Level& right_level = right.update_buffer.levels[i]; + if (!batt::is_case(right_level)) { + new_node->update_buffer.levels.emplace_back(std::move(right_level)); + } + } + + return OkStatus(); + }; + + if (this->get_max_key() < sibling.get_min_key()) { + concat_metadata(*this, sibling); + BATT_REQUIRE_OK(merge_update_buffers(*this, sibling)); + } else { + concat_metadata(sibling, *this); + BATT_REQUIRE_OK(merge_update_buffers(sibling, *this)); + } + + return {std::move(new_node)}; +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) +{ + BATT_CHECK(batt::is_case(sibling.get_viability())); + + bool right_sibling = this->get_max_key() < sibling.get_min_key(); + + BATT_CHECK_LT(this->pivot_count(), 4); + u32 num_pivots_to_borrow = 4 - this->pivot_count(); + + i32 borrowed_min_pivot_i = -1; + KeyView borrowed_max_pivot_key; + if (right_sibling) { + borrowed_min_pivot_i = 0; + borrowed_max_pivot_key = sibling.get_pivot_key(num_pivots_to_borrow); + } else { + borrowed_min_pivot_i = sibling.pivot_count() - num_pivots_to_borrow; + borrowed_max_pivot_key = sibling.get_pivot_key(sibling.pivot_count()); + } + Interval borrowed_pivot_range{sibling.get_pivot_key(borrowed_min_pivot_i), + borrowed_max_pivot_key}; + + BatchUpdate borrowed_pivot_batch{ + .context = context, + .result_set = {}, + .edit_size_totals = None, + }; + + Status segment_load_status; + HasPageRefs has_page_refs{false}; + + BATT_ASSIGN_OK_RESULT( // + borrowed_pivot_batch.result_set, // + context.merge_compact_edits( // + /*max_key=*/borrowed_max_pivot_key, // + [&](MergeCompactor& compactor) -> Status { + sibling.push_levels_to_merge(compactor, + context.page_loader, + segment_load_status, + has_page_refs, + as_slice(sibling.update_buffer.levels), + /*min_pivot_i=*/borrowed_min_pivot_i, + /*only_pivot=*/false); + return OkStatus(); + })); + + BATT_REQUIRE_OK(segment_load_status); + + borrowed_pivot_batch.result_set.drop_key_range_half_open(Interval{ + borrowed_max_pivot_key, + sibling.key_upper_bound(), + }); + + borrowed_pivot_batch.edit_size_totals = None; + + if (right_sibling) { + this->pending_bytes.insert(this->pending_bytes.end(), + sibling.pending_bytes.begin(), + sibling.pending_bytes.begin() + num_pivots_to_borrow); + sibling.pending_bytes.erase(sibling.pending_bytes.begin(), + sibling.pending_bytes.begin() + num_pivots_to_borrow); + + u64 borrowed_pending_bytes_exact = + sibling.pending_bytes_is_exact & ((u64{1} << num_pivots_to_borrow) - 1); + u64 mask = ((u64{1} << num_pivots_to_borrow) - 1) << (this->pivot_count() - 1); + this->pending_bytes_is_exact = (this->pending_bytes_is_exact & ~mask) | + (borrowed_pending_bytes_exact << (this->pivot_count() - 1)); + + this->pivot_keys_.pop_back(); + this->pivot_keys_.insert(this->pivot_keys_.end(), + sibling.pivot_keys_.begin(), + sibling.pivot_keys_.begin() + num_pivots_to_borrow + 1); + sibling.pivot_keys_.erase(sibling.pivot_keys_.begin(), + sibling.pivot_keys_.begin() + num_pivots_to_borrow); + + this->child_pages.insert( + this->child_pages.end(), + std::make_move_iterator(sibling.child_pages.begin()), + std::make_move_iterator(sibling.child_pages.begin() + num_pivots_to_borrow)); + sibling.child_pages.erase(sibling.child_pages.begin(), + sibling.child_pages.begin() + num_pivots_to_borrow); + + this->children.insert(this->children.end(), + std::make_move_iterator(sibling.children.begin()), + std::make_move_iterator(sibling.children.begin() + num_pivots_to_borrow)); + sibling.children.erase(sibling.children.begin(), + sibling.children.begin() + num_pivots_to_borrow); + + BATT_ASSIGN_OK_RESULT( + this->max_key_, + this->children.back().get_max_key(context.page_loader, this->child_pages.back())); + } else { + this->pending_bytes.insert(this->pending_bytes.begin(), + sibling.pending_bytes.end() - num_pivots_to_borrow, + sibling.pending_bytes.end()); + sibling.pending_bytes.erase(sibling.pending_bytes.end() - num_pivots_to_borrow, + sibling.pending_bytes.end()); + + u64 borrowed_pending_bytes_exact = + sibling.pending_bytes_is_exact >> (64 - num_pivots_to_borrow); + this->pending_bytes_is_exact <<= num_pivots_to_borrow; + this->pending_bytes_is_exact |= borrowed_pending_bytes_exact; + + sibling.pivot_keys_.pop_back(); + this->pivot_keys_.insert(this->pivot_keys_.begin(), + sibling.pivot_keys_.end() - num_pivots_to_borrow, + sibling.pivot_keys_.end()); + sibling.pivot_keys_.erase(sibling.pivot_keys_.end() - num_pivots_to_borrow + 1, + sibling.pivot_keys_.end()); + + this->child_pages.insert( + this->child_pages.begin(), + std::make_move_iterator(sibling.child_pages.end() - num_pivots_to_borrow), + std::make_move_iterator(sibling.child_pages.end())); + sibling.child_pages.erase(sibling.child_pages.end() - num_pivots_to_borrow, + sibling.child_pages.end()); + + this->children.insert(this->children.begin(), + std::make_move_iterator(sibling.children.end() - num_pivots_to_borrow), + std::make_move_iterator(sibling.children.end())); + sibling.children.erase(sibling.children.end() - num_pivots_to_borrow, sibling.children.end()); + + BATT_ASSIGN_OK_RESULT( + sibling.max_key_, + sibling.children.back().get_max_key(context.page_loader, sibling.child_pages.back())); + } + + BATT_REQUIRE_OK(this->update_buffer_insert(borrowed_pivot_batch)); + + for (Level& level : sibling.update_buffer.levels) { + batt::case_of( // + level, // + [](EmptyLevel&) { + // nothing to do + }, + [&](MergedLevel& merged_level) { + merged_level.result_set.drop_key_range_half_open(borrowed_pivot_range); + }, + [&](SegmentedLevel& segmented_level) { + for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { + Segment& segment = segmented_level.get_segment(segment_i); + if (right_sibling) { + segment.flushed_pivots >>= num_pivots_to_borrow; + segment.active_pivots >>= num_pivots_to_borrow; + segment.flushed_item_upper_bound_.erase( + segment.flushed_item_upper_bound_.begin(), + segment.flushed_item_upper_bound_.begin() + num_pivots_to_borrow); + } else { + u64 mask = (u64{1} << (64 - num_pivots_to_borrow)) - 1; + segment.flushed_pivots &= mask; + segment.active_pivots &= mask; + segment.flushed_item_upper_bound_.erase( + segment.flushed_item_upper_bound_.end() - num_pivots_to_borrow, + segment.flushed_item_upper_bound_.end()); + } + } + }); + } + + KeyView left_child_max; + KeyView right_child_min; + if (right_sibling) { + left_child_max = this->get_max_key(); + right_child_min = sibling.get_min_key(); + } else { + left_child_max = sibling.get_max_key(); + right_child_min = this->get_min_key(); + } + + const KeyView prefix = llfs::find_common_prefix(0, left_child_max, right_child_min); + const KeyView new_sibling_pivot_key = right_child_min.substr(0, prefix.size() + 1); + + return new_sibling_pivot_key; +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // Status InMemoryNode::set_pivot_items_flushed(llfs::PageLoader& page_loader, @@ -1672,6 +2217,27 @@ void InMemoryNode::UpdateBuffer::Segment::insert_pivot(i32 pivot_i, bool is_acti this->flushed_pivots = insert_bit(this->flushed_pivots, pivot_i, false); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +void InMemoryNode::UpdateBuffer::Segment::remove_pivot(i32 pivot_i) +{ + this->check_invariants(__FILE__, __LINE__); + auto on_scope_exit = batt::finally([&] { + this->check_invariants(__FILE__, __LINE__); + }); + + if (get_bit(this->flushed_pivots, pivot_i)) { + const i32 index = bit_rank(this->flushed_pivots, pivot_i); + BATT_ASSERT_GE(index, 0); + BATT_ASSERT_LT(index, this->flushed_item_upper_bound_.size()); + + this->flushed_item_upper_bound_.erase(this->flushed_item_upper_bound_.begin() + index); + } + + this->active_pivots = remove_bit(this->active_pivots, pivot_i); + this->flushed_pivots = remove_bit(this->flushed_pivots, pivot_i); +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // void InMemoryNode::UpdateBuffer::Segment::pop_front_pivots(i32 count) @@ -1725,6 +2291,24 @@ SmallFn InMemoryNode::UpdateBuffer::dump() const }; } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +u64 InMemoryNode::UpdateBuffer::compute_active_segmented_levels() const +{ + u64 active_pivots = 0; + for (const Level& level : this->levels) { + if (batt::is_case(level)) { + const SegmentedLevel& segmented_level = std::get(level); + for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { + const Segment& segment = segmented_level.get_segment(segment_i); + active_pivots |= segment.get_active_pivots(); + } + } + } + + return active_pivots; +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // SmallFn InMemoryNode::UpdateBuffer::EmptyLevel::dump() const diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 33ed541..7934f60 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -144,6 +144,11 @@ struct InMemoryNode { */ void insert_pivot(i32 pivot_i, bool is_active); + /** \brief Removes a pivot bit in this->active_pivots and this->flushed_pivots at position + * `pivot_i`. + */ + void remove_pivot(i32 pivot_i); + /** \brief Removes the specified number (`count`) pivots from the front of this segment. This * is used while splitting a node's update buffer. */ @@ -318,6 +323,14 @@ struct InMemoryNode { return estimated; } + MergedLevel concat(MergedLevel& that) + { + return MergedLevel{ + .result_set = MergeCompactor::ResultSet::concat(std::move(this->result_set), + std::move(that.result_set)), + .segment_future_ids_ = {}}; + } + /** \brief Returns the number of segment leaf page build jobs added to the context. */ StatusOr start_serialize(const InMemoryNode& node, TreeSerializeContext& context); @@ -338,6 +351,8 @@ struct InMemoryNode { SmallFn dump() const; + u64 compute_active_segmented_levels() const; + usize count_non_empty_levels() const { usize count = 0; @@ -525,10 +540,31 @@ struct InMemoryNode { */ Status try_flush(BatchUpdateContext& context); + /** \brief Merge the node with one of its siblings and return the newly merged node. + */ + StatusOr> try_merge(BatchUpdateContext& context, + InMemoryNode& sibling); + + /** \brief Attempts to make the node (that needs a merge) viable by borrowing data + * from one of its siblings. If successful, returns the new pivot key to be set in the parent + * of these two nodes to separate them. + */ + StatusOr try_borrow(BatchUpdateContext& context, InMemoryNode& sibling); + /** \brief Splits the specified child, inserting a new pivot immediately after `pivot_i`. */ Status split_child(BatchUpdateContext& update_context, i32 pivot_i); + /** \brief Merges the specified child with a sibling. + */ + Status merge_child(BatchUpdateContext& update_context, i32 pivot_i); + + /** \brief If the node has a single pivot, attempts to flush updates out of the update buffer + * to grow the number of pivots. If all the updates are flushed and still only a single pivot + * remains, the single pivot (child) is returned. + */ + StatusOr> flush_and_shrink(BatchUpdateContext& context); + /** \brief Returns true iff there are no MergedLevels or unserialized Subtree children in this * node. */ diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 490d30d..dc0f3ea 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -131,7 +131,7 @@ void verify_table_point_queries(Table& expected_table, Table& actual_table, Rng& } } -void verify_range_scan(LatencyMetric* scan_latency, +/*void verify_range_scan(LatencyMetric* scan_latency, Table& expected_table, const Slice>& actual_read_items, const KeyView& min_key, @@ -164,7 +164,7 @@ void verify_range_scan(LatencyMetric* scan_latency, ++expected_item_iter; ++actual_item_iter; } -} +} */ struct SubtreeBatchUpdateScenario { static std::atomic& size_tiered_count() @@ -279,6 +279,7 @@ TEST(InMemoryNodeTest, Subtree) if (n_threads != 0) { runner.n_threads(n_threads); } + runner.n_threads(usize{1}); runner.n_seeds(n_seeds); if (n_seeds < 128) { @@ -323,7 +324,7 @@ void SubtreeBatchUpdateScenario::run() } TreeOptions tree_options = TreeOptions::with_default_values() // - .set_leaf_size(512 * kKiB) + .set_leaf_size(32 * kKiB) .set_node_size(4 * kKiB) .set_key_size_hint(24) .set_value_size_hint(100) @@ -365,7 +366,9 @@ void SubtreeBatchUpdateScenario::run() usize total_items = 0; - for (usize i = 0; i < max_i; ++i) { + std::vector pending_deletes; + + for (usize i = 0; i < max_i; ++i) { BatchUpdate update{ .context = BatchUpdateContext{ @@ -373,7 +376,7 @@ void SubtreeBatchUpdateScenario::run() .page_loader = *page_loader, .cancel_token = batt::CancelToken{}, }, - .result_set = result_set_generator(DecayToItem{}, rng, strings), + .result_set = result_set_generator(DecayToItem{}, rng, strings, pending_deletes), .edit_size_totals = None, }; update.update_edit_size_totals(); @@ -386,6 +389,19 @@ void SubtreeBatchUpdateScenario::run() Status table_update_status = update_table(expected_table, update.result_set); ASSERT_TRUE(table_update_status.ok()) << BATT_INSPECT(table_update_status); + if (my_id == 0) { + if (!pending_deletes.empty()) { + pending_deletes.clear(); + } + + if (i > 0) { + BATT_CHECK(pending_deletes.empty()); + for (const EditView& edit : update.result_set.get()) { + pending_deletes.emplace_back(edit.key); + } + } + } + StatusOr tree_height = tree.get_height(*page_loader); ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); @@ -439,7 +455,7 @@ void SubtreeBatchUpdateScenario::run() << BATT_INSPECT(this->seed) << BATT_INSPECT(i); { - auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); + /*auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); std::unique_ptr scanner_page_job = page_cache->new_job(); const usize scan_len = pick_scan_len(rng); @@ -475,7 +491,7 @@ void SubtreeBatchUpdateScenario::run() as_slice(scan_items_buffer.data(), n_read), min_key, scan_len)) - << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); + << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); */ } if (my_id == 0) { diff --git a/src/turtle_kv/tree/packed_leaf_page.hpp b/src/turtle_kv/tree/packed_leaf_page.hpp index 1a2ad3e..74e65d3 100644 --- a/src/turtle_kv/tree/packed_leaf_page.hpp +++ b/src/turtle_kv/tree/packed_leaf_page.hpp @@ -513,23 +513,22 @@ struct LeafItemsSummary { struct AddLeafItemsSummary { LeafItemsSummary operator()(const LeafItemsSummary& prior, const EditView& edit) const noexcept { - if (!decays_to_item(edit.value)) { - LOG(ERROR) << "TODO [tastolfi 2025-05-27] support deletes:" << BATT_INSPECT(edit); - - return LeafItemsSummary{ - .drop_count = prior.drop_count + 1, - .key_count = prior.key_count, - .key_data_size = prior.key_data_size, - .value_data_size = prior.value_data_size, - }; - } else { - return LeafItemsSummary{ - .drop_count = prior.drop_count, - .key_count = prior.key_count + 1, - .key_data_size = prior.key_data_size + (edit.key.size() + 4), - .value_data_size = prior.value_data_size + (1 + edit.value.size()), - }; + usize drop_count = prior.drop_count; + if (decays_to_item(edit)) { + drop_count++; } + return LeafItemsSummary{ + .drop_count = drop_count, + .key_count = prior.key_count + 1, + .key_data_size = prior.key_data_size + (edit.key.size() + 4), + .value_data_size = prior.value_data_size + (1 + edit.value.size()), + }; + } + + LeafItemsSummary operator()(const LeafItemsSummary& prior, + const ItemView& edit) const noexcept + { + return AddLeafItemsSummary{}(BATT_FORWARD(prior), EditView::from_item_view(edit)); } LeafItemsSummary operator()(const LeafItemsSummary& left, @@ -556,8 +555,6 @@ template LeafItemsSummary{}, AddLeafItemsSummary{}); - BATT_CHECK_EQ(summary.drop_count, 0); - PackedLeafLayoutPlanBuilder plan_builder; plan_builder.page_size = page_size; diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index a104b05..4c6b659 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -139,10 +139,10 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, auto new_leaf = std::make_unique(llfs::PinnedPage{}, tree_options); - new_leaf->result_set = update.result_set; + update.decay_batch_to_items(new_leaf->result_set); if (!update.edit_size_totals) { - update.update_edit_size_totals(); + update.update_edit_size_totals_decayed(new_leaf->result_set); } new_leaf->set_edit_size_totals(std::move(*update.edit_size_totals)); @@ -177,7 +177,7 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, BATT_ASSIGN_OK_RESULT( // new_leaf->result_set, - update.context.merge_compact_edits( // + update.context.merge_compact_edits( // global_max_key(), [&](MergeCompactor& compactor) -> Status { compactor.push_level(update.result_set.live_edit_slices()); @@ -216,7 +216,7 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, BATT_ASSIGN_OK_RESULT( in_memory_leaf->result_set, - update.context.merge_compact_edits( + update.context.merge_compact_edits( global_max_key(), [&](MergeCompactor& compactor) -> Status { compactor.push_level(update.result_set.live_edit_slices()); @@ -265,9 +265,14 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, return status; }, [&](const NeedsMerge& needs_merge) { - BATT_CHECK(!needs_merge.single_pivot) - << "TODO [tastolfi 2025-03-26] implement flush and shrink"; - return OkStatus(); + // Only perform a shrink if the root has a single pivot. + // + Status status = new_subtree->flush_and_shrink(update.context); + + if (!status.ok()) { + LOG(INFO) << "flush_and_shrink failed;" << BATT_INSPECT(needs_merge); + } + return status; })); } @@ -306,6 +311,38 @@ Status Subtree::split_and_grow(BatchUpdateContext& context, return OkStatus(); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Status Subtree::flush_and_shrink(BatchUpdateContext& context) +{ + BATT_CHECK(!this->locked_.load()); + + return batt::case_of( + this->impl_, + + [&](const llfs::PageIdSlot& page_id_slot [[maybe_unused]]) -> Status { + return {batt::StatusCode::kUnimplemented}; + }, + + [&](const std::unique_ptr& leaf [[maybe_unused]]) -> Status { + return OkStatus(); + }, + + [&](std::unique_ptr& node) -> Status { + StatusOr> status_or_new_root = node->flush_and_shrink(context); + + BATT_REQUIRE_OK(status_or_new_root); + + if (!*status_or_new_root) { + return OkStatus(); + } + + this->impl_ = std::move((*status_or_new_root)->impl_); + + return OkStatus(); + }); +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // StatusOr Subtree::get_height(llfs::PageLoader& page_loader) const @@ -526,6 +563,74 @@ StatusOr> Subtree::try_split(BatchUpdateContext& context) }); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +StatusOr> Subtree::try_merge(BatchUpdateContext& context, Subtree& sibling) +{ + BATT_CHECK(!this->locked_.load()); + + return batt::case_of( + this->impl_, + + [&](const llfs::PageIdSlot& page_id_slot) -> StatusOr> { + BATT_PANIC() << "Cannot try merging a serialized subtree!"; + + return {batt::StatusCode::kUnimplemented}; + }, + + [&](const std::unique_ptr& leaf) -> StatusOr> { + BATT_CHECK(batt::is_case>(sibling.impl_)); + auto& sibling_leaf_ptr = std::get>(sibling.impl_); + BATT_CHECK(sibling_leaf_ptr); + + BATT_ASSIGN_OK_RESULT(std::unique_ptr merged_leaf, // + leaf->try_merge(context, *sibling_leaf_ptr)); + + return {Subtree{std::move(merged_leaf)}}; + }, + + [&](const std::unique_ptr& node) -> StatusOr> { + BATT_CHECK(batt::is_case>(sibling.impl_)); + auto& sibling_node_ptr = std::get>(sibling.impl_); + BATT_CHECK(sibling_node_ptr); + + BATT_ASSIGN_OK_RESULT(std::unique_ptr merged_node, // + node->try_merge(context, *sibling_node_ptr)); + + if (merged_node == nullptr) { + return Optional{None}; + } + + return {Subtree{std::move(merged_node)}}; + }); +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +StatusOr Subtree::try_borrow(BatchUpdateContext& context, Subtree& sibling) +{ + BATT_CHECK(!this->locked_.load()); + + return batt::case_of( + this->impl_, + + [&](const llfs::PageIdSlot& page_id_slot [[maybe_unused]]) -> StatusOr { + return {batt::StatusCode::kUnimplemented}; + }, + + [&](const std::unique_ptr& leaf [[maybe_unused]]) -> StatusOr { + return {batt::StatusCode::kUnimplemented}; + }, + + [&](const std::unique_ptr& node) -> StatusOr { + BATT_CHECK(batt::is_case>(sibling.impl_)); + auto& sibling_node_ptr = std::get>(sibling.impl_); + BATT_CHECK(sibling_node_ptr); + + return node->try_borrow(context, *sibling_node_ptr); + }); +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // Status Subtree::try_flush(BatchUpdateContext& context) @@ -645,4 +750,48 @@ void Subtree::lock() this->locked_.store(true); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Status Subtree::to_in_memory_subtree(llfs::PageLoader& page_loader, + const TreeOptions& tree_options, + i32 height) +{ + BATT_CHECK_GT(height, 0); + + if (this->is_serialized()) { + llfs::PageIdSlot& page_id_slot = std::get(this->impl_); + + BATT_CHECK(page_id_slot.is_valid()); + + llfs::PageLayoutId expected_layout = Subtree::expected_layout_for_height(height); + + StatusOr status_or_pinned_page = page_id_slot.load_through( + page_loader, + llfs::PageLoadOptions{ + expected_layout, + llfs::PinPageToJob::kDefault, + llfs::OkIfNotFound{false}, + llfs::LruPriority{(height > 2) ? kNodeLruPriority : kLeafLruPriority}, + }); + + BATT_REQUIRE_OK(status_or_pinned_page) << BATT_INSPECT(height); + + llfs::PinnedPage& pinned_page = *status_or_pinned_page; + + if (height == 1) { + auto new_leaf = std::make_unique(batt::make_copy(pinned_page), tree_options); + this->impl_ = std::move(new_leaf); + } else { + const PackedNodePage& packed_node = PackedNodePage::view_of(pinned_page); + + BATT_ASSIGN_OK_RESULT( + std::unique_ptr node, + InMemoryNode::unpack(batt::make_copy(pinned_page), tree_options, packed_node)); + + this->impl_ = std::move(node); + } + } + + return OkStatus(); +} } // namespace turtle_kv diff --git a/src/turtle_kv/tree/subtree.hpp b/src/turtle_kv/tree/subtree.hpp index 1dc48b6..a3e296b 100644 --- a/src/turtle_kv/tree/subtree.hpp +++ b/src/turtle_kv/tree/subtree.hpp @@ -138,6 +138,19 @@ class Subtree */ StatusOr> try_split(BatchUpdateContext& context); + /** \brief Attempts to merge the given Subtree with one of its siblings. If successful, the + * newly merged Subtree is returned. + * + * If no merge, returns None. + */ + StatusOr> try_merge(BatchUpdateContext& context, Subtree& sibling); + + /** \brief Attempts to make the Subtree viable by borrowing data from one of its siblings. + * Called when the Subtree needs a merge, but borrowing is the only option to make the tree + * viable. + */ + StatusOr try_borrow(BatchUpdateContext& context, Subtree& sibling); + /** \brief Attempt to make the root viable by flushing a batch. */ Status try_flush(BatchUpdateContext& context); @@ -172,12 +185,24 @@ class Subtree */ bool is_locked() const; + /** \brief Converts a serialized Subtree to its in-memory equivalent. + */ + Status to_in_memory_subtree(llfs::PageLoader& page_loader, + const TreeOptions& tree_options, + i32 height); + //+++++++++++-+-+--+----- --- -- - - - - private: Status split_and_grow(BatchUpdateContext& context, const TreeOptions& tree_options, const KeyView& key_upper_bound); + /** \brief Called when the root of the tree is a node with a single pivot. This function + * flushes the root's update buffer until its is either empty + * (causing the tree to shrink in height) or until it gains more pivots. + */ + Status flush_and_shrink(BatchUpdateContext& context); + //+++++++++++-+-+--+----- --- -- - - - - std::variant, std::unique_ptr> diff --git a/src/turtle_kv/tree/testing/random_leaf_generator.hpp b/src/turtle_kv/tree/testing/random_leaf_generator.hpp index 7f128be..152d707 100644 --- a/src/turtle_kv/tree/testing/random_leaf_generator.hpp +++ b/src/turtle_kv/tree/testing/random_leaf_generator.hpp @@ -72,7 +72,7 @@ class RandomLeafGenerator // Compute a running total of packed sizes, so we can split the result set in to leaf pages. // batt::RunningTotal running_total = - compute_running_total(worker_pool, result.result_set, DecayToItem{}); + compute_running_total(worker_pool, result.result_set, DecayToItem{}); SplitParts page_parts = split_parts( // running_total, // From 47dc60483cb5c92b90c9a4ad62e79c80f393ca77 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 12 Nov 2025 11:23:54 -0500 Subject: [PATCH 02/48] Added some more comments --- src/turtle_kv/tree/in_memory_node.cpp | 45 +++++++++++++++++++++------ 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 032c566..0c97057 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -703,19 +703,18 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i i32 right_sibling = pivot_i + 1; i32 left_sibling = pivot_i - 1; - bool need_compaction = false; + bool need_update_buffer_compaction = false; u64 active_segmented_levels = this->update_buffer.compute_active_segmented_levels(); if (pivot_i == 0) { sibling_i = right_sibling; - if (get_bit(active_segmented_levels, pivot_i)) { - need_compaction = true; - } - } else if ((usize)pivot_i == this->children.size() - 1) { + } else if ((usize)pivot_i == this->pivot_count() - 1) { sibling_i = left_sibling; - if (get_bit(active_segmented_levels, left_sibling)) { - need_compaction = true; - } } else { + // If we don't have one of the edge cases, try and pick the sibling where the leftmost of + // {child, sibling} is inactive in all segmented levels. This way, the final merged pivot + // won't have on/off flushed ranges in segments. If this is not possible, pick the right + // sibling. + // if (!get_bit(active_segmented_levels, pivot_i)) { sibling_i = right_sibling; } else { @@ -723,12 +722,15 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i sibling_i = left_sibling; } else { sibling_i = right_sibling; - need_compaction = true; } } } BATT_CHECK_NE(pivot_i, sibling_i); + if (get_bit(active_segmented_levels, std::min(pivot_i, sibling_i))) { + need_update_buffer_compaction = true; + } + BATT_REQUIRE_OK(this->children[sibling_i].to_in_memory_subtree(update_context.page_loader, this->tree_options, this->height - 1)); @@ -744,6 +746,8 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i if (!*status_or_merged) { if (!batt::is_case(child.get_viability())) { + // If the full merge wasn't possible, try borrowing from the sibling. + // BATT_ASSIGN_OK_RESULT(KeyView new_pivot_key, child.try_borrow(update_context, sibling)); this->pivot_keys_[std::max(pivot_i, sibling_i)] = new_pivot_key; @@ -766,7 +770,7 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // Update the update_buffer levels. // - if (need_compaction) { + if (need_update_buffer_compaction) { BATT_REQUIRE_OK(this->compact_update_buffer_levels(update_context)); } else { for (Level& level : this->update_buffer.levels) { @@ -1071,6 +1075,9 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemory BATT_CHECK_LT(this->pivot_count(), 4); u32 num_pivots_to_borrow = 4 - this->pivot_count(); + // Calculate the pivot range to borrow from the sibling, and then extract updates from the + // sibling's update buffer that contain this range. + // i32 borrowed_min_pivot_i = -1; KeyView borrowed_max_pivot_key; if (right_sibling) { @@ -1116,6 +1123,8 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemory borrowed_pivot_batch.edit_size_totals = None; + // Borrow node metadata from the sibling. + // if (right_sibling) { this->pending_bytes.insert(this->pending_bytes.end(), sibling.pending_bytes.begin(), @@ -1123,12 +1132,18 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemory sibling.pending_bytes.erase(sibling.pending_bytes.begin(), sibling.pending_bytes.begin() + num_pivots_to_borrow); + // Update this->pending_bytes_is_exact by placing the borrowing pending bytes bits from the + // right sibling right after the pending bytes bits for this node. + // u64 borrowed_pending_bytes_exact = sibling.pending_bytes_is_exact & ((u64{1} << num_pivots_to_borrow) - 1); u64 mask = ((u64{1} << num_pivots_to_borrow) - 1) << (this->pivot_count() - 1); this->pending_bytes_is_exact = (this->pending_bytes_is_exact & ~mask) | (borrowed_pending_bytes_exact << (this->pivot_count() - 1)); + // Get rid of the key upper bound in this node and insert the borrowed pivot keys, including + // one past num_pivots_to_borrow, to set the new key upper bound. + // this->pivot_keys_.pop_back(); this->pivot_keys_.insert(this->pivot_keys_.end(), sibling.pivot_keys_.begin(), @@ -1159,6 +1174,9 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemory sibling.pending_bytes.erase(sibling.pending_bytes.end() - num_pivots_to_borrow, sibling.pending_bytes.end()); + // Shift this->pending_bytes_is_exact up by num_pivots_to_borrow, and place the borrowed + // pending bytes bits at the lowest order bits. + // u64 borrowed_pending_bytes_exact = sibling.pending_bytes_is_exact >> (64 - num_pivots_to_borrow); this->pending_bytes_is_exact <<= num_pivots_to_borrow; @@ -1188,8 +1206,13 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemory sibling.children.back().get_max_key(context.page_loader, sibling.child_pages.back())); } + // Now that metadata has been borrowed, inserted the borrowed updates into the update buffer. + // BATT_REQUIRE_OK(this->update_buffer_insert(borrowed_pivot_batch)); + // Adjust the update buffer levels metadata in the sibling now that the borrowed updates have + // been extracted. + // for (Level& level : sibling.update_buffer.levels) { batt::case_of( // level, // @@ -1220,6 +1243,8 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemory }); } + // Calculate and return the new pivot key for the parent. + // KeyView left_child_max; KeyView right_child_min; if (right_sibling) { From 99d750feb66c7df73d53578c7aec614d789b293c Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 12 Nov 2025 11:28:19 -0500 Subject: [PATCH 03/48] Remove conan file changes --- conanfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 0d0b7ef..9492d30 100644 --- a/conanfile.py +++ b/conanfile.py @@ -60,7 +60,7 @@ def requirements(self): self.requires("gperftools/2.16", **VISIBLE) self.requires("llfs/0.42.0", **VISIBLE) self.requires("pcg-cpp/cci.20220409", **VISIBLE) - self.requires("vqf/0.2.5-devel", **VISIBLE) + self.requires("vqf/0.2.5", **VISIBLE) self.requires("zlib/1.3.1", **OVERRIDE) if platform.system() == "Linux": From 969c140a2b555c86eba5cddf2a5300ff66737ea8 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Sat, 15 Nov 2025 12:38:18 -0500 Subject: [PATCH 04/48] Some bug fixes; new test case --- src/turtle_kv/core/merge_compactor.cpp | 2 + src/turtle_kv/tree/in_memory_leaf.cpp | 26 ++-- src/turtle_kv/tree/in_memory_node.cpp | 5 +- src/turtle_kv/tree/in_memory_node.test.cpp | 159 ++++++++++++++++++++- src/turtle_kv/tree/subtree.cpp | 24 +++- src/turtle_kv/tree/subtree.hpp | 2 +- 6 files changed, 202 insertions(+), 16 deletions(-) diff --git a/src/turtle_kv/core/merge_compactor.cpp b/src/turtle_kv/core/merge_compactor.cpp index a50409e..963d2ce 100644 --- a/src/turtle_kv/core/merge_compactor.cpp +++ b/src/turtle_kv/core/merge_compactor.cpp @@ -495,6 +495,8 @@ template chunk_from_second.offset += first_size; }); + ans.chunks_.back().offset = first_size + second.chunks_.back().offset; + first.clear(); second.clear(); diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index 1708bf1..a5f5e55 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -159,17 +159,25 @@ StatusOr> InMemoryLeaf::try_merge(BatchUpdateConte // Concatenate the two leaves' result sets in the correct order. // - bool right_sibling = this->get_max_key() < sibling.get_min_key(); - if (right_sibling) { - merged_leaf->result_set = - MergeCompactor::ResultSet::concat(std::move(this->result_set), - std::move(sibling.result_set)); + if (this->result_set.empty()) { + merged_leaf->result_set = std::move(sibling.result_set); + merged_leaf->shared_edit_size_totals_ = std::move(sibling.shared_edit_size_totals_); + merged_leaf->edit_size_totals.emplace(merged_leaf->shared_edit_size_totals_->begin(), + merged_leaf->shared_edit_size_totals_->end()); } else { - merged_leaf->result_set = MergeCompactor::ResultSet::concat(std::move(sibling.result_set), - std::move(this->result_set)); - } + bool right_sibling = this->get_max_key() < sibling.get_min_key(); + if (right_sibling) { + merged_leaf->result_set = + MergeCompactor::ResultSet::concat(std::move(this->result_set), + std::move(sibling.result_set)); + } else { + merged_leaf->result_set = + MergeCompactor::ResultSet::concat(std::move(sibling.result_set), + std::move(this->result_set)); + } - merged_leaf->set_edit_size_totals(context.compute_running_total(merged_leaf->result_set)); + merged_leaf->set_edit_size_totals(context.compute_running_total(merged_leaf->result_set)); + } return {std::move(merged_leaf)}; } diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 0c97057..37da683 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -731,13 +731,14 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i need_update_buffer_compaction = true; } - BATT_REQUIRE_OK(this->children[sibling_i].to_in_memory_subtree(update_context.page_loader, + BATT_REQUIRE_OK(this->children[sibling_i].to_in_memory_subtree(update_context, this->tree_options, this->height - 1)); // Call child.try_merge(). // Subtree& sibling = this->children[sibling_i]; + BATT_CHECK(batt::is_case(sibling.get_viability())); StatusOr> status_or_merged = child.try_merge(update_context, sibling); if (!status_or_merged.ok()) { LOG(ERROR) << BATT_INSPECT(child.get_viability()); @@ -811,7 +812,7 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // Finally, split the newly merged child if needed. // - SubtreeViability merged_viability = merged_subtree.get_viability(); + SubtreeViability merged_viability = this->children[pivot_to_overwrite].get_viability(); if (batt::is_case(merged_viability)) { BATT_REQUIRE_OK(this->split_child(update_context, pivot_to_overwrite)); } else { diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index dc0f3ea..4e24c5b 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -56,6 +56,7 @@ using turtle_kv::KVStoreScanner; using turtle_kv::LatencyMetric; using turtle_kv::LatencyTimer; using turtle_kv::make_memory_page_cache; +using turtle_kv::NeedsMerge; using turtle_kv::NeedsSplit; using turtle_kv::None; using turtle_kv::OkStatus; @@ -74,7 +75,9 @@ using turtle_kv::TreeOptions; using turtle_kv::TreeSerializeContext; using turtle_kv::ValueView; using turtle_kv::testing::RandomResultSetGenerator; +using turtle_kv::testing::RandomStringGenerator; +using llfs::get_key; using llfs::StableStringStore; using batt::getenv_as; @@ -368,7 +371,7 @@ void SubtreeBatchUpdateScenario::run() std::vector pending_deletes; - for (usize i = 0; i < max_i; ++i) { + for (usize i = 0; i < max_i; ++i) { BatchUpdate update{ .context = BatchUpdateContext{ @@ -509,4 +512,158 @@ void SubtreeBatchUpdateScenario::run() } } +TEST(InMemoryNodeTest, SubtreeDeletions) +{ + const usize key_size = 24; + const usize value_size = 100; + + TreeOptions tree_options = TreeOptions::with_default_values() // + .set_leaf_size(32 * kKiB) + .set_node_size(4 * kKiB) + .set_key_size_hint(key_size) + .set_value_size_hint(value_size); + + usize items_per_leaf = tree_options.flush_size() / tree_options.expected_item_size(); + usize total_batches = 81; + + std::vector keys; + keys.reserve(total_batches * items_per_leaf); + + std::string value_str = std::string(value_size, 'a'); + ValueView value = ValueView::from_str(value_str); + + std::default_random_engine rng{/*seed=*/1}; + RandomStringGenerator generate_key; + for (usize i = 0; i < total_batches * items_per_leaf; ++i) { + keys.emplace_back(generate_key(rng)); + } + std::sort(keys.begin(), keys.end(), llfs::KeyOrder{}); + keys.erase(std::unique(keys.begin(), + keys.end(), + [](const auto& l, const auto& r) { + return get_key(l) == get_key(r); + }), + keys.end()); + BATT_CHECK_EQ(keys.size(), total_batches * items_per_leaf); + + std::shared_ptr page_cache = + make_memory_page_cache(batt::Runtime::instance().default_scheduler(), + tree_options, + /*byte_capacity=*/1500 * kMiB); + + Subtree tree = Subtree::make_empty(); + + ASSERT_TRUE(tree.is_serialized()); + + batt::WorkerPool& worker_pool = batt::WorkerPool::null_pool(); + + Optional page_loader{*page_cache}; + + const auto create_insertion_batch = [&](usize batch_number) -> std::vector { + std::vector current_batch; + current_batch.reserve(items_per_leaf); + for (usize j = 0; j < items_per_leaf; ++j) { + current_batch.emplace_back(keys[(batch_number * items_per_leaf) + j], value); + } + + return current_batch; + }; + + const auto create_deletion_batch = [&](usize batch_number) -> std::vector { + std::vector current_batch; + current_batch.reserve(items_per_leaf); + + usize per_batch = items_per_leaf / total_batches; + usize batch_remainder = items_per_leaf % total_batches; + usize total_amount_per_batch = per_batch + (batch_number < batch_remainder ? 1 : 0); + + for (usize i = 0; i < total_batches; ++i) { + usize base_i = i * items_per_leaf; + usize offset = batch_number * per_batch + std::min(batch_number, batch_remainder); + + for (usize j = 0; j < total_amount_per_batch; ++j) { + current_batch.emplace_back(keys[base_i + offset + j], ValueView::deleted()); + } + } + BATT_CHECK_LE(current_batch.size(), items_per_leaf) << BATT_INSPECT(batch_number); + + return current_batch; + }; + + const auto apply_tree_updates = [&](auto batch_creation_func) { + for (usize i = 0; i < total_batches; ++i) { + std::vector current_batch = batch_creation_func(i); + // LOG(INFO) << "current batch: " << i << ", size: " << current_batch.size(); + + ResultSet result; + result.append(std::move(current_batch)); + + BatchUpdate update{ + .context = + BatchUpdateContext{ + .worker_pool = worker_pool, + .page_loader = *page_loader, + .cancel_token = batt::CancelToken{}, + }, + .result_set = std::move(result), + .edit_size_totals = None, + }; + update.update_edit_size_totals(); + + StatusOr tree_height = tree.get_height(*page_loader); + ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); + // LOG(INFO) << "tree height at batch_number " << i << ": " << *tree_height; + + Status status = // + tree.apply_batch_update(tree_options, + ParentNodeHeight{*tree_height + 1}, + update, + /*key_upper_bound=*/global_max_key(), + IsRoot{true}); + + ASSERT_TRUE(status.ok()) << BATT_INSPECT(status) << BATT_INSPECT(i); + ASSERT_FALSE(tree.is_serialized()); + ASSERT_FALSE(batt::is_case(tree.get_viability())); + } + }; + + apply_tree_updates(create_insertion_batch); + + std::unique_ptr page_job = page_cache->new_job(); + TreeSerializeContext context{tree_options, *page_job, worker_pool}; + + Status start_status = tree.start_serialize(context); + ASSERT_TRUE(start_status.ok()) << BATT_INSPECT(start_status); + + Status build_status = context.build_all_pages(); + ASSERT_TRUE(build_status.ok()) << BATT_INSPECT(build_status); + + StatusOr finish_status = tree.finish_serialize(context); + ASSERT_TRUE(finish_status.ok()) << BATT_INSPECT(finish_status); + + page_job->new_root(*finish_status); + Status commit_status = llfs::unsafe_commit_job(std::move(page_job)); + ASSERT_TRUE(commit_status.ok()) << BATT_INSPECT(commit_status); + + page_loader.emplace(*page_cache); + + apply_tree_updates(create_deletion_batch); + + StatusOr tree_height = tree.get_height(*page_loader); + ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); + + /*BatchUpdateContext update_context{ + .worker_pool = worker_pool, + .page_loader = *page_loader, + .cancel_token = batt::CancelToken{}, + }; + while (*tree_height > 2) { + Status flush_status = tree.try_flush(update_context); + ASSERT_TRUE(flush_status.ok()); + + tree_height = tree.get_height(*page_loader); + ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); + } */ +} + } // namespace diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 4c6b659..8adfddc 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -256,6 +256,14 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, return OkStatus(); }, [&](NeedsSplit needs_split) { + if (needs_split.too_many_segments && !needs_split.too_many_pivots && + !needs_split.keys_too_large) { + Status flush_status = new_subtree->try_flush(update.context); + if (flush_status.ok() && batt::is_case(new_subtree->get_viability())) { + return OkStatus(); + } + } + Status status = new_subtree->split_and_grow(update.context, tree_options, key_upper_bound); @@ -626,7 +634,7 @@ StatusOr Subtree::try_borrow(BatchUpdateContext& context, Subtree& sibl BATT_CHECK(batt::is_case>(sibling.impl_)); auto& sibling_node_ptr = std::get>(sibling.impl_); BATT_CHECK(sibling_node_ptr); - + return node->try_borrow(context, *sibling_node_ptr); }); } @@ -752,7 +760,7 @@ void Subtree::lock() //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -Status Subtree::to_in_memory_subtree(llfs::PageLoader& page_loader, +Status Subtree::to_in_memory_subtree(BatchUpdateContext& context, const TreeOptions& tree_options, i32 height) { @@ -766,7 +774,7 @@ Status Subtree::to_in_memory_subtree(llfs::PageLoader& page_loader, llfs::PageLayoutId expected_layout = Subtree::expected_layout_for_height(height); StatusOr status_or_pinned_page = page_id_slot.load_through( - page_loader, + context.page_loader, llfs::PageLoadOptions{ expected_layout, llfs::PinPageToJob::kDefault, @@ -780,6 +788,16 @@ Status Subtree::to_in_memory_subtree(llfs::PageLoader& page_loader, if (height == 1) { auto new_leaf = std::make_unique(batt::make_copy(pinned_page), tree_options); + const PackedLeafPage& packed_leaf = PackedLeafPage::view_of(pinned_page); + + std::vector items; + for (const PackedKeyValue& pkv : packed_leaf.items_slice()) { + items.emplace_back(to_edit_view(pkv)); + } + new_leaf->result_set.append(std::move(items)); + + new_leaf->set_edit_size_totals(context.compute_running_total(new_leaf->result_set)); + this->impl_ = std::move(new_leaf); } else { const PackedNodePage& packed_node = PackedNodePage::view_of(pinned_page); diff --git a/src/turtle_kv/tree/subtree.hpp b/src/turtle_kv/tree/subtree.hpp index a3e296b..3f883cb 100644 --- a/src/turtle_kv/tree/subtree.hpp +++ b/src/turtle_kv/tree/subtree.hpp @@ -187,7 +187,7 @@ class Subtree /** \brief Converts a serialized Subtree to its in-memory equivalent. */ - Status to_in_memory_subtree(llfs::PageLoader& page_loader, + Status to_in_memory_subtree(BatchUpdateContext& context, const TreeOptions& tree_options, i32 height); From ca5b89ba674311b476d6658a31232cdf52b47549 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 17 Nov 2025 00:12:45 -0500 Subject: [PATCH 05/48] More bug fixes --- src/turtle_kv/kv_store_scanner.cpp | 3 +- src/turtle_kv/tree/in_memory_leaf.cpp | 35 +++-- src/turtle_kv/tree/in_memory_leaf.hpp | 2 +- src/turtle_kv/tree/in_memory_node.cpp | 128 +++++++++--------- src/turtle_kv/tree/in_memory_node.hpp | 14 +- src/turtle_kv/tree/in_memory_node.test.cpp | 145 +++++++++++++++------ src/turtle_kv/tree/subtree.cpp | 27 +++- src/turtle_kv/tree/subtree.hpp | 10 +- 8 files changed, 228 insertions(+), 136 deletions(-) diff --git a/src/turtle_kv/kv_store_scanner.cpp b/src/turtle_kv/kv_store_scanner.cpp index 36b2d14..63fdb5b 100644 --- a/src/turtle_kv/kv_store_scanner.cpp +++ b/src/turtle_kv/kv_store_scanner.cpp @@ -461,7 +461,8 @@ Status KVStoreScanner::set_next_item() LatencyTimer timer{batt::Every2ToTheConst<8>{}, KVStoreScanner::metrics().heap_remove_latency}; this->heap_.remove_first(); - this->needs_resume_ = true; + //this->needs_resume_ = true; + BATT_REQUIRE_OK(this->resume()); } } diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index a5f5e55..2a2193c 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -151,34 +151,31 @@ auto InMemoryLeaf::make_split_plan() const -> StatusOr //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr> InMemoryLeaf::try_merge(BatchUpdateContext& context, - InMemoryLeaf& sibling) +StatusOr> InMemoryLeaf::try_merge( + BatchUpdateContext& context, + std::unique_ptr sibling) noexcept { + if (this->result_set.empty()) { + return {std::move(sibling)}; + } + auto merged_leaf = std::make_unique(batt::make_copy(this->pinned_leaf_page_), this->tree_options); // Concatenate the two leaves' result sets in the correct order. // - if (this->result_set.empty()) { - merged_leaf->result_set = std::move(sibling.result_set); - merged_leaf->shared_edit_size_totals_ = std::move(sibling.shared_edit_size_totals_); - merged_leaf->edit_size_totals.emplace(merged_leaf->shared_edit_size_totals_->begin(), - merged_leaf->shared_edit_size_totals_->end()); + bool right_sibling = this->get_max_key() < sibling->get_min_key(); + if (right_sibling) { + merged_leaf->result_set = + MergeCompactor::ResultSet::concat(std::move(this->result_set), + std::move(sibling->result_set)); } else { - bool right_sibling = this->get_max_key() < sibling.get_min_key(); - if (right_sibling) { - merged_leaf->result_set = - MergeCompactor::ResultSet::concat(std::move(this->result_set), - std::move(sibling.result_set)); - } else { - merged_leaf->result_set = - MergeCompactor::ResultSet::concat(std::move(sibling.result_set), - std::move(this->result_set)); - } - - merged_leaf->set_edit_size_totals(context.compute_running_total(merged_leaf->result_set)); + merged_leaf->result_set = MergeCompactor::ResultSet::concat(std::move(sibling->result_set), + std::move(this->result_set)); } + merged_leaf->set_edit_size_totals(context.compute_running_total(merged_leaf->result_set)); + return {std::move(merged_leaf)}; } diff --git a/src/turtle_kv/tree/in_memory_leaf.hpp b/src/turtle_kv/tree/in_memory_leaf.hpp index 879fee4..ca26bd7 100644 --- a/src/turtle_kv/tree/in_memory_leaf.hpp +++ b/src/turtle_kv/tree/in_memory_leaf.hpp @@ -93,7 +93,7 @@ struct InMemoryLeaf { StatusOr make_split_plan() const; StatusOr> try_merge(BatchUpdateContext& context, - InMemoryLeaf& sibling); + std::unique_ptr sibling) noexcept; Status start_serialize(TreeSerializeContext& context); diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 37da683..b2722fd 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -692,8 +692,16 @@ Status InMemoryNode::split_child(BatchUpdateContext& update_context, i32 pivot_i //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i) +Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i) noexcept { + // Special case: we have a tree composed of one node (root) and one leaf (its only child). + // In this case, don't progress with the rest of the function, as we are in the middle of a + // flush and shrink. + // + if (this->height == 2 && this->pivot_count() == 1) { + return OkStatus(); + } + Subtree& child = this->children[pivot_i]; // Decide which sibling to merge with. Edge cases: child that needs merge is the leftmost or @@ -763,8 +771,9 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // Erase rightmost of {child subtree, sibling} in this->child_pages, overwrite leftmost // with new PinnedPage{}. // - i32 pivot_to_erase = std::max(pivot_i, sibling_i); - i32 pivot_to_overwrite = std::min(pivot_i, sibling_i); + const i32 pivot_to_erase = std::max(pivot_i, sibling_i); + const i32 pivot_to_overwrite = std::min(pivot_i, sibling_i); + const usize old_pivot_count = this->pivot_count(); this->child_pages[pivot_to_overwrite] = llfs::PinnedPage{}; this->child_pages.erase(this->child_pages.begin() + pivot_to_erase); @@ -804,7 +813,7 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // this->pivot_keys_.erase(this->pivot_keys_.begin() + pivot_to_erase); - if ((usize)pivot_to_erase == this->children.size()) { + if ((usize)pivot_to_erase == old_pivot_count - 1) { BATT_ASSIGN_OK_RESULT( this->max_key_, this->children.back().get_max_key(update_context.page_loader, this->child_pages.back())); @@ -824,7 +833,7 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr> InMemoryNode::flush_and_shrink(BatchUpdateContext& context) +StatusOr> InMemoryNode::flush_and_shrink(BatchUpdateContext& context) noexcept { // If more than one pivot exists, nothings needs to be done. // @@ -833,7 +842,7 @@ StatusOr> InMemoryNode::flush_and_shrink(BatchUpdateContext& c return None; } - i32 single_pivot_i = 0; + const i32 single_pivot_i = 0; BATT_CHECK_EQ(this->pending_bytes.size(), 1); usize pending_bytes_count = this->pending_bytes[single_pivot_i]; @@ -857,7 +866,7 @@ StatusOr> InMemoryNode::flush_and_shrink(BatchUpdateContext& c //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // StatusOr> InMemoryNode::try_merge(BatchUpdateContext& context, - InMemoryNode& sibling) + InMemoryNode& sibling) noexcept { // If merging both full nodes will cause the merged node's pivot count to exceed the max // possible pivot count, return null so that we can try a borrow. @@ -873,6 +882,8 @@ StatusOr> InMemoryNode::try_merge(BatchUpdateConte const auto concat_metadata = [&](InMemoryNode& left, InMemoryNode& right) { new_node->max_key_ = right.max_key_; + new_node->height = left.height; + new_node->latest_flush_pivot_i_ = None; new_node->pending_bytes.insert(new_node->pending_bytes.end(), @@ -938,30 +949,12 @@ StatusOr> InMemoryNode::try_merge(BatchUpdateConte [&](SegmentedLevel& right_segmented_level) -> Status { // When merging a MergedLevel and a SegmentedLevel, create a new MergedLevel. // - MergedLevel new_merged_level; - HasPageRefs has_page_refs{false}; - Status segment_load_status; - - Slice levels_to_merge = - as_slice(right.update_buffer.levels.data() + i, usize{1}); - - BATT_ASSIGN_OK_RESULT( // - new_merged_level.result_set, - context.merge_compact_edits( // - global_max_key(), - [&](MergeCompactor& compactor) -> Status { - compactor.push_level(left_merged_level.result_set.live_edit_slices()); - right.push_levels_to_merge(compactor, - context.page_loader, - segment_load_status, - has_page_refs, - levels_to_merge, - /*min_pivot_i=*/0, - /*only_pivot=*/false); - return OkStatus(); - })); - - BATT_REQUIRE_OK(segment_load_status); + BATT_ASSIGN_OK_RESULT( + MergedLevel new_merged_level, + UpdateBuffer::merge_segmented_and_merged_level(context, + left_merged_level, + right_segmented_level, + right)); new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); @@ -982,31 +975,12 @@ StatusOr> InMemoryNode::try_merge(BatchUpdateConte return OkStatus(); }, [&](MergedLevel& right_merged_level) -> Status { - MergedLevel new_merged_level; - HasPageRefs has_page_refs{false}; - Status segment_load_status; - - Slice levels_to_merge = - as_slice(left.update_buffer.levels.data() + i, usize{1}); - - BATT_ASSIGN_OK_RESULT( // - new_merged_level.result_set, - context.merge_compact_edits( // - global_max_key(), - [&](MergeCompactor& compactor) -> Status { - left.push_levels_to_merge(compactor, - context.page_loader, - segment_load_status, - has_page_refs, - levels_to_merge, - /*min_pivot_i=*/0, - /*only_pivot=*/false); - compactor.push_level( - right_merged_level.result_set.live_edit_slices()); - return OkStatus(); - })); - - BATT_REQUIRE_OK(segment_load_status); + BATT_ASSIGN_OK_RESULT( + MergedLevel new_merged_level, + UpdateBuffer::merge_segmented_and_merged_level(context, + right_merged_level, + left_segmented_level, + left)); new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); @@ -1067,7 +1041,8 @@ StatusOr> InMemoryNode::try_merge(BatchUpdateConte //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) +StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, + InMemoryNode& sibling) noexcept { BATT_CHECK(batt::is_case(sibling.get_viability())); @@ -1212,7 +1187,7 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, InMemory BATT_REQUIRE_OK(this->update_buffer_insert(borrowed_pivot_batch)); // Adjust the update buffer levels metadata in the sibling now that the borrowed updates have - // been extracted. + // been extracted. // for (Level& level : sibling.update_buffer.levels) { batt::case_of( // @@ -2159,6 +2134,43 @@ void InMemoryNode::UpdateBuffer::SegmentedLevel::check_items_sorted( } } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +/*static*/ StatusOr InMemoryNode::UpdateBuffer::merge_segmented_and_merged_level( + BatchUpdateContext& context, + MergedLevel& merged_level, + SegmentedLevel& segmented_level, + InMemoryNode& segmented_level_node) noexcept +{ + MergedLevel new_merged_level; + HasPageRefs has_page_refs{false}; + Status segment_load_status; + + BoxedSeq segmented_level_slices = + SegmentedLevelScanner{ + segmented_level_node, + segmented_level, + context.page_loader, + llfs::PinPageToJob::kDefault, + segment_load_status, + /*min_pivot_i=*/0} // + | seq::boxed(); + + BATT_ASSIGN_OK_RESULT( // + new_merged_level.result_set, + context.merge_compact_edits( // + global_max_key(), + [&](MergeCompactor& compactor) -> Status { + compactor.push_level(merged_level.result_set.live_edit_slices()); + compactor.push_level(std::move(segmented_level_slices)); + return OkStatus(); + })); + + BATT_REQUIRE_OK(segment_load_status); + + return new_merged_level; +} + //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 7934f60..15e63f4 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -349,6 +349,12 @@ struct InMemoryNode { //+++++++++++-+-+--+----- --- -- - - - - + static StatusOr merge_segmented_and_merged_level( + BatchUpdateContext& context, // + MergedLevel& merged_level, + SegmentedLevel& segmented_level, + InMemoryNode& segmented_level_node) noexcept; + SmallFn dump() const; u64 compute_active_segmented_levels() const; @@ -543,13 +549,13 @@ struct InMemoryNode { /** \brief Merge the node with one of its siblings and return the newly merged node. */ StatusOr> try_merge(BatchUpdateContext& context, - InMemoryNode& sibling); + InMemoryNode& sibling) noexcept; /** \brief Attempts to make the node (that needs a merge) viable by borrowing data * from one of its siblings. If successful, returns the new pivot key to be set in the parent * of these two nodes to separate them. */ - StatusOr try_borrow(BatchUpdateContext& context, InMemoryNode& sibling); + StatusOr try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) noexcept; /** \brief Splits the specified child, inserting a new pivot immediately after `pivot_i`. */ @@ -557,13 +563,13 @@ struct InMemoryNode { /** \brief Merges the specified child with a sibling. */ - Status merge_child(BatchUpdateContext& update_context, i32 pivot_i); + Status merge_child(BatchUpdateContext& update_context, i32 pivot_i) noexcept; /** \brief If the node has a single pivot, attempts to flush updates out of the update buffer * to grow the number of pivots. If all the updates are flushed and still only a single pivot * remains, the single pivot (child) is returned. */ - StatusOr> flush_and_shrink(BatchUpdateContext& context); + StatusOr> flush_and_shrink(BatchUpdateContext& context) noexcept; /** \brief Returns true iff there are no MergedLevels or unserialized Subtree children in this * node. diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 4e24c5b..41b26f1 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -134,7 +134,7 @@ void verify_table_point_queries(Table& expected_table, Table& actual_table, Rng& } } -/*void verify_range_scan(LatencyMetric* scan_latency, +void verify_range_scan(LatencyMetric* scan_latency, Table& expected_table, const Slice>& actual_read_items, const KeyView& min_key, @@ -167,7 +167,7 @@ void verify_table_point_queries(Table& expected_table, Table& actual_table, Rng& ++expected_item_iter; ++actual_item_iter; } -} */ +} struct SubtreeBatchUpdateScenario { static std::atomic& size_tiered_count() @@ -397,7 +397,7 @@ void SubtreeBatchUpdateScenario::run() pending_deletes.clear(); } - if (i > 0) { + if (i % 5 == 0) { BATT_CHECK(pending_deletes.empty()); for (const EditView& edit : update.result_set.get()) { pending_deletes.emplace_back(edit.key); @@ -458,7 +458,7 @@ void SubtreeBatchUpdateScenario::run() << BATT_INSPECT(this->seed) << BATT_INSPECT(i); { - /*auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); + auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); std::unique_ptr scanner_page_job = page_cache->new_job(); const usize scan_len = pick_scan_len(rng); @@ -494,7 +494,7 @@ void SubtreeBatchUpdateScenario::run() as_slice(scan_items_buffer.data(), n_read), min_key, scan_len)) - << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); */ + << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); } if (my_id == 0) { @@ -516,6 +516,7 @@ TEST(InMemoryNodeTest, SubtreeDeletions) { const usize key_size = 24; const usize value_size = 100; + const usize chi = 4; TreeOptions tree_options = TreeOptions::with_default_values() // .set_leaf_size(32 * kKiB) @@ -532,7 +533,7 @@ TEST(InMemoryNodeTest, SubtreeDeletions) std::string value_str = std::string(value_size, 'a'); ValueView value = ValueView::from_str(value_str); - std::default_random_engine rng{/*seed=*/1}; + std::default_random_engine rng{/*seed=*/3}; RandomStringGenerator generate_key; for (usize i = 0; i < total_batches * items_per_leaf; ++i) { keys.emplace_back(generate_key(rng)); @@ -552,9 +553,11 @@ TEST(InMemoryNodeTest, SubtreeDeletions) /*byte_capacity=*/1500 * kMiB); Subtree tree = Subtree::make_empty(); - ASSERT_TRUE(tree.is_serialized()); + turtle_kv::OrderedMapTable> expected_table; + SubtreeTable actual_table{*page_cache, tree_options, tree}; + batt::WorkerPool& worker_pool = batt::WorkerPool::null_pool(); Optional page_loader{*page_cache}; @@ -590,10 +593,9 @@ TEST(InMemoryNodeTest, SubtreeDeletions) return current_batch; }; - const auto apply_tree_updates = [&](auto batch_creation_func) { + const auto apply_tree_updates = [&](auto batch_creation_func, bool perform_scan) { for (usize i = 0; i < total_batches; ++i) { std::vector current_batch = batch_creation_func(i); - // LOG(INFO) << "current batch: " << i << ", size: " << current_batch.size(); ResultSet result; result.append(std::move(current_batch)); @@ -610,60 +612,119 @@ TEST(InMemoryNodeTest, SubtreeDeletions) }; update.update_edit_size_totals(); - StatusOr tree_height = tree.get_height(*page_loader); - ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); - // LOG(INFO) << "tree height at batch_number " << i << ": " << *tree_height; + Status table_update_status = update_table(expected_table, update.result_set); + ASSERT_TRUE(table_update_status.ok()) << BATT_INSPECT(table_update_status); + + StatusOr tree_height_before = tree.get_height(*page_loader); + ASSERT_TRUE(tree_height_before.ok()) << BATT_INSPECT(tree_height_before); Status status = // tree.apply_batch_update(tree_options, - ParentNodeHeight{*tree_height + 1}, + ParentNodeHeight{*tree_height_before + 1}, update, /*key_upper_bound=*/global_max_key(), IsRoot{true}); ASSERT_TRUE(status.ok()) << BATT_INSPECT(status) << BATT_INSPECT(i); - ASSERT_FALSE(tree.is_serialized()); - ASSERT_FALSE(batt::is_case(tree.get_viability())); - } - }; - apply_tree_updates(create_insertion_batch); + StatusOr tree_height_after = tree.get_height(*page_loader); + ASSERT_TRUE(tree_height_after.ok()) << BATT_INSPECT(tree_height_after); + + if (*tree_height_after == 0) { + ASSERT_LT(*tree_height_after, *tree_height_before); + ASSERT_TRUE(tree.is_serialized()); + break; + } else { + ASSERT_FALSE(tree.is_serialized()); + } + + ASSERT_FALSE(batt::is_case(tree.get_viability())); - std::unique_ptr page_job = page_cache->new_job(); - TreeSerializeContext context{tree_options, *page_job, worker_pool}; + ASSERT_NO_FATAL_FAILURE( + verify_table_point_queries(expected_table, actual_table, rng, batt::log2_ceil(i))) + << BATT_INSPECT(i); - Status start_status = tree.start_serialize(context); - ASSERT_TRUE(start_status.ok()) << BATT_INSPECT(start_status); + if (((i + 1) % chi) == 0) { + LOG(INFO) << "Taking checkpoint..."; - Status build_status = context.build_all_pages(); - ASSERT_TRUE(build_status.ok()) << BATT_INSPECT(build_status); + std::unique_ptr page_job = page_cache->new_job(); + TreeSerializeContext context{tree_options, *page_job, worker_pool}; - StatusOr finish_status = tree.finish_serialize(context); - ASSERT_TRUE(finish_status.ok()) << BATT_INSPECT(finish_status); + Status start_status = tree.start_serialize(context); + ASSERT_TRUE(start_status.ok()) << BATT_INSPECT(start_status); - page_job->new_root(*finish_status); - Status commit_status = llfs::unsafe_commit_job(std::move(page_job)); - ASSERT_TRUE(commit_status.ok()) << BATT_INSPECT(commit_status); + Status build_status = context.build_all_pages(); + ASSERT_TRUE(build_status.ok()) << BATT_INSPECT(build_status); - page_loader.emplace(*page_cache); + StatusOr finish_status = tree.finish_serialize(context); + ASSERT_TRUE(finish_status.ok()) << BATT_INSPECT(finish_status); - apply_tree_updates(create_deletion_batch); + page_job->new_root(*finish_status); + Status commit_status = llfs::unsafe_commit_job(std::move(page_job)); + ASSERT_TRUE(commit_status.ok()) << BATT_INSPECT(commit_status); - StatusOr tree_height = tree.get_height(*page_loader); - ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); + ASSERT_NO_FATAL_FAILURE( + verify_table_point_queries(expected_table, actual_table, rng, batt::log2_ceil(i))) + << BATT_INSPECT(i); + + if (perform_scan) { + auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); + std::unique_ptr scanner_page_job = page_cache->new_job(); + + const usize scan_len = 20; + std::array, kMaxScanSize> scan_items_buffer; + KeyView min_key = update.result_set.get_min_key(); + + KVStoreScanner kv_scanner{*page_loader, + root_ptr->page_id_slot_or_panic(), + BATT_OK_RESULT_OR_PANIC(root_ptr->get_height(*page_loader)), + min_key, + tree_options.trie_index_sharded_view_size(), + None}; + + usize n_read = 0; + { + BATT_CHECK_OK(kv_scanner.start()); + for (auto& kv_pair : scan_items_buffer) { + Optional item = kv_scanner.next(); + if (!item) { + break; + } + kv_pair.first = item->key; + kv_pair.second = item->value; + ++n_read; + if (n_read == scan_len) { + break; + } + } + } + ASSERT_NO_FATAL_FAILURE(verify_range_scan(nullptr, + expected_table, + as_slice(scan_items_buffer.data(), n_read), + min_key, + scan_len)) + << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); + } - /*BatchUpdateContext update_context{ - .worker_pool = worker_pool, - .page_loader = *page_loader, - .cancel_token = batt::CancelToken{}, + page_loader.emplace(*page_cache); + } + } }; - while (*tree_height > 2) { - Status flush_status = tree.try_flush(update_context); - ASSERT_TRUE(flush_status.ok()); - tree_height = tree.get_height(*page_loader); + LOG(INFO) << "Inserting key/value pairs into tree.."; + apply_tree_updates(create_insertion_batch, false); + + LOG(INFO) << "Deleting key/value pairs from tree..."; + for (usize i = 0; i < total_batches; ++i) { + bool perform_scan = i == 0 ? true : false; + StatusOr tree_height = tree.get_height(*page_loader); ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); - } */ + if (*tree_height > 0) { + apply_tree_updates(create_deletion_batch, perform_scan); + } else { + break; + } + } } } // namespace diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 8adfddc..5e99a1d 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -321,7 +321,7 @@ Status Subtree::split_and_grow(BatchUpdateContext& context, //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -Status Subtree::flush_and_shrink(BatchUpdateContext& context) +Status Subtree::flush_and_shrink(BatchUpdateContext& context) noexcept { BATT_CHECK(!this->locked_.load()); @@ -345,6 +345,20 @@ Status Subtree::flush_and_shrink(BatchUpdateContext& context) return OkStatus(); } + if (batt::is_case>((*status_or_new_root)->impl_)) { + const auto& leaf_ptr = + std::get>((*status_or_new_root)->impl_); + BATT_CHECK(leaf_ptr); + + // If the new root that is returned is an empty leaf, set the root to be an empty + // subtree. + // + if (!leaf_ptr->get_items_size()) { + this->impl_ = llfs::PageIdSlot::from_page_id(llfs::PageId{}); + return OkStatus(); + } + } + this->impl_ = std::move((*status_or_new_root)->impl_); return OkStatus(); @@ -573,7 +587,8 @@ StatusOr> Subtree::try_split(BatchUpdateContext& context) //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr> Subtree::try_merge(BatchUpdateContext& context, Subtree& sibling) +StatusOr> Subtree::try_merge(BatchUpdateContext& context, + Subtree& sibling) noexcept { BATT_CHECK(!this->locked_.load()); @@ -592,7 +607,7 @@ StatusOr> Subtree::try_merge(BatchUpdateContext& context, Subt BATT_CHECK(sibling_leaf_ptr); BATT_ASSIGN_OK_RESULT(std::unique_ptr merged_leaf, // - leaf->try_merge(context, *sibling_leaf_ptr)); + leaf->try_merge(context, std::move(sibling_leaf_ptr))); return {Subtree{std::move(merged_leaf)}}; }, @@ -615,7 +630,7 @@ StatusOr> Subtree::try_merge(BatchUpdateContext& context, Subt //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr Subtree::try_borrow(BatchUpdateContext& context, Subtree& sibling) +StatusOr Subtree::try_borrow(BatchUpdateContext& context, Subtree& sibling) noexcept { BATT_CHECK(!this->locked_.load()); @@ -762,7 +777,7 @@ void Subtree::lock() // Status Subtree::to_in_memory_subtree(BatchUpdateContext& context, const TreeOptions& tree_options, - i32 height) + i32 height) noexcept { BATT_CHECK_GT(height, 0); @@ -771,7 +786,7 @@ Status Subtree::to_in_memory_subtree(BatchUpdateContext& context, BATT_CHECK(page_id_slot.is_valid()); - llfs::PageLayoutId expected_layout = Subtree::expected_layout_for_height(height); + const llfs::PageLayoutId expected_layout = Subtree::expected_layout_for_height(height); StatusOr status_or_pinned_page = page_id_slot.load_through( context.page_loader, diff --git a/src/turtle_kv/tree/subtree.hpp b/src/turtle_kv/tree/subtree.hpp index 3f883cb..31144ea 100644 --- a/src/turtle_kv/tree/subtree.hpp +++ b/src/turtle_kv/tree/subtree.hpp @@ -140,16 +140,16 @@ class Subtree /** \brief Attempts to merge the given Subtree with one of its siblings. If successful, the * newly merged Subtree is returned. - * + * * If no merge, returns None. */ - StatusOr> try_merge(BatchUpdateContext& context, Subtree& sibling); + StatusOr> try_merge(BatchUpdateContext& context, Subtree& sibling) noexcept; /** \brief Attempts to make the Subtree viable by borrowing data from one of its siblings. * Called when the Subtree needs a merge, but borrowing is the only option to make the tree * viable. */ - StatusOr try_borrow(BatchUpdateContext& context, Subtree& sibling); + StatusOr try_borrow(BatchUpdateContext& context, Subtree& sibling) noexcept; /** \brief Attempt to make the root viable by flushing a batch. */ @@ -189,7 +189,7 @@ class Subtree */ Status to_in_memory_subtree(BatchUpdateContext& context, const TreeOptions& tree_options, - i32 height); + i32 height) noexcept; //+++++++++++-+-+--+----- --- -- - - - - private: @@ -201,7 +201,7 @@ class Subtree * flushes the root's update buffer until its is either empty * (causing the tree to shrink in height) or until it gains more pivots. */ - Status flush_and_shrink(BatchUpdateContext& context); + Status flush_and_shrink(BatchUpdateContext& context) noexcept; //+++++++++++-+-+--+----- --- -- - - - - From 51a4e0264c57ff5c88554aa66b384a6a58c858a3 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 17 Nov 2025 20:38:07 -0500 Subject: [PATCH 06/48] KVStoreScanner updates --- src/turtle_kv/kv_store.cpp | 1 - src/turtle_kv/kv_store_scanner.cpp | 45 ++++++---------------- src/turtle_kv/kv_store_scanner.hpp | 8 +--- src/turtle_kv/tree/in_memory_node.test.cpp | 1 - 4 files changed, 12 insertions(+), 43 deletions(-) diff --git a/src/turtle_kv/kv_store.cpp b/src/turtle_kv/kv_store.cpp index 93b1e5f..b36122b 100644 --- a/src/turtle_kv/kv_store.cpp +++ b/src/turtle_kv/kv_store.cpp @@ -757,7 +757,6 @@ StatusOr KVStore::scan_keys(const KeyView& min_key, this->metrics_.scan_count.add(1); KVStoreScanner scanner{*this, min_key}; - scanner.set_keys_only(true); BATT_REQUIRE_OK(scanner.start()); return scanner.read_keys(items_out); diff --git a/src/turtle_kv/kv_store_scanner.cpp b/src/turtle_kv/kv_store_scanner.cpp index 63fdb5b..b1e8423 100644 --- a/src/turtle_kv/kv_store_scanner.cpp +++ b/src/turtle_kv/kv_store_scanner.cpp @@ -435,17 +435,15 @@ Status KVStoreScanner::set_next_item() ScanLevel* scan_level = this->heap_.first(); if (!this->next_item_) { - this->next_item_.emplace(scan_level->item(this->keys_only_)); + this->next_item_.emplace(scan_level->item()); } else if (this->next_item_->key == scan_level->key) { - if (!this->keys_only_ && this->next_item_->needs_combine()) { + if (this->next_item_->needs_combine()) { this->next_item_->value = combine(this->next_item_->value, scan_level->value()); } } else { - // TODO [vsilai 11-10-2025]: need to fix key only scans to look at values. - // - if (!this->keys_only_ && this->next_item_->value == ValueView::deleted()) { + if (this->next_item_->value == ValueView::deleted()) { this->next_item_ = None; continue; } else { @@ -575,7 +573,7 @@ Status KVStoreScanner::set_next_item() //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -EditView KVStoreScanner::ScanLevel::item(bool key_only) const +EditView KVStoreScanner::ScanLevel::item() const { return batt::case_of( this->state_impl, @@ -583,58 +581,37 @@ EditView KVStoreScanner::ScanLevel::item(bool key_only) const BATT_PANIC() << "illegal state"; BATT_UNREACHABLE(); }, - [this, key_only](const MemTableScanState& state) -> EditView { + [this](const MemTableScanState& state) -> EditView { MemTableEntry entry; const bool found = state.mem_table_->hash_index().find_key(this->key, entry); BATT_CHECK(found); - if (key_only) { - return EditView{entry.key_, ValueView{}}; - } return EditView{entry.key_, entry.value_}; }, - [this, key_only](const MemTableScanState& state) -> EditView { + [this](const MemTableScanState& state) -> EditView { const MemTableEntry* entry = state.mem_table_->hash_index().unsynchronized_find_key(key); BATT_CHECK_NOT_NULLPTR(entry); - if (key_only) { - return EditView{entry->key_, ValueView{}}; - } return EditView{entry->key_, entry->value_}; }, - [key_only](const MemTableValueScanState& state) -> EditView { + [](const MemTableValueScanState& state) -> EditView { const MemTableValueEntry& entry = state.art_scanner_->get_value(); - if (key_only) { - return EditView{entry.key_view(), ValueView{}}; - } return EditView{entry.key_view(), entry.value_view()}; }, - [key_only](const MemTableValueScanState& state) -> EditView { + [](const MemTableValueScanState& state) -> EditView { const MemTableValueEntry& entry = state.art_scanner_->get_value(); - if (key_only) { - return EditView{entry.key_view(), ValueView{}}; - } return EditView{entry.key_view(), entry.value_view()}; }, [](const Slice& state) -> EditView { return state.front(); }, - [this, key_only](const TreeLevelScanState& state) -> EditView { - if (key_only) { - return EditView{this->key, ValueView{}}; - } + [this](const TreeLevelScanState& state) -> EditView { return EditView{this->key, get_value(state.kv_slice.front())}; }, - [this, key_only](const TreeLevelScanShardedState& state) -> EditView { - if (key_only) { - return EditView{this->key, ValueView{}}; - } + [this](const TreeLevelScanShardedState& state) -> EditView { return EditView{this->key, state.kv_slice.front_value()}; }, - [this, key_only](const ShardedLeafScanState& state) -> EditView { - if (key_only) { - return EditView{this->key, ValueView{}}; - } + [this](const ShardedLeafScanState& state) -> EditView { return EditView{this->key, BATT_OK_RESULT_OR_PANIC(state.leaf_scanner_->front_value())}; }); } diff --git a/src/turtle_kv/kv_store_scanner.hpp b/src/turtle_kv/kv_store_scanner.hpp index dc7d26c..83c9468 100644 --- a/src/turtle_kv/kv_store_scanner.hpp +++ b/src/turtle_kv/kv_store_scanner.hpp @@ -198,7 +198,7 @@ class KVStoreScanner /** \brief Returns the current item as an EditView. */ - EditView item(bool key_only) const; + EditView item() const; /** \brief Returns the value of the current item. */ @@ -306,11 +306,6 @@ class KVStoreScanner StatusOr read_keys(const Slice& buffer); - void set_keys_only(bool b) noexcept - { - this->keys_only_ = b; - } - //+++++++++++-+-+--+----- --- -- - - - -- private: Status validate_page_layout(i32 height, const llfs::PinnedPage& pinned_page); @@ -353,7 +348,6 @@ class KVStoreScanner boost::container::static_vector tree_scan_path_; boost::container::small_vector scan_levels_; StackMerger heap_; - bool keys_only_ = false; Optional sharded_leaf_scanner_; }; diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 41b26f1..4b56c15 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -282,7 +282,6 @@ TEST(InMemoryNodeTest, Subtree) if (n_threads != 0) { runner.n_threads(n_threads); } - runner.n_threads(usize{1}); runner.n_seeds(n_seeds); if (n_seeds < 128) { From b126c03fb1a870fcf46278cce69b3fcdc320a774 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 17 Nov 2025 20:42:44 -0500 Subject: [PATCH 07/48] Subtree test config change --- src/turtle_kv/tree/in_memory_node.test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 4b56c15..80cd6da 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -326,7 +326,7 @@ void SubtreeBatchUpdateScenario::run() } TreeOptions tree_options = TreeOptions::with_default_values() // - .set_leaf_size(32 * kKiB) + .set_leaf_size(512 * kKiB) .set_node_size(4 * kKiB) .set_key_size_hint(24) .set_value_size_hint(100) From afa9940e62563cc48ff584e346c840f762e038a5 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Fri, 21 Nov 2025 09:48:05 -0500 Subject: [PATCH 08/48] First round of feedback --- src/turtle_kv/core/testing/generate.hpp | 12 ++-- src/turtle_kv/core/testing/generate.test.cpp | 25 +++++++- src/turtle_kv/kv_store_scanner.cpp | 16 ++++- src/turtle_kv/tree/in_memory_node.test.cpp | 2 +- src/turtle_kv/tree/packed_leaf_page.hpp | 61 ++++++++++++++++--- .../tree/testing/random_leaf_generator.hpp | 2 +- 6 files changed, 99 insertions(+), 19 deletions(-) diff --git a/src/turtle_kv/core/testing/generate.hpp b/src/turtle_kv/core/testing/generate.hpp index 06b4c37..4410276 100644 --- a/src/turtle_kv/core/testing/generate.hpp +++ b/src/turtle_kv/core/testing/generate.hpp @@ -188,13 +188,17 @@ class RandomResultSetGenerator : public MinMaxSize DecayToItem, Rng& rng, llfs::StableStringStore& store, - Optional> to_delete = None) + const std::vector& to_delete) { using ResultSet = MergeCompactor::ResultSet; using Item = typename ResultSet::value_type; const usize n = this->Super::pick_size(rng); std::vector items; + + for (const KeyView& delete_key : to_delete) { + items.emplace_back(delete_key, ValueView::deleted()); + } while (items.size() < n) { for (usize i = items.size(); i < n; ++i) { @@ -203,12 +207,6 @@ class RandomResultSetGenerator : public MinMaxSize ValueView::from_str(store.store(std::string(this->value_size_, ch)))); } - if (to_delete) { - for (const KeyView& delete_key : *to_delete) { - items.emplace_back(delete_key, ValueView::deleted()); - } - } - std::sort(items.begin(), items.end(), KeyOrder{}); items.erase(std::unique(items.begin(), items.end(), diff --git a/src/turtle_kv/core/testing/generate.test.cpp b/src/turtle_kv/core/testing/generate.test.cpp index b26c2a7..5076623 100644 --- a/src/turtle_kv/core/testing/generate.test.cpp +++ b/src/turtle_kv/core/testing/generate.test.cpp @@ -7,9 +7,14 @@ namespace { +using batt::int_types::usize; + using turtle_kv::DecayToItem; using turtle_kv::ItemView; using turtle_kv::KeyOrder; +using turtle_kv::KeyView; +using turtle_kv::StatusOr; +using turtle_kv::ValueView; using turtle_kv::testing::RandomResultSetGenerator; template @@ -24,10 +29,28 @@ TEST(GenerateTest, Test) g.set_size(200); - ResultSet result_set = g(DecayToItem{}, rng, store); + std::vector to_delete; + ResultSet result_set = g(DecayToItem{}, rng, store, to_delete); EXPECT_TRUE(std::is_sorted(result_set.get().begin(), result_set.get().end(), KeyOrder{})); EXPECT_EQ(result_set.get().size(), 200u); + + auto result_set_slice = result_set.get(); + usize i = 0; + for (const ItemView& edit : result_set_slice) { + if (i % 2) { + to_delete.emplace_back(edit.key); + } + ++i; + } + + ResultSet result_set_with_deletes = g(DecayToItem{}, rng, store, to_delete); + for (const KeyView& deleted_key : to_delete) { + StatusOr deleted_value = result_set_with_deletes.find_key(deleted_key); + EXPECT_TRUE(deleted_value.ok()); + EXPECT_EQ(*deleted_value, ValueView::deleted()); + } + EXPECT_EQ(to_delete.size(), result_set_with_deletes.size() / 2); } } // namespace diff --git a/src/turtle_kv/kv_store_scanner.cpp b/src/turtle_kv/kv_store_scanner.cpp index b1e8423..f3b5164 100644 --- a/src/turtle_kv/kv_store_scanner.cpp +++ b/src/turtle_kv/kv_store_scanner.cpp @@ -438,12 +438,26 @@ Status KVStoreScanner::set_next_item() this->next_item_.emplace(scan_level->item()); } else if (this->next_item_->key == scan_level->key) { + // If this->next_item_->key == scan_level->key, we need to search for a terminal value for + // the item, so combine it if necessary. + // if (this->next_item_->needs_combine()) { this->next_item_->value = combine(this->next_item_->value, scan_level->value()); } } else { + // If the item stored in this->next_item_ does not have the same key as the first key in + // the current scan_level, we have reached a terminal value for this->next_item_. Now, + // we have to decide whether we want to keep this->next_item_ and break from the loop + // (returning the item to the function's caller) OR discard it, because the terminal value + // represents a deleted item. + // if (this->next_item_->value == ValueView::deleted()) { + // The terminal value represents a deleted item, so discard it by setting this->next_item_ + // to None. Then, continue on to the next iteration of the loop, skipping the logic to + // advance the current scan_level. We do this because we now need to set the first key + // in the current scan_level to this->next_item_ to examine it next. + // this->next_item_ = None; continue; } else { @@ -459,7 +473,7 @@ Status KVStoreScanner::set_next_item() LatencyTimer timer{batt::Every2ToTheConst<8>{}, KVStoreScanner::metrics().heap_remove_latency}; this->heap_.remove_first(); - //this->needs_resume_ = true; + // this->needs_resume_ = true; BATT_REQUIRE_OK(this->resume()); } } diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 80cd6da..d1b1021 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -710,7 +710,7 @@ TEST(InMemoryNodeTest, SubtreeDeletions) } }; - LOG(INFO) << "Inserting key/value pairs into tree.."; + LOG(INFO) << "Inserting key/value pairs into tree..."; apply_tree_updates(create_insertion_batch, false); LOG(INFO) << "Deleting key/value pairs from tree..."; diff --git a/src/turtle_kv/tree/packed_leaf_page.hpp b/src/turtle_kv/tree/packed_leaf_page.hpp index 74e65d3..cb1b9ea 100644 --- a/src/turtle_kv/tree/packed_leaf_page.hpp +++ b/src/turtle_kv/tree/packed_leaf_page.hpp @@ -320,6 +320,8 @@ struct PackedLeafLayoutPlan { usize page_size; usize key_count; usize trie_index_reserved_size; + usize avg_key_len; + usize drop_count; usize trie_index_begin; usize trie_index_end; @@ -361,6 +363,8 @@ struct PackedLeafLayoutPlan { } void check_valid(std::string_view label) const; + + usize compute_trie_step_size() const; }; //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - @@ -370,6 +374,8 @@ BATT_OBJECT_PRINT_IMPL((inline), (page_size, key_count, trie_index_reserved_size, + avg_key_len, + drop_count, trie_index_begin, trie_index_end, leaf_header_begin, @@ -392,6 +398,43 @@ inline void PackedLeafLayoutPlan::check_valid(std::string_view label) const BATT_CHECK(this->is_valid()) << *this << BATT_INSPECT_STR(label); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +inline usize PackedLeafLayoutPlan::compute_trie_step_size() const +{ + BATT_CHECK_GT(this->key_count, 0); + BATT_CHECK_GT(this->avg_key_len, 0); + + // If there are no deleted items in this leaf, return 16. + // + if (this->drop_count == 0) { + return 16; + } + + usize trie_buffer_size = this->trie_index_end - this->trie_index_begin; + BATT_CHECK_GT(trie_buffer_size, 0); + + // Determine the number of pivot keys to intialize the trie with by using the size of the trie + // buffer and the average key length across the items in the leaf. + // + usize pivot_count = trie_buffer_size / this->avg_key_len; + usize step_size = (this->key_count + pivot_count - 1) / pivot_count; + + BATT_CHECK_GT(step_size, 0); + + // If the calculated step size is already a power of 2, return it now. + // + if ((step_size & (step_size - 1)) == 0) { + return step_size; + } + + // Otherwise, calculate the nearest power of 2 less than `step_size`. + // + i32 shift = batt::log2_floor(step_size); + BATT_CHECK_GE(shift, 0); + return usize{1} << shift; +} + //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- // class PackedLeafLayoutPlanBuilder @@ -423,6 +466,7 @@ class PackedLeafLayoutPlanBuilder plan.page_size = this->page_size; plan.key_count = BATT_CHECKED_CAST(u32, this->key_count); plan.trie_index_reserved_size = this->trie_index_reserved_size; + plan.avg_key_len = plan.key_count > 0 ? this->key_data_size / plan.key_count : 0; usize offset = 0; const auto append = [&offset](usize size) { @@ -514,19 +558,18 @@ struct AddLeafItemsSummary { LeafItemsSummary operator()(const LeafItemsSummary& prior, const EditView& edit) const noexcept { usize drop_count = prior.drop_count; - if (decays_to_item(edit)) { + if (!decays_to_item(edit)) { drop_count++; } return LeafItemsSummary{ - .drop_count = drop_count, - .key_count = prior.key_count + 1, - .key_data_size = prior.key_data_size + (edit.key.size() + 4), - .value_data_size = prior.value_data_size + (1 + edit.value.size()), + .drop_count = drop_count, + .key_count = prior.key_count + 1, + .key_data_size = prior.key_data_size + (edit.key.size() + 4), + .value_data_size = prior.value_data_size + (1 + edit.value.size()), }; } - LeafItemsSummary operator()(const LeafItemsSummary& prior, - const ItemView& edit) const noexcept + LeafItemsSummary operator()(const LeafItemsSummary& prior, const ItemView& edit) const noexcept { return AddLeafItemsSummary{}(BATT_FORWARD(prior), EditView::from_item_view(edit)); } @@ -565,6 +608,8 @@ template PackedLeafLayoutPlan plan = plan_builder.build(); + plan.drop_count = summary.drop_count; + return plan; } @@ -690,7 +735,7 @@ inline PackedLeafPage* build_leaf_page(MutableBuffer buffer, if (plan.trie_index_reserved_size > 0) { const MutableBuffer trie_buffer{(void*)advance_pointer(buffer.data(), plan.trie_index_begin), plan.trie_index_end - plan.trie_index_begin}; - usize step_size = 16; + usize step_size = plan.compute_trie_step_size(); bool retried = false; batt::SmallVec pivot_keys; for (;;) { diff --git a/src/turtle_kv/tree/testing/random_leaf_generator.hpp b/src/turtle_kv/tree/testing/random_leaf_generator.hpp index 152d707..eb1b74b 100644 --- a/src/turtle_kv/tree/testing/random_leaf_generator.hpp +++ b/src/turtle_kv/tree/testing/random_leaf_generator.hpp @@ -59,7 +59,7 @@ class RandomLeafGenerator // Generate a sorted run of random key/value pairs. // - result.result_set = this->items_generator_(decay_to_items, rng, store); + result.result_set = this->items_generator_(decay_to_items, rng, store, {}); batt::WorkerPool& worker_pool = batt::WorkerPool::null_pool(); From 8dbab04e3b18737c9829a5129ece3aeb9c842e3b Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 24 Nov 2025 17:44:46 -0500 Subject: [PATCH 09/48] Improvements to Subtree test --- src/turtle_kv/kv_store.cpp | 9 ++++++++ src/turtle_kv/tree/algo/nodes.hpp | 2 +- src/turtle_kv/tree/in_memory_node.test.cpp | 24 +++++++++++++++++++--- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/turtle_kv/kv_store.cpp b/src/turtle_kv/kv_store.cpp index b36122b..0713d51 100644 --- a/src/turtle_kv/kv_store.cpp +++ b/src/turtle_kv/kv_store.cpp @@ -653,6 +653,9 @@ StatusOr KVStore::get(const KeyView& key) noexcept /*override*/ if (value) { if (!value->needs_combine()) { this->metrics_.mem_table_get_count.add(1); + if (value->is_delete()) { + return {batt::StatusCode::kNotFound}; + } // VLOG(1) << "found key " << batt::c_str_literal(key) << " in active MemTable"; return *value; } @@ -677,11 +680,17 @@ StatusOr KVStore::get(const KeyView& key) noexcept /*override*/ *value = combine(*value, *delta_value); if (!value->needs_combine()) { this->metrics_.delta_log2_get_count[batt::log2_ceil(observed_deltas_size - i)].add(1); + if (value->is_delete()) { + return {batt::StatusCode::kNotFound}; + } return *value; } } else { if (!delta_value->needs_combine()) { this->metrics_.delta_log2_get_count[batt::log2_ceil(observed_deltas_size - i)].add(1); + if (delta_value->is_delete()) { + return {batt::StatusCode::kNotFound}; + } return *delta_value; } value = delta_value; diff --git a/src/turtle_kv/tree/algo/nodes.hpp b/src/turtle_kv/tree/algo/nodes.hpp index 8bb8fe5..50c066a 100644 --- a/src/turtle_kv/tree/algo/nodes.hpp +++ b/src/turtle_kv/tree/algo/nodes.hpp @@ -179,7 +179,7 @@ struct NodeAlgorithms { BATT_REQUIRE_OK(combine_in_place(&value, subtree_result)); - if (!value) { + if (!value || value->is_delete()) { return {batt::StatusCode::kNotFound}; } diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index d1b1021..211326d 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -134,6 +134,16 @@ void verify_table_point_queries(Table& expected_table, Table& actual_table, Rng& } } +void verify_deleted_point_queries(Table& expected_table, + Table& actual_table, + const std::vector& deleted_keys) +{ + for (const KeyView& key : deleted_keys) { + EXPECT_EQ(expected_table.get(key).status(), batt::StatusCode::kNotFound); + EXPECT_EQ(actual_table.get(key).status(), batt::StatusCode::kNotFound); + } +} + void verify_range_scan(LatencyMetric* scan_latency, Table& expected_table, const Slice>& actual_read_items, @@ -427,6 +437,10 @@ void SubtreeBatchUpdateScenario::run() verify_table_point_queries(expected_table, actual_table, rng, batt::log2_ceil(i))) << BATT_INSPECT(this->seed) << BATT_INSPECT(i); + ASSERT_NO_FATAL_FAILURE( + verify_deleted_point_queries(expected_table, actual_table, pending_deletes)) + << BATT_INSPECT(this->seed) << BATT_INSPECT(i); + if (((i + 1) % chi) == 0) { if (my_id == 0) { LOG(INFO) << "taking checkpoint..."; @@ -456,6 +470,10 @@ void SubtreeBatchUpdateScenario::run() verify_table_point_queries(expected_table, actual_table, rng, batt::log2_ceil(i))) << BATT_INSPECT(this->seed) << BATT_INSPECT(i); + ASSERT_NO_FATAL_FAILURE( + verify_deleted_point_queries(expected_table, actual_table, pending_deletes)) + << BATT_INSPECT(this->seed) << BATT_INSPECT(i); + { auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); std::unique_ptr scanner_page_job = page_cache->new_job(); @@ -663,8 +681,8 @@ TEST(InMemoryNodeTest, SubtreeDeletions) ASSERT_TRUE(commit_status.ok()) << BATT_INSPECT(commit_status); ASSERT_NO_FATAL_FAILURE( - verify_table_point_queries(expected_table, actual_table, rng, batt::log2_ceil(i))) - << BATT_INSPECT(i); + verify_table_point_queries(expected_table, actual_table, rng, batt::log2_ceil(i))) + << BATT_INSPECT(i); if (perform_scan) { auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); @@ -702,7 +720,7 @@ TEST(InMemoryNodeTest, SubtreeDeletions) as_slice(scan_items_buffer.data(), n_read), min_key, scan_len)) - << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); + << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); } page_loader.emplace(*page_cache); From 969efc0788cb175a4899f8c043eafc1140a7eb5f Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Fri, 28 Nov 2025 11:48:42 -0500 Subject: [PATCH 10/48] More feedback --- src/turtle_kv/core/merge_compactor.test.cpp | 51 ++++++++ src/turtle_kv/core/testing/generate.hpp | 18 ++- src/turtle_kv/kv_store.cpp | 33 ++--- src/turtle_kv/kv_store_scanner.cpp | 6 +- src/turtle_kv/tree/batch_update.cpp | 121 ------------------ src/turtle_kv/tree/batch_update.hpp | 130 ++++++++++++++++++-- src/turtle_kv/tree/in_memory_node.test.cpp | 20 +-- src/turtle_kv/tree/subtree.cpp | 9 +- 8 files changed, 220 insertions(+), 168 deletions(-) diff --git a/src/turtle_kv/core/merge_compactor.test.cpp b/src/turtle_kv/core/merge_compactor.test.cpp index 651371a..af42abc 100644 --- a/src/turtle_kv/core/merge_compactor.test.cpp +++ b/src/turtle_kv/core/merge_compactor.test.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -23,6 +24,8 @@ using namespace batt::int_types; using batt::as_seq; using batt::WorkerPool; +using llfs::StableStringStore; + using turtle_kv::CInterval; using turtle_kv::EditSlice; using turtle_kv::EditView; @@ -39,6 +42,8 @@ using turtle_kv::Status; using turtle_kv::StatusOr; using turtle_kv::ValueView; +using turtle_kv::testing::RandomStringGenerator; + namespace seq = turtle_kv::seq; constexpr usize kNumKeys = 16; @@ -482,4 +487,50 @@ TEST(MergeCompactor, ResultSetDropKeyRange) } } +TEST(MergeCompactor, ResultSetConcat) +{ + usize n = 200; + std::vector all_edits; + std::unordered_set keys_set; + llfs::StableStringStore store; + + std::string value_str = std::string(100, 'a'); + ValueView value = ValueView::from_str(value_str); + + std::default_random_engine rng{/*seed=*/30}; + RandomStringGenerator generate_key; + while (all_edits.size() < n) { + KeyView key = generate_key(rng, store); + if (keys_set.contains(key)) { + continue; + } + keys_set.emplace(key); + all_edits.emplace_back(key, value); + } + std::sort(all_edits.begin(), all_edits.end(), KeyOrder{}); + + std::vector first{all_edits.begin(), all_edits.begin() + (n / 2)}; + std::vector second{all_edits.begin() + (n / 2), all_edits.end()}; + + MergeCompactor::ResultSet first_result_set; + first_result_set.append(std::move(first)); + MergeCompactor::ResultSet second_result_set; + second_result_set.append(std::move(second)); + + EXPECT_EQ(first_result_set.size(), n / 2); + EXPECT_EQ(second_result_set.size(), n / 2); + + MergeCompactor::ResultSet concatenated_result_set = + MergeCompactor::ResultSet::concat(std::move(first_result_set), + std::move(second_result_set)); + + EXPECT_EQ(concatenated_result_set.size(), n); + + usize i = 0; + for (const EditView& edit : concatenated_result_set.get()) { + EXPECT_EQ(edit, all_edits[i]); + ++i; + } +} + } // namespace diff --git a/src/turtle_kv/core/testing/generate.hpp b/src/turtle_kv/core/testing/generate.hpp index 4410276..c48c112 100644 --- a/src/turtle_kv/core/testing/generate.hpp +++ b/src/turtle_kv/core/testing/generate.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace turtle_kv { @@ -195,16 +196,27 @@ class RandomResultSetGenerator : public MinMaxSize const usize n = this->Super::pick_size(rng); std::vector items; - + for (const KeyView& delete_key : to_delete) { items.emplace_back(delete_key, ValueView::deleted()); } + if (items.size() > n) { + ResultSet result; + result.append(std::move(items)); + return result; + } + std::unordered_set deleted_items_set{to_delete.begin(), to_delete.end()}; while (items.size() < n) { - for (usize i = items.size(); i < n; ++i) { + for (usize i = items.size(); i < n;) { char ch = '_' + (i & 31); - items.emplace_back(this->key_generator_(rng, store), + KeyView key = this->key_generator_(rng, store); + if (deleted_items_set.count(key)) { + continue; + } + items.emplace_back(key, ValueView::from_str(store.store(std::string(this->value_size_, ch)))); + ++i; } std::sort(items.begin(), items.end(), KeyOrder{}); diff --git a/src/turtle_kv/kv_store.cpp b/src/turtle_kv/kv_store.cpp index 0713d51..b324d8d 100644 --- a/src/turtle_kv/kv_store.cpp +++ b/src/turtle_kv/kv_store.cpp @@ -650,14 +650,19 @@ StatusOr KVStore::get(const KeyView& key) noexcept /*override*/ this->metrics_.mem_table_get_latency, observed_state->mem_table_->get(key)); + const auto return_memtable_value = + [](Optional mem_table_value, + FastCountMetric& get_count_metric) -> StatusOr { + get_count_metric.add(1); + if (mem_table_value->is_delete()) { + return {batt::StatusCode::kNotFound}; + } + return *mem_table_value; + }; + if (value) { if (!value->needs_combine()) { - this->metrics_.mem_table_get_count.add(1); - if (value->is_delete()) { - return {batt::StatusCode::kNotFound}; - } - // VLOG(1) << "found key " << batt::c_str_literal(key) << " in active MemTable"; - return *value; + return return_memtable_value(value, this->metrics_.mem_table_get_count); } } @@ -679,19 +684,15 @@ StatusOr KVStore::get(const KeyView& key) noexcept /*override*/ if (value) { *value = combine(*value, *delta_value); if (!value->needs_combine()) { - this->metrics_.delta_log2_get_count[batt::log2_ceil(observed_deltas_size - i)].add(1); - if (value->is_delete()) { - return {batt::StatusCode::kNotFound}; - } - return *value; + return return_memtable_value( + value, + this->metrics_.delta_log2_get_count[batt::log2_ceil(observed_deltas_size - i)]); } } else { if (!delta_value->needs_combine()) { - this->metrics_.delta_log2_get_count[batt::log2_ceil(observed_deltas_size - i)].add(1); - if (delta_value->is_delete()) { - return {batt::StatusCode::kNotFound}; - } - return *delta_value; + return return_memtable_value( + delta_value, + this->metrics_.delta_log2_get_count[batt::log2_ceil(observed_deltas_size - i)]); } value = delta_value; } diff --git a/src/turtle_kv/kv_store_scanner.cpp b/src/turtle_kv/kv_store_scanner.cpp index f3b5164..336b152 100644 --- a/src/turtle_kv/kv_store_scanner.cpp +++ b/src/turtle_kv/kv_store_scanner.cpp @@ -459,6 +459,9 @@ Status KVStoreScanner::set_next_item() // in the current scan_level to this->next_item_ to examine it next. // this->next_item_ = None; + if (this->needs_resume_) { + BATT_REQUIRE_OK(this->resume()); + } continue; } else { break; @@ -473,8 +476,7 @@ Status KVStoreScanner::set_next_item() LatencyTimer timer{batt::Every2ToTheConst<8>{}, KVStoreScanner::metrics().heap_remove_latency}; this->heap_.remove_first(); - // this->needs_resume_ = true; - BATT_REQUIRE_OK(this->resume()); + this->needs_resume_ = true; } } diff --git a/src/turtle_kv/tree/batch_update.cpp b/src/turtle_kv/tree/batch_update.cpp index 8e5675d..262564d 100644 --- a/src/turtle_kv/tree/batch_update.cpp +++ b/src/turtle_kv/tree/batch_update.cpp @@ -13,127 +13,6 @@ void BatchUpdate::update_edit_size_totals() this->context.compute_running_total(this->result_set)); } -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -void BatchUpdate::update_edit_size_totals_decayed( - const MergeCompactor::ResultSet& decayed_result_set) -{ - this->edit_size_totals.emplace( - this->context.compute_running_total(decayed_result_set)); -} - -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -void BatchUpdate::decay_batch_to_items( - MergeCompactor::ResultSet& output_result_set) -{ - const batt::TaskCount max_tasks{this->context.worker_pool.size() + 1}; - std::vector decayed_items; - - if (max_tasks == 1) { - for (const EditView& edit : this->result_set.get()) { - Optional maybe_item = to_item_view(edit); - if (maybe_item) { - decayed_items.emplace_back(EditView::from_item_view(*maybe_item)); - } - } - } else { - const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); - - auto actual_edits = result_set.get(); - const auto src_begin = actual_edits.begin(); - const auto src_end = actual_edits.end(); - - const batt::WorkSlicePlan plan{batt::WorkSliceParams{ - algo_defaults.copy_decayed_items.min_task_size, - max_tasks, - }, - src_begin, - src_end}; - - BATT_CHECK_GT(plan.n_tasks, 0); - - batt::SmallVec output_size_per_shard(plan.n_tasks); - BATT_CHECK_EQ(output_size_per_shard.size(), plan.n_tasks); - - // First count the number of non-decayed items in the output for each shard. - { - batt::ScopedWorkContext work_context{this->context.worker_pool}; - - BATT_CHECK_OK(batt::slice_work( - work_context, - plan, - /*gen_work_fn=*/ - [&](usize task_index, isize task_offset, isize task_size) { - return [src_begin, task_index, task_offset, task_size, &output_size_per_shard] { - BATT_CHECK_LT(task_index, output_size_per_shard.size()); - - auto task_src_begin = std::next(src_begin, task_offset); - const auto task_src_end = std::next(task_src_begin, task_size); - - usize output_size = 0; - - for (; task_src_begin != task_src_end; ++task_src_begin) { - if (decays_to_item(*task_src_begin)) { - output_size += 1; - } - } - output_size_per_shard[task_index] = output_size; - }; - })) - << "worker_pool must not be closed!"; - } - - // Change to a rolling sum and do the actual copy. - // - usize output_total_size = 0; - batt::SmallVec output_shard_offset; - for (usize output_shard_size : output_size_per_shard) { - output_shard_offset.emplace_back(output_total_size); - output_total_size += output_shard_size; - } - - decayed_items.resize(output_total_size); - { - this->context.worker_pool.reset(); - - batt::ScopedWorkContext work_context{this->context.worker_pool}; - - BATT_CHECK_OK( - batt::slice_work(work_context, - plan, - /*gen_work_fn=*/ - [&](usize task_index, isize task_offset, isize task_size) { - return [src_begin, - &output_shard_offset, - &output_size_per_shard, - task_index, - task_offset, - task_size, - &decayed_items] { - auto task_src_begin = std::next(src_begin, task_offset); - const auto task_src_end = std::next(task_src_begin, task_size); - - BATT_CHECK_LT(task_index, output_shard_offset.size()); - auto task_dst_begin = - std::next(decayed_items.data(), output_shard_offset[task_index]); - - for (; task_src_begin != task_src_end; ++task_src_begin) { - Optional maybe_item = to_item_view(*task_src_begin); - if (maybe_item) { - *task_dst_begin = EditView::from_item_view(*maybe_item); - ++task_dst_begin; - } - } - }; - })) - << "worker_pool must not be closed!"; - } - } - - output_result_set.append(std::move(decayed_items)); -} - //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // usize BatchUpdate::get_byte_size() diff --git a/src/turtle_kv/tree/batch_update.hpp b/src/turtle_kv/tree/batch_update.hpp index 7eeab1d..2b5150b 100644 --- a/src/turtle_kv/tree/batch_update.hpp +++ b/src/turtle_kv/tree/batch_update.hpp @@ -42,6 +42,12 @@ struct BatchUpdateContext { { return ::turtle_kv::compute_running_total(this->worker_pool, result_set); } + + /** \brief Returns a `ResultSet` with only the edits from the batch passed into the function + * that decay to base-level items (e.g., no tombstones). + */ + MergeCompactor::ResultSet decay_batch_to_items( + MergeCompactor::ResultSet& batch); }; //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- @@ -64,16 +70,6 @@ struct BatchUpdate { */ void update_edit_size_totals(); - /** \brief Resets `this->edit_size_totals` to reflect the decayed version of `this->result_set`. - */ - void update_edit_size_totals_decayed( - const MergeCompactor::ResultSet& decayed_result_set); - - /** \brief Fills the output buffer `ResultSet` passed into the function with only the - * edits from this batch that decay to base-level items (e.g., no tombstones). - */ - void decay_batch_to_items(MergeCompactor::ResultSet& output_result_set); - /** \brief Returns the inclusive (closed) interval of keys in this batch. */ CInterval get_key_crange() const @@ -118,4 +114,118 @@ inline StatusOr> BatchUpdateContext::me return compactor.read(edit_buffer, max_key); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +inline MergeCompactor::ResultSet BatchUpdateContext::decay_batch_to_items( + MergeCompactor::ResultSet& batch) +{ + const batt::TaskCount max_tasks{this->worker_pool.size() + 1}; + std::vector decayed_items; + + if (max_tasks == 1) { + for (const EditView& edit : batch.get()) { + Optional maybe_item = to_item_view(edit); + if (maybe_item) { + decayed_items.emplace_back(EditView::from_item_view(*maybe_item)); + } + } + } else { + const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); + + auto actual_edits = batch.get(); + const auto src_begin = actual_edits.begin(); + const auto src_end = actual_edits.end(); + + const batt::WorkSlicePlan plan{batt::WorkSliceParams{ + algo_defaults.copy_decayed_items.min_task_size, + max_tasks, + }, + src_begin, + src_end}; + + BATT_CHECK_GT(plan.n_tasks, 0); + + batt::SmallVec output_size_per_shard(plan.n_tasks); + BATT_CHECK_EQ(output_size_per_shard.size(), plan.n_tasks); + + // First count the number of non-decayed items in the output for each shard. + { + batt::ScopedWorkContext work_context{this->worker_pool}; + + BATT_CHECK_OK(batt::slice_work( + work_context, + plan, + /*gen_work_fn=*/ + [&](usize task_index, isize task_offset, isize task_size) { + return [src_begin, task_index, task_offset, task_size, &output_size_per_shard] { + BATT_CHECK_LT(task_index, output_size_per_shard.size()); + + auto task_src_begin = std::next(src_begin, task_offset); + const auto task_src_end = std::next(task_src_begin, task_size); + + usize output_size = 0; + + for (; task_src_begin != task_src_end; ++task_src_begin) { + if (decays_to_item(*task_src_begin)) { + output_size += 1; + } + } + output_size_per_shard[task_index] = output_size; + }; + })) + << "worker_pool must not be closed!"; + } + + // Change to a rolling sum and do the actual copy. + // + usize output_total_size = 0; + batt::SmallVec output_shard_offset; + for (usize output_shard_size : output_size_per_shard) { + output_shard_offset.emplace_back(output_total_size); + output_total_size += output_shard_size; + } + + decayed_items.resize(output_total_size); + { + this->worker_pool.reset(); + + batt::ScopedWorkContext work_context{this->worker_pool}; + + BATT_CHECK_OK( + batt::slice_work(work_context, + plan, + /*gen_work_fn=*/ + [&](usize task_index, isize task_offset, isize task_size) { + return [src_begin, + &output_shard_offset, + &output_size_per_shard, + task_index, + task_offset, + task_size, + &decayed_items] { + auto task_src_begin = std::next(src_begin, task_offset); + const auto task_src_end = std::next(task_src_begin, task_size); + + BATT_CHECK_LT(task_index, output_shard_offset.size()); + auto task_dst_begin = + std::next(decayed_items.data(), output_shard_offset[task_index]); + + for (; task_src_begin != task_src_end; ++task_src_begin) { + Optional maybe_item = to_item_view(*task_src_begin); + if (maybe_item) { + *task_dst_begin = EditView::from_item_view(*maybe_item); + ++task_dst_begin; + } + } + }; + })) + << "worker_pool must not be closed!"; + } + } + + MergeCompactor::ResultSet output_result_set; + output_result_set.append(std::move(decayed_items)); + return output_result_set; +} + } // namespace turtle_kv diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 211326d..8e9f1db 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -544,7 +544,7 @@ TEST(InMemoryNodeTest, SubtreeDeletions) usize items_per_leaf = tree_options.flush_size() / tree_options.expected_item_size(); usize total_batches = 81; - std::vector keys; + std::vector keys; keys.reserve(total_batches * items_per_leaf); std::string value_str = std::string(value_size, 'a'); @@ -552,17 +552,17 @@ TEST(InMemoryNodeTest, SubtreeDeletions) std::default_random_engine rng{/*seed=*/3}; RandomStringGenerator generate_key; - for (usize i = 0; i < total_batches * items_per_leaf; ++i) { - keys.emplace_back(generate_key(rng)); + llfs::StableStringStore store; + std::unordered_set keys_set; + while (keys.size() < total_batches * items_per_leaf) { + KeyView key = generate_key(rng, store); + if (keys_set.contains(key)) { + continue; + } + keys_set.emplace(key); + keys.emplace_back(key); } std::sort(keys.begin(), keys.end(), llfs::KeyOrder{}); - keys.erase(std::unique(keys.begin(), - keys.end(), - [](const auto& l, const auto& r) { - return get_key(l) == get_key(r); - }), - keys.end()); - BATT_CHECK_EQ(keys.size(), total_batches * items_per_leaf); std::shared_ptr page_cache = make_memory_page_cache(batt::Runtime::instance().default_scheduler(), diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 5e99a1d..fb85510 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -139,13 +139,10 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, auto new_leaf = std::make_unique(llfs::PinnedPage{}, tree_options); - update.decay_batch_to_items(new_leaf->result_set); + new_leaf->result_set = std::move(update.context.decay_batch_to_items(update.result_set)); - if (!update.edit_size_totals) { - update.update_edit_size_totals_decayed(new_leaf->result_set); - } - - new_leaf->set_edit_size_totals(std::move(*update.edit_size_totals)); + new_leaf->set_edit_size_totals( + update.context.compute_running_total(new_leaf->result_set)); update.edit_size_totals = None; return Subtree{std::move(new_leaf)}; From 84112d89f53559b78108a956ac5d68455977440f Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 3 Dec 2025 08:30:33 -0500 Subject: [PATCH 11/48] Small style updates --- src/turtle_kv/tree/batch_update.cpp | 3 +-- src/turtle_kv/tree/in_memory_node.cpp | 18 +++++++++--------- src/turtle_kv/tree/in_memory_node.hpp | 2 +- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/turtle_kv/tree/batch_update.cpp b/src/turtle_kv/tree/batch_update.cpp index 262564d..94babdd 100644 --- a/src/turtle_kv/tree/batch_update.cpp +++ b/src/turtle_kv/tree/batch_update.cpp @@ -9,8 +9,7 @@ using TrimResult = BatchUpdate::TrimResult; // void BatchUpdate::update_edit_size_totals() { - this->edit_size_totals.emplace( - this->context.compute_running_total(this->result_set)); + this->edit_size_totals.emplace(this->context.compute_running_total(this->result_set)); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index b2722fd..e0d98af 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -951,10 +951,10 @@ StatusOr> InMemoryNode::try_merge(BatchUpdateConte // BATT_ASSIGN_OK_RESULT( MergedLevel new_merged_level, - UpdateBuffer::merge_segmented_and_merged_level(context, - left_merged_level, - right_segmented_level, - right)); + UpdateBuffer::concat_segmented_and_merged_level(context, + left_merged_level, + right_segmented_level, + right)); new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); @@ -977,10 +977,10 @@ StatusOr> InMemoryNode::try_merge(BatchUpdateConte [&](MergedLevel& right_merged_level) -> Status { BATT_ASSIGN_OK_RESULT( MergedLevel new_merged_level, - UpdateBuffer::merge_segmented_and_merged_level(context, - right_merged_level, - left_segmented_level, - left)); + UpdateBuffer::concat_segmented_and_merged_level(context, + right_merged_level, + left_segmented_level, + left)); new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); @@ -2136,7 +2136,7 @@ void InMemoryNode::UpdateBuffer::SegmentedLevel::check_items_sorted( //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -/*static*/ StatusOr InMemoryNode::UpdateBuffer::merge_segmented_and_merged_level( +/*static*/ StatusOr InMemoryNode::UpdateBuffer::concat_segmented_and_merged_level( BatchUpdateContext& context, MergedLevel& merged_level, SegmentedLevel& segmented_level, diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 15e63f4..5cf41b7 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -349,7 +349,7 @@ struct InMemoryNode { //+++++++++++-+-+--+----- --- -- - - - - - static StatusOr merge_segmented_and_merged_level( + static StatusOr concat_segmented_and_merged_level( BatchUpdateContext& context, // MergedLevel& merged_level, SegmentedLevel& segmented_level, From 75e815a8e0a9f5417215cac47b9db3b3cee7896c Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 9 Dec 2025 09:41:32 -0500 Subject: [PATCH 12/48] More feedback, fix try_merge interface --- src/turtle_kv/core/merge_compactor.cpp | 6 + src/turtle_kv/core/merge_compactor.test.cpp | 183 +++++- src/turtle_kv/core/testing/generate.hpp | 5 - src/turtle_kv/tree/in_memory_leaf.cpp | 32 +- src/turtle_kv/tree/in_memory_node.cpp | 652 +++++++++++--------- src/turtle_kv/tree/in_memory_node.hpp | 17 +- src/turtle_kv/tree/in_memory_node.test.cpp | 2 - src/turtle_kv/tree/subtree.cpp | 38 +- src/turtle_kv/tree/subtree.hpp | 17 +- 9 files changed, 546 insertions(+), 406 deletions(-) diff --git a/src/turtle_kv/core/merge_compactor.cpp b/src/turtle_kv/core/merge_compactor.cpp index 963d2ce..a8b0b96 100644 --- a/src/turtle_kv/core/merge_compactor.cpp +++ b/src/turtle_kv/core/merge_compactor.cpp @@ -429,6 +429,12 @@ template /*static*/ auto MergeCompactor::ResultSet::concat(ResultSet&& first, ResultSet&& second) -> ResultSet { + if (first.size() > 0 && second.size() > 0) { + BATT_CHECK_LT(first.get_max_key(), second.get_min_key()) + << "All elements in the first ResultSet should be strictly less than the elements in the " + "second ResultSet!"; + } + ResultSet ans; //----- --- -- - - - - diff --git a/src/turtle_kv/core/merge_compactor.test.cpp b/src/turtle_kv/core/merge_compactor.test.cpp index af42abc..3bf2cf1 100644 --- a/src/turtle_kv/core/merge_compactor.test.cpp +++ b/src/turtle_kv/core/merge_compactor.test.cpp @@ -27,6 +27,7 @@ using batt::WorkerPool; using llfs::StableStringStore; using turtle_kv::CInterval; +using turtle_kv::DecayToItem; using turtle_kv::EditSlice; using turtle_kv::EditView; using turtle_kv::getenv_as; @@ -487,50 +488,176 @@ TEST(MergeCompactor, ResultSetDropKeyRange) } } -TEST(MergeCompactor, ResultSetConcat) +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +class ResultSetConcatTest : public ::testing::Test { - usize n = 200; - std::vector all_edits; - std::unordered_set keys_set; - llfs::StableStringStore store; - - std::string value_str = std::string(100, 'a'); - ValueView value = ValueView::from_str(value_str); - - std::default_random_engine rng{/*seed=*/30}; - RandomStringGenerator generate_key; - while (all_edits.size() < n) { - KeyView key = generate_key(rng, store); - if (keys_set.contains(key)) { - continue; + public: + void generate_edits(usize num_edits, bool needs_sort = true) + { + std::unordered_set keys_set; + + std::default_random_engine rng{/*seed=*/30}; + RandomStringGenerator generate_key; + while (this->all_edits_.size() < num_edits) { + KeyView key = generate_key(rng, this->store_); + if (keys_set.contains(key)) { + continue; + } + keys_set.emplace(key); + this->all_edits_.emplace_back(key, + ValueView::from_str(this->store_.store(std::string(100, 'a')))); + } + + if (needs_sort) { + std::sort(this->all_edits_.begin(), this->all_edits_.end(), KeyOrder{}); + } else { + if (std::is_sorted(this->all_edits_.begin(), this->all_edits_.end(), KeyOrder{})) { + std::swap(this->all_edits_.front(), this->all_edits_.back()); + } } - keys_set.emplace(key); - all_edits.emplace_back(key, value); } - std::sort(all_edits.begin(), all_edits.end(), KeyOrder{}); - std::vector first{all_edits.begin(), all_edits.begin() + (n / 2)}; - std::vector second{all_edits.begin() + (n / 2), all_edits.end()}; + template + MergeCompactor::ResultSet concat(std::vector&& first, + std::vector&& second, + DecayToItem decay_to_item) + { + usize first_size = first.size(); + usize second_size = second.size(); + + MergeCompactor::ResultSet first_result_set; + first_result_set.append(std::move(first)); + MergeCompactor::ResultSet second_result_set; + second_result_set.append(std::move(second)); + + EXPECT_EQ(first_result_set.size(), first_size); + EXPECT_EQ(second_result_set.size(), second_size); + + MergeCompactor::ResultSet concatenated_result_set = + MergeCompactor::ResultSet::concat(std::move(first_result_set), + std::move(second_result_set)); + + return concatenated_result_set; + } + + template + void verify_result_set(const MergeCompactor::ResultSet& result_set, + const std::vector& edits) + { + EXPECT_EQ(result_set.size(), edits.size()); + + usize i = 0; + for (const EditView& edit : result_set.get()) { + EXPECT_EQ(edit, edits[i]); + ++i; + } + } + + llfs::StableStringStore store_; + std::vector all_edits_; +}; + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +TEST_F(ResultSetConcatTest, Concat) +{ + // Generate an edit batch of size 200. + // + usize n = 200; + this->generate_edits(n); + + // Divide the edit batch in half, and create ResultSet objects out of each half. + // + std::vector first{this->all_edits_.begin(), this->all_edits_.begin() + (n / 2)}; + std::vector second{this->all_edits_.begin() + (n / 2), this->all_edits_.end()}; + + MergeCompactor::ResultSet concatenated_result_set = + this->concat(std::move(first), std::move(second), DecayToItem{}); + + // Concatenated ResultSet should have the same size as the original edit batch, and should + // also contain the same items in the same order. + // + this->verify_result_set(concatenated_result_set, this->all_edits_); + + // Now, repeat the process qith unequal sized inputs + // + first.assign(this->all_edits_.begin(), this->all_edits_.begin() + (n / 4)); + second.assign(this->all_edits_.begin() + (n / 4), this->all_edits_.end()); + + concatenated_result_set = this->concat(std::move(first), std::move(second), DecayToItem{}); + + this->verify_result_set(concatenated_result_set, this->all_edits_); + + // Finally, test with empty input. + // + first = {}; + second.assign(this->all_edits_.begin(), this->all_edits_.begin() + (n / 4)); + + concatenated_result_set = this->concat(std::move(first), std::move(second), DecayToItem{}); + + this->verify_result_set(concatenated_result_set, + {this->all_edits_.begin(), this->all_edits_.begin() + (n / 4)}); + + first = {}; + second = {}; + concatenated_result_set = this->concat(std::move(first), std::move(second), DecayToItem{}); + EXPECT_EQ(concatenated_result_set.size(), 0); +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +TEST_F(ResultSetConcatTest, FragmentedConcat) +{ + usize n = 200; + this->generate_edits(n); + + std::vector first{this->all_edits_.begin(), this->all_edits_.begin() + (n / 2)}; + std::vector second{this->all_edits_.begin() + (n / 2), this->all_edits_.end()}; MergeCompactor::ResultSet first_result_set; first_result_set.append(std::move(first)); MergeCompactor::ResultSet second_result_set; second_result_set.append(std::move(second)); - EXPECT_EQ(first_result_set.size(), n / 2); - EXPECT_EQ(second_result_set.size(), n / 2); + // Drop some keys fron the beginning of the ResultSet. + // + first_result_set.drop_before_n(n / 10); + + // Drop some keys in the middle of the ResultSet. + // + auto second_range_begin = this->all_edits_.begin() + (3 * n / 5); + auto second_range_end = this->all_edits_.begin() + (3 * n / 4); + Interval second_range{second_range_begin->key, second_range_end->key}; + second_result_set.drop_key_range_half_open(second_range); MergeCompactor::ResultSet concatenated_result_set = MergeCompactor::ResultSet::concat(std::move(first_result_set), std::move(second_result_set)); - EXPECT_EQ(concatenated_result_set.size(), n); + std::vector concat_edits{this->all_edits_.begin() + (n / 10), + this->all_edits_.begin() + (3 * n / 5)}; + concat_edits.insert(concat_edits.end(), + this->all_edits_.begin() + (3 * n / 4), + this->all_edits_.end()); + this->verify_result_set(concatenated_result_set, concat_edits); +} - usize i = 0; - for (const EditView& edit : concatenated_result_set.get()) { - EXPECT_EQ(edit, all_edits[i]); - ++i; - } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +TEST_F(ResultSetConcatTest, ConcatDeath) +{ + usize n = 200; + this->generate_edits(n, /*needs_sort*/ false); + + std::vector first{this->all_edits_.begin(), this->all_edits_.begin() + (n / 2)}; + std::vector second{this->all_edits_.begin() + (n / 2), this->all_edits_.end()}; + + // We should panic since first and second have overlapping key ranges. + // + EXPECT_DEATH(this->concat(std::move(first), std::move(second), DecayToItem{}), + "All elements in the first ResultSet should be strictly less than the elements in " + "the second ResultSet!"); } } // namespace diff --git a/src/turtle_kv/core/testing/generate.hpp b/src/turtle_kv/core/testing/generate.hpp index c48c112..8ead962 100644 --- a/src/turtle_kv/core/testing/generate.hpp +++ b/src/turtle_kv/core/testing/generate.hpp @@ -200,11 +200,6 @@ class RandomResultSetGenerator : public MinMaxSize for (const KeyView& delete_key : to_delete) { items.emplace_back(delete_key, ValueView::deleted()); } - if (items.size() > n) { - ResultSet result; - result.append(std::move(items)); - return result; - } std::unordered_set deleted_items_set{to_delete.begin(), to_delete.end()}; while (items.size() < n) { diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index 2a2193c..d6e84cf 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -155,28 +155,28 @@ StatusOr> InMemoryLeaf::try_merge( BatchUpdateContext& context, std::unique_ptr sibling) noexcept { + if (sibling->result_set.empty()) { + BATT_CHECK(batt::is_case(this->get_viability())); + return nullptr; + } + if (this->result_set.empty()) { - return {std::move(sibling)}; + BATT_CHECK(batt::is_case(sibling->get_viability())); + this->pinned_leaf_page_ = std::move(sibling->pinned_leaf_page_); + this->result_set = std::move(sibling->result_set); + this->shared_edit_size_totals_ = sibling->shared_edit_size_totals_; + this->edit_size_totals = std::move(sibling->edit_size_totals); + return nullptr; } - auto merged_leaf = - std::make_unique(batt::make_copy(this->pinned_leaf_page_), this->tree_options); + BATT_CHECK_LT(this->get_max_key(), sibling->get_min_key()); - // Concatenate the two leaves' result sets in the correct order. - // - bool right_sibling = this->get_max_key() < sibling->get_min_key(); - if (right_sibling) { - merged_leaf->result_set = - MergeCompactor::ResultSet::concat(std::move(this->result_set), - std::move(sibling->result_set)); - } else { - merged_leaf->result_set = MergeCompactor::ResultSet::concat(std::move(sibling->result_set), - std::move(this->result_set)); - } + this->result_set = MergeCompactor::ResultSet::concat(std::move(this->result_set), + std::move(sibling->result_set)); - merged_leaf->set_edit_size_totals(context.compute_running_total(merged_leaf->result_set)); + this->set_edit_size_totals(context.compute_running_total(this->result_set)); - return {std::move(merged_leaf)}; + return nullptr; } //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index e0d98af..27307dd 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -702,8 +702,6 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i return OkStatus(); } - Subtree& child = this->children[pivot_i]; - // Decide which sibling to merge with. Edge cases: child that needs merge is the leftmost or // rightmost child in the node. // @@ -712,7 +710,7 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i i32 left_sibling = pivot_i - 1; bool need_update_buffer_compaction = false; - u64 active_segmented_levels = this->update_buffer.compute_active_segmented_levels(); + u64 active_segmented_levels = this->update_buffer.compute_active_pivots(); if (pivot_i == 0) { sibling_i = right_sibling; } else if ((usize)pivot_i == this->pivot_count() - 1) { @@ -743,40 +741,72 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i this->tree_options, this->height - 1)); - // Call child.try_merge(). + // Erase rightmost of {child subtree, sibling} in all metadata of the parent. // - Subtree& sibling = this->children[sibling_i]; - BATT_CHECK(batt::is_case(sibling.get_viability())); - StatusOr> status_or_merged = child.try_merge(update_context, sibling); + const i32 right_pivot_i = std::max(pivot_i, sibling_i); + const i32 left_pivot_i = std::min(pivot_i, sibling_i); + const usize old_pivot_count = this->pivot_count(); + + // Call Subtree::try_merge. + // + StatusOr> status_or_merged = + this->children[left_pivot_i].try_merge(update_context, + std::move(this->children[right_pivot_i])); if (!status_or_merged.ok()) { - LOG(ERROR) << BATT_INSPECT(child.get_viability()); + LOG(ERROR) << BATT_INSPECT(status_or_merged.status()); } BATT_REQUIRE_OK(status_or_merged); - if (!*status_or_merged) { - if (!batt::is_case(child.get_viability())) { - // If the full merge wasn't possible, try borrowing from the sibling. - // - BATT_ASSIGN_OK_RESULT(KeyView new_pivot_key, child.try_borrow(update_context, sibling)); + if (*status_or_merged) { + // If try_merge returned a Subtree, a borrow occurred. + // + this->child_pages[left_pivot_i] = llfs::PinnedPage{}; + this->child_pages[right_pivot_i] = llfs::PinnedPage{}; - this->pivot_keys_[std::max(pivot_i, sibling_i)] = new_pivot_key; + // A borrow would have returned the updated right sibling (left sibling was updated in place), + // so overwrite what is currently in this->children. + // + this->children[right_pivot_i] = std::move(**status_or_merged); - BATT_REQUIRE_OK(this->compact_update_buffer_levels(update_context)); + if ((usize)right_pivot_i == old_pivot_count - 1) { + BATT_ASSIGN_OK_RESULT( + this->max_key_, + this->children.back().get_max_key(update_context.page_loader, this->child_pages.back())); } - BATT_CHECK(batt::is_case(child.get_viability())); + + // Compute and store the new pivot key. + // + StatusOr right_child_min_key = + this->children[right_pivot_i].get_min_key(update_context.page_loader, + this->child_pages[right_pivot_i]); + BATT_REQUIRE_OK(right_child_min_key); + StatusOr left_child_max_key = + this->children[left_pivot_i].get_max_key(update_context.page_loader, + this->child_pages[left_pivot_i]); + BATT_REQUIRE_OK(left_child_max_key); + + const KeyView prefix = llfs::find_common_prefix(0, *left_child_max_key, *right_child_min_key); + const KeyView new_pivot_key = right_child_min_key->substr(0, prefix.size() + 1); + this->pivot_keys_[right_pivot_i] = new_pivot_key; + + // Compact the update buffer levels and recompute pending byte counts. + // + BATT_REQUIRE_OK(this->compact_update_buffer_levels(update_context)); + + BATT_CHECK_EQ(this->update_buffer.levels.size(), 1); + BATT_CHECK(batt::is_case(this->update_buffer.levels[0])); + MergedLevel& merged_edits = std::get(this->update_buffer.levels[0]); + + std::fill(this->pending_bytes.begin(), this->pending_bytes.end(), 0); + in_node(*this).update_pending_bytes(update_context.worker_pool, + merged_edits.result_set.get(), + PackedSizeOfEdit{}); + return OkStatus(); } - Subtree& merged_subtree = **status_or_merged; - // Erase rightmost of {child subtree, sibling} in this->child_pages, overwrite leftmost - // with new PinnedPage{}. - // - const i32 pivot_to_erase = std::max(pivot_i, sibling_i); - const i32 pivot_to_overwrite = std::min(pivot_i, sibling_i); - const usize old_pivot_count = this->pivot_count(); - - this->child_pages[pivot_to_overwrite] = llfs::PinnedPage{}; - this->child_pages.erase(this->child_pages.begin() + pivot_to_erase); + this->child_pages[left_pivot_i] = llfs::PinnedPage{}; + this->child_pages.erase(this->child_pages.begin() + right_pivot_i); // Update the update_buffer levels. // @@ -787,33 +817,32 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i if (batt::is_case(level)) { SegmentedLevel& segmented_level = std::get(level); in_segmented_level(*this, segmented_level, update_context.page_loader) - .merge_pivots(pivot_to_overwrite, pivot_to_erase); + .merge_pivots(left_pivot_i, right_pivot_i); } } } - // Update this->children, following same update method as with this->child_pages. + // Update this->children. // - this->children[pivot_to_overwrite] = std::move(merged_subtree); - this->children.erase(this->children.begin() + pivot_to_erase); + this->children.erase(this->children.begin() + right_pivot_i); // Update pending_bytes. The leftmost of {subtree, sibling} should be incremented by the removed // subtree's pending bytes values. Erase the pending bytes of the removed subtree. // - this->pending_bytes[pivot_to_overwrite] += this->pending_bytes[pivot_to_erase]; - this->pending_bytes.erase(this->pending_bytes.begin() + pivot_to_erase); + this->pending_bytes[left_pivot_i] += this->pending_bytes[right_pivot_i]; + this->pending_bytes.erase(this->pending_bytes.begin() + right_pivot_i); - bool is_pending_bytes_exact = get_bit(this->pending_bytes_is_exact, pivot_to_overwrite) & - get_bit(this->pending_bytes_is_exact, pivot_to_erase); + bool is_pending_bytes_exact = get_bit(this->pending_bytes_is_exact, left_pivot_i) & + get_bit(this->pending_bytes_is_exact, right_pivot_i); this->pending_bytes_is_exact = - set_bit(this->pending_bytes_is_exact, pivot_to_overwrite, is_pending_bytes_exact); - this->pending_bytes_is_exact = remove_bit(this->pending_bytes_is_exact, pivot_to_erase); + set_bit(this->pending_bytes_is_exact, left_pivot_i, is_pending_bytes_exact); + this->pending_bytes_is_exact = remove_bit(this->pending_bytes_is_exact, right_pivot_i); // Remove the pivot key of the removed child subtree from this->pivot_keys_. // - this->pivot_keys_.erase(this->pivot_keys_.begin() + pivot_to_erase); + this->pivot_keys_.erase(this->pivot_keys_.begin() + right_pivot_i); - if ((usize)pivot_to_erase == old_pivot_count - 1) { + if ((usize)right_pivot_i == old_pivot_count - 1) { BATT_ASSIGN_OK_RESULT( this->max_key_, this->children.back().get_max_key(update_context.page_loader, this->child_pages.back())); @@ -821,9 +850,9 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // Finally, split the newly merged child if needed. // - SubtreeViability merged_viability = this->children[pivot_to_overwrite].get_viability(); + SubtreeViability merged_viability = this->children[left_pivot_i].get_viability(); if (batt::is_case(merged_viability)) { - BATT_REQUIRE_OK(this->split_child(update_context, pivot_to_overwrite)); + BATT_REQUIRE_OK(this->make_child_viable(update_context, left_pivot_i)); } else { BATT_CHECK(batt::is_case(merged_viability)); } @@ -865,184 +894,183 @@ StatusOr> InMemoryNode::flush_and_shrink(BatchUpdateContext& c //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr> InMemoryNode::try_merge(BatchUpdateContext& context, - InMemoryNode& sibling) noexcept +StatusOr> InMemoryNode::try_merge( + BatchUpdateContext& context, + std::unique_ptr sibling) noexcept { + //----- --- -- - - - - // If merging both full nodes will cause the merged node's pivot count to exceed the max - // possible pivot count, return null so that we can try a borrow. + // possible pivot count, try a borrow. // - if (this->pivot_count() + sibling.pivot_count() > this->max_pivot_count()) { - return nullptr; - } - - auto new_node = std::make_unique(batt::make_copy(this->pinned_node_page_), - this->tree_options, - this->is_size_tiered()); - - const auto concat_metadata = [&](InMemoryNode& left, InMemoryNode& right) { - new_node->max_key_ = right.max_key_; - - new_node->height = left.height; - - new_node->latest_flush_pivot_i_ = None; - - new_node->pending_bytes.insert(new_node->pending_bytes.end(), - left.pending_bytes.begin(), - left.pending_bytes.end()); - new_node->pending_bytes.insert(new_node->pending_bytes.end(), - right.pending_bytes.begin(), - right.pending_bytes.end()); - - new_node->pending_bytes_is_exact = left.pending_bytes_is_exact & right.pending_bytes_is_exact; - - new_node->child_pages.insert(new_node->child_pages.end(), - std::make_move_iterator(left.child_pages.begin()), - std::make_move_iterator(left.child_pages.end())); - new_node->child_pages.insert(new_node->child_pages.end(), - std::make_move_iterator(right.child_pages.begin()), - std::make_move_iterator(right.child_pages.end())); - - new_node->children.insert(new_node->children.end(), - std::make_move_iterator(left.children.begin()), - std::make_move_iterator(left.children.end())); - new_node->children.insert(new_node->children.end(), - std::make_move_iterator(right.children.begin()), - std::make_move_iterator(right.children.end())); - - new_node->pivot_keys_.insert(new_node->pivot_keys_.end(), - left.pivot_keys_.begin(), - left.pivot_keys_.end() - 1); - new_node->pivot_keys_.insert(new_node->pivot_keys_.end(), - right.pivot_keys_.begin(), - right.pivot_keys_.end()); - }; + if (this->pivot_count() + sibling->pivot_count() > this->max_pivot_count()) { + bool borrow_from_sibling = false; + if (batt::is_case(this->get_viability())) { + borrow_from_sibling = true; + } else { + BATT_CHECK(batt::is_case(sibling->get_viability())); + } - const auto merge_update_buffers = [&](InMemoryNode& left, InMemoryNode& right) -> Status { - usize i = 0; - for (; i < left.update_buffer.levels.size(); ++i) { - Level& left_level = left.update_buffer.levels[i]; - BATT_REQUIRE_OK(batt::case_of( // - left_level, // - [&](EmptyLevel&) -> Status { - if (i < right.update_buffer.levels.size()) { - Level& right_level = right.update_buffer.levels[i]; - if (!batt::is_case(right_level)) { - new_node->update_buffer.levels.emplace_back(std::move(right_level)); - } - } + Status borrow_status = borrow_from_sibling ? this->try_borrow(context, *sibling) + : sibling->try_borrow(context, *this); + BATT_REQUIRE_OK(borrow_status); - return OkStatus(); - }, - [&](MergedLevel& left_merged_level) -> Status { - if (i < right.update_buffer.levels.size()) { - BATT_REQUIRE_OK(batt::case_of( - right.update_buffer.levels[i], - [&](EmptyLevel&) -> Status { - new_node->update_buffer.levels.emplace_back(std::move(left_merged_level)); - return OkStatus(); - }, - [&](MergedLevel& right_merged_level) -> Status { - new_node->update_buffer.levels.emplace_back( - left_merged_level.concat(right_merged_level)); - return OkStatus(); - }, - [&](SegmentedLevel& right_segmented_level) -> Status { - // When merging a MergedLevel and a SegmentedLevel, create a new MergedLevel. - // - BATT_ASSIGN_OK_RESULT( - MergedLevel new_merged_level, - UpdateBuffer::concat_segmented_and_merged_level(context, - left_merged_level, - right_segmented_level, - right)); - - new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); - - return OkStatus(); - })); - } else { - new_node->update_buffer.levels.emplace_back(std::move(left_merged_level)); - } + return {std::move(sibling)}; + } - return OkStatus(); - }, - [&](SegmentedLevel& left_segmented_level) -> Status { - if (i < right.update_buffer.levels.size()) { - BATT_REQUIRE_OK(batt::case_of( - right.update_buffer.levels[i], - [&](EmptyLevel&) -> Status { - new_node->update_buffer.levels.emplace_back(std::move(left_segmented_level)); - return OkStatus(); - }, - [&](MergedLevel& right_merged_level) -> Status { - BATT_ASSIGN_OK_RESULT( - MergedLevel new_merged_level, - UpdateBuffer::concat_segmented_and_merged_level(context, - right_merged_level, - left_segmented_level, - left)); - - new_node->update_buffer.levels.emplace_back(std::move(new_merged_level)); - - return OkStatus(); - }, - [&](SegmentedLevel& right_segmented_level) -> Status { - // First shift the right level's bitsets to the left by the number of pivots - // in the left node. - // - usize left_node_pivot_count = left.pivot_count(); - for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); - ++segment_i) { - Segment& segment = right_segmented_level.get_segment(segment_i); - segment.flushed_pivots <<= left_node_pivot_count; - segment.active_pivots <<= left_node_pivot_count; - } - - new_node->update_buffer.levels.emplace_back(std::move(left_segmented_level)); - SegmentedLevel& new_segmented_level = - std::get(new_node->update_buffer.levels.back()); - new_segmented_level.segments.insert( - new_segmented_level.segments.end(), - std::make_move_iterator(right_segmented_level.segments.begin()), - std::make_move_iterator(right_segmented_level.segments.end())); - - return OkStatus(); - })); - } else { - new_node->update_buffer.levels.emplace_back(std::move(left_segmented_level)); + BATT_CHECK_LT(this->get_max_key(), sibling->get_min_key()); + + //----- --- -- - - - - + // Concatenate the update buffers. + // + usize i = 0; + for (; i < this->update_buffer.levels.size(); ++i) { + Level& left_level = this->update_buffer.levels[i]; + BATT_REQUIRE_OK(batt::case_of( // + left_level, // + [&](EmptyLevel&) -> Status { + if (i < sibling->update_buffer.levels.size()) { + Level& right_level = sibling->update_buffer.levels[i]; + if (!batt::is_case(right_level)) { + this->update_buffer.levels[i] = std::move(right_level); } + } - return OkStatus(); - })); - } + return OkStatus(); + }, + [&](MergedLevel& left_merged_level) -> Status { + if (i < sibling->update_buffer.levels.size()) { + BATT_REQUIRE_OK(batt::case_of( + sibling->update_buffer.levels[i], + [](EmptyLevel&) -> Status { + return OkStatus(); + }, + [&](MergedLevel& right_merged_level) -> Status { + this->update_buffer.levels[i] = + std::move(left_merged_level.concat(right_merged_level)); + return OkStatus(); + }, + [&](SegmentedLevel& right_segmented_level) -> Status { + // When merging a MergedLevel and a SegmentedLevel, create a new MergedLevel. + // + BATT_ASSIGN_OK_RESULT( + MergedLevel new_merged_level, + UpdateBuffer::concat_segmented_and_merged_level(context, + left_merged_level, + right_segmented_level, + *sibling)); + + this->update_buffer.levels[i] = std::move(new_merged_level); + + return OkStatus(); + })); + } - // Carry over any remaining levels from the right node's update buffer. - // - for (; i < right.update_buffer.levels.size(); ++i) { - Level& right_level = right.update_buffer.levels[i]; - if (!batt::is_case(right_level)) { - new_node->update_buffer.levels.emplace_back(std::move(right_level)); - } - } + return OkStatus(); + }, + [&](SegmentedLevel& left_segmented_level) -> Status { + if (i < sibling->update_buffer.levels.size()) { + BATT_REQUIRE_OK(batt::case_of( + sibling->update_buffer.levels[i], + [](EmptyLevel&) -> Status { + return OkStatus(); + }, + [&](MergedLevel& right_merged_level) -> Status { + BATT_ASSIGN_OK_RESULT( + MergedLevel new_merged_level, + UpdateBuffer::concat_segmented_and_merged_level(context, + right_merged_level, + left_segmented_level, + *this)); + + this->update_buffer.levels[i] = std::move(new_merged_level); + + return OkStatus(); + }, + [&](SegmentedLevel& right_segmented_level) -> Status { + // First shift the right level's bitsets to the left by the number of pivots + // in the left node. + // + usize left_node_pivot_count = this->pivot_count(); + for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); + ++segment_i) { + Segment& segment = right_segmented_level.get_segment(segment_i); + segment.flushed_pivots <<= left_node_pivot_count; + segment.active_pivots <<= left_node_pivot_count; + } + + left_segmented_level.segments.insert( + left_segmented_level.segments.end(), + std::make_move_iterator(right_segmented_level.segments.begin()), + std::make_move_iterator(right_segmented_level.segments.end())); + + return OkStatus(); + })); + } - return OkStatus(); - }; + return OkStatus(); + })); + } - if (this->get_max_key() < sibling.get_min_key()) { - concat_metadata(*this, sibling); - BATT_REQUIRE_OK(merge_update_buffers(*this, sibling)); - } else { - concat_metadata(sibling, *this); - BATT_REQUIRE_OK(merge_update_buffers(sibling, *this)); + // Carry over any remaining levels from the right node's update buffer. + // + for (; i < sibling->update_buffer.levels.size(); ++i) { + batt::case_of( + sibling->update_buffer.levels[i], + [](EmptyLevel&) { + // do nothing + }, + [&](MergedLevel& right_merged_level) { + this->update_buffer.levels.emplace_back(std::move(right_merged_level)); + }, + [&](SegmentedLevel& right_segmented_level) { + usize left_node_pivot_count = this->pivot_count(); + for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); + ++segment_i) { + Segment& segment = right_segmented_level.get_segment(segment_i); + segment.flushed_pivots <<= left_node_pivot_count; + segment.active_pivots <<= left_node_pivot_count; + } + + this->update_buffer.levels.emplace_back(right_segmented_level); + }); } - return {std::move(new_node)}; + //----- --- -- - - - - + // Then, concatenate the two nodes' metadata. + // + this->max_key_ = sibling->max_key_; + + this->pending_bytes.insert(this->pending_bytes.end(), + sibling->pending_bytes.begin(), + sibling->pending_bytes.end()); + + sibling->pending_bytes_is_exact <<= this->pivot_count(); + this->pending_bytes_is_exact |= sibling->pending_bytes_is_exact; + + this->child_pages.insert(this->child_pages.end(), + std::make_move_iterator(sibling->child_pages.begin()), + std::make_move_iterator(sibling->child_pages.end())); + + // Modify the children Subtree vector after concatenating the update buffers, inserting into the + // vector will cause the pivot count to increase. + // + this->children.insert(this->children.end(), + std::make_move_iterator(sibling->children.begin()), + std::make_move_iterator(sibling->children.end())); + + // Remove the key upper bound for `this`. + // + this->pivot_keys_.pop_back(); + this->pivot_keys_.insert(this->pivot_keys_.end(), + sibling->pivot_keys_.begin(), + sibling->pivot_keys_.end()); + + return nullptr; } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, - InMemoryNode& sibling) noexcept +Status InMemoryNode::try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) noexcept { BATT_CHECK(batt::is_case(sibling.get_viability())); @@ -1051,55 +1079,9 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, BATT_CHECK_LT(this->pivot_count(), 4); u32 num_pivots_to_borrow = 4 - this->pivot_count(); - // Calculate the pivot range to borrow from the sibling, and then extract updates from the - // sibling's update buffer that contain this range. - // - i32 borrowed_min_pivot_i = -1; - KeyView borrowed_max_pivot_key; - if (right_sibling) { - borrowed_min_pivot_i = 0; - borrowed_max_pivot_key = sibling.get_pivot_key(num_pivots_to_borrow); - } else { - borrowed_min_pivot_i = sibling.pivot_count() - num_pivots_to_borrow; - borrowed_max_pivot_key = sibling.get_pivot_key(sibling.pivot_count()); - } - Interval borrowed_pivot_range{sibling.get_pivot_key(borrowed_min_pivot_i), - borrowed_max_pivot_key}; - - BatchUpdate borrowed_pivot_batch{ - .context = context, - .result_set = {}, - .edit_size_totals = None, - }; - - Status segment_load_status; - HasPageRefs has_page_refs{false}; - - BATT_ASSIGN_OK_RESULT( // - borrowed_pivot_batch.result_set, // - context.merge_compact_edits( // - /*max_key=*/borrowed_max_pivot_key, // - [&](MergeCompactor& compactor) -> Status { - sibling.push_levels_to_merge(compactor, - context.page_loader, - segment_load_status, - has_page_refs, - as_slice(sibling.update_buffer.levels), - /*min_pivot_i=*/borrowed_min_pivot_i, - /*only_pivot=*/false); - return OkStatus(); - })); - - BATT_REQUIRE_OK(segment_load_status); - - borrowed_pivot_batch.result_set.drop_key_range_half_open(Interval{ - borrowed_max_pivot_key, - sibling.key_upper_bound(), - }); - - borrowed_pivot_batch.edit_size_totals = None; - - // Borrow node metadata from the sibling. + //----- --- -- - - - - + // Borrow node metadata. Modify all metadata right now except this->children, since modifying it + // will change the pivot count. // if (right_sibling) { this->pending_bytes.insert(this->pending_bytes.end(), @@ -1108,14 +1090,14 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, sibling.pending_bytes.erase(sibling.pending_bytes.begin(), sibling.pending_bytes.begin() + num_pivots_to_borrow); - // Update this->pending_bytes_is_exact by placing the borrowing pending bytes bits from the - // right sibling right after the pending bytes bits for this node. + // Update this->pending_bytes_is_exact by placing the borrowed pending bytes bits from the + // right sibling directly after the pending bytes bits for this node. // - u64 borrowed_pending_bytes_exact = - sibling.pending_bytes_is_exact & ((u64{1} << num_pivots_to_borrow) - 1); - u64 mask = ((u64{1} << num_pivots_to_borrow) - 1) << (this->pivot_count() - 1); - this->pending_bytes_is_exact = (this->pending_bytes_is_exact & ~mask) | - (borrowed_pending_bytes_exact << (this->pivot_count() - 1)); + u64 borrowed_bits = sibling.pending_bytes_is_exact & ((u64{1} << num_pivots_to_borrow) - 1); + u64 mask = ((u64{1} << num_pivots_to_borrow) - 1) << this->pivot_count(); + this->pending_bytes_is_exact = + (this->pending_bytes_is_exact & ~mask) | (borrowed_bits << this->pivot_count()); + sibling.pending_bytes_is_exact >>= num_pivots_to_borrow; // Get rid of the key upper bound in this node and insert the borrowed pivot keys, including // one past num_pivots_to_borrow, to set the new key upper bound. @@ -1133,16 +1115,6 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, std::make_move_iterator(sibling.child_pages.begin() + num_pivots_to_borrow)); sibling.child_pages.erase(sibling.child_pages.begin(), sibling.child_pages.begin() + num_pivots_to_borrow); - - this->children.insert(this->children.end(), - std::make_move_iterator(sibling.children.begin()), - std::make_move_iterator(sibling.children.begin() + num_pivots_to_borrow)); - sibling.children.erase(sibling.children.begin(), - sibling.children.begin() + num_pivots_to_borrow); - - BATT_ASSIGN_OK_RESULT( - this->max_key_, - this->children.back().get_max_key(context.page_loader, this->child_pages.back())); } else { this->pending_bytes.insert(this->pending_bytes.begin(), sibling.pending_bytes.end() - num_pivots_to_borrow, @@ -1153,10 +1125,14 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, // Shift this->pending_bytes_is_exact up by num_pivots_to_borrow, and place the borrowed // pending bytes bits at the lowest order bits. // - u64 borrowed_pending_bytes_exact = - sibling.pending_bytes_is_exact >> (64 - num_pivots_to_borrow); + u64 borrowed_bits = + (sibling.pending_bytes_is_exact >> (sibling.pivot_count() - num_pivots_to_borrow)) & + ((u64{1} << num_pivots_to_borrow) - 1); this->pending_bytes_is_exact <<= num_pivots_to_borrow; - this->pending_bytes_is_exact |= borrowed_pending_bytes_exact; + this->pending_bytes_is_exact |= borrowed_bits; + u64 mask = ((u64{1} << num_pivots_to_borrow) - 1) + << (sibling.pivot_count() - num_pivots_to_borrow); + sibling.pending_bytes_is_exact &= ~mask; sibling.pivot_keys_.pop_back(); this->pivot_keys_.insert(this->pivot_keys_.begin(), @@ -1171,24 +1147,62 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, std::make_move_iterator(sibling.child_pages.end())); sibling.child_pages.erase(sibling.child_pages.end() - num_pivots_to_borrow, sibling.child_pages.end()); - - this->children.insert(this->children.begin(), - std::make_move_iterator(sibling.children.end() - num_pivots_to_borrow), - std::make_move_iterator(sibling.children.end())); - sibling.children.erase(sibling.children.end() - num_pivots_to_borrow, sibling.children.end()); - - BATT_ASSIGN_OK_RESULT( - sibling.max_key_, - sibling.children.back().get_max_key(context.page_loader, sibling.child_pages.back())); } - // Now that metadata has been borrowed, inserted the borrowed updates into the update buffer. + //----- --- -- - - - - + // Modify the update buffers of both `this` and `sibling`. + // Calculate the pivot range to borrow from the sibling, and then extract updates from the + // sibling's update buffer that contain this range. // - BATT_REQUIRE_OK(this->update_buffer_insert(borrowed_pivot_batch)); + i32 borrowed_min_pivot_i = -1; + KeyView borrowed_max_pivot_key; + if (right_sibling) { + borrowed_min_pivot_i = 0; + borrowed_max_pivot_key = sibling.get_pivot_key(num_pivots_to_borrow); + } else { + borrowed_min_pivot_i = sibling.pivot_count() - num_pivots_to_borrow; + borrowed_max_pivot_key = sibling.get_pivot_key(sibling.pivot_count()); + } + Interval borrowed_pivot_range{sibling.get_pivot_key(borrowed_min_pivot_i), + borrowed_max_pivot_key}; + + BatchUpdate borrowed_pivot_batch{ + .context = context, + .result_set = {}, + .edit_size_totals = None, + }; + + Status segment_load_status; + HasPageRefs has_page_refs{false}; + + BATT_ASSIGN_OK_RESULT( // + borrowed_pivot_batch.result_set, // + context.merge_compact_edits( // + /*max_key=*/borrowed_max_pivot_key, // + [&](MergeCompactor& compactor) -> Status { + sibling.push_levels_to_merge(compactor, + context.page_loader, + segment_load_status, + has_page_refs, + as_slice(sibling.update_buffer.levels), + /*min_pivot_i=*/borrowed_min_pivot_i, + /*only_pivot=*/false); + return OkStatus(); + })); + + BATT_REQUIRE_OK(segment_load_status); + + borrowed_pivot_batch.result_set.drop_key_range_half_open(Interval{ + borrowed_max_pivot_key, + sibling.key_upper_bound(), + }); + + borrowed_pivot_batch.edit_size_totals = None; // Adjust the update buffer levels metadata in the sibling now that the borrowed updates have // been extracted. // + usize remove_pivot_i = right_sibling ? 0 : sibling.pivot_count() - num_pivots_to_borrow; for (Level& level : sibling.update_buffer.levels) { batt::case_of( // level, // @@ -1201,40 +1215,66 @@ StatusOr InMemoryNode::try_borrow(BatchUpdateContext& context, [&](SegmentedLevel& segmented_level) { for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { Segment& segment = segmented_level.get_segment(segment_i); - if (right_sibling) { - segment.flushed_pivots >>= num_pivots_to_borrow; - segment.active_pivots >>= num_pivots_to_borrow; - segment.flushed_item_upper_bound_.erase( - segment.flushed_item_upper_bound_.begin(), - segment.flushed_item_upper_bound_.begin() + num_pivots_to_borrow); - } else { - u64 mask = (u64{1} << (64 - num_pivots_to_borrow)) - 1; - segment.flushed_pivots &= mask; - segment.active_pivots &= mask; - segment.flushed_item_upper_bound_.erase( - segment.flushed_item_upper_bound_.end() - num_pivots_to_borrow, - segment.flushed_item_upper_bound_.end()); + // Iterate backwards, since calling `remove_bit` will shift the bitset. + // TODO [vsilai 12-6-2025]: consider writing a `remove_bits` function to modify the bit + // sets more efficiently? This would only be for active_pivots. + // + for (usize j = remove_pivot_i + num_pivots_to_borrow - 1; j >= remove_pivot_i; --j) { + segment.remove_pivot(j); } } }); } - // Calculate and return the new pivot key for the parent. + // Insert the borrowed updates into the update buffer. + // + BATT_REQUIRE_OK(this->update_buffer_insert(borrowed_pivot_batch)); + + usize insert_pivot_i = right_sibling ? this->pivot_count() : 0; + for (Level& level : this->update_buffer.levels) { + batt::case_of( // + level, // + [](EmptyLevel&) { + // nothing to do + }, + [&](MergedLevel& merged_level) { + // nothing to do + }, + [&](SegmentedLevel& segmented_level) { + for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { + Segment& segment = segmented_level.get_segment(segment_i); + for (usize j = insert_pivot_i; j < insert_pivot_i + num_pivots_to_borrow; ++j) { + segment.insert_pivot(j, /*is_active*/false); + } + } + }); + } + + //----- --- -- - - - - + // Finally, update the children Subtree vector for both nodes. // - KeyView left_child_max; - KeyView right_child_min; if (right_sibling) { - left_child_max = this->get_max_key(); - right_child_min = sibling.get_min_key(); + this->children.insert(this->children.end(), + std::make_move_iterator(sibling.children.begin()), + std::make_move_iterator(sibling.children.begin() + num_pivots_to_borrow)); + sibling.children.erase(sibling.children.begin(), + sibling.children.begin() + num_pivots_to_borrow); + + BATT_ASSIGN_OK_RESULT( + this->max_key_, + this->children.back().get_max_key(context.page_loader, this->child_pages.back())); } else { - left_child_max = sibling.get_max_key(); - right_child_min = this->get_min_key(); - } + this->children.insert(this->children.begin(), + std::make_move_iterator(sibling.children.end() - num_pivots_to_borrow), + std::make_move_iterator(sibling.children.end())); + sibling.children.erase(sibling.children.end() - num_pivots_to_borrow, sibling.children.end()); - const KeyView prefix = llfs::find_common_prefix(0, left_child_max, right_child_min); - const KeyView new_sibling_pivot_key = right_child_min.substr(0, prefix.size() + 1); + BATT_ASSIGN_OK_RESULT( + sibling.max_key_, + sibling.children.back().get_max_key(context.page_loader, sibling.child_pages.back())); + } - return new_sibling_pivot_key; + return OkStatus(); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - @@ -2331,7 +2371,7 @@ SmallFn InMemoryNode::UpdateBuffer::dump() const //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -u64 InMemoryNode::UpdateBuffer::compute_active_segmented_levels() const +u64 InMemoryNode::UpdateBuffer::compute_active_pivots() const { u64 active_pivots = 0; for (const Level& level : this->levels) { diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 5cf41b7..87d9f0f 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -357,7 +357,7 @@ struct InMemoryNode { SmallFn dump() const; - u64 compute_active_segmented_levels() const; + u64 compute_active_pivots() const; usize count_non_empty_levels() const { @@ -546,16 +546,19 @@ struct InMemoryNode { */ Status try_flush(BatchUpdateContext& context); - /** \brief Merge the node with one of its siblings and return the newly merged node. + /** \brief Merge the node in place with its right sibling. + * + * Returns nullptr if `sibling` is completely consumed; otherwise, returns the modified sibling + * since a borrow occurred. */ StatusOr> try_merge(BatchUpdateContext& context, - InMemoryNode& sibling) noexcept; + std::unique_ptr sibling) noexcept; - /** \brief Attempts to make the node (that needs a merge) viable by borrowing data - * from one of its siblings. If successful, returns the new pivot key to be set in the parent - * of these two nodes to separate them. + /** \brief Attempts to make `this` (which needs a merge) viable by borrowing data + * from one of its siblings. Note that for this function, `sibling` does not have to be the right + * sibling. Both `this` and `sibling` are modified in place. */ - StatusOr try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) noexcept; + Status try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) noexcept; /** \brief Splits the specified child, inserting a new pivot immediately after `pivot_i`. */ diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 8e9f1db..5bae61c 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -662,8 +662,6 @@ TEST(InMemoryNodeTest, SubtreeDeletions) << BATT_INSPECT(i); if (((i + 1) % chi) == 0) { - LOG(INFO) << "Taking checkpoint..."; - std::unique_ptr page_job = page_cache->new_job(); TreeSerializeContext context{tree_options, *page_job, worker_pool}; diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index fb85510..9b787a9 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -585,7 +585,7 @@ StatusOr> Subtree::try_split(BatchUpdateContext& context) //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // StatusOr> Subtree::try_merge(BatchUpdateContext& context, - Subtree& sibling) noexcept + Subtree&& sibling) noexcept { BATT_CHECK(!this->locked_.load()); @@ -598,7 +598,7 @@ StatusOr> Subtree::try_merge(BatchUpdateContext& context, return {batt::StatusCode::kUnimplemented}; }, - [&](const std::unique_ptr& leaf) -> StatusOr> { + [&](std::unique_ptr& leaf) -> StatusOr> { BATT_CHECK(batt::is_case>(sibling.impl_)); auto& sibling_leaf_ptr = std::get>(sibling.impl_); BATT_CHECK(sibling_leaf_ptr); @@ -606,16 +606,18 @@ StatusOr> Subtree::try_merge(BatchUpdateContext& context, BATT_ASSIGN_OK_RESULT(std::unique_ptr merged_leaf, // leaf->try_merge(context, std::move(sibling_leaf_ptr))); - return {Subtree{std::move(merged_leaf)}}; + BATT_CHECK_EQ(merged_leaf, nullptr); + + return Optional{None}; }, - [&](const std::unique_ptr& node) -> StatusOr> { + [&](std::unique_ptr& node) -> StatusOr> { BATT_CHECK(batt::is_case>(sibling.impl_)); auto& sibling_node_ptr = std::get>(sibling.impl_); BATT_CHECK(sibling_node_ptr); BATT_ASSIGN_OK_RESULT(std::unique_ptr merged_node, // - node->try_merge(context, *sibling_node_ptr)); + node->try_merge(context, std::move(sibling_node_ptr))); if (merged_node == nullptr) { return Optional{None}; @@ -625,32 +627,6 @@ StatusOr> Subtree::try_merge(BatchUpdateContext& context, }); } -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -StatusOr Subtree::try_borrow(BatchUpdateContext& context, Subtree& sibling) noexcept -{ - BATT_CHECK(!this->locked_.load()); - - return batt::case_of( - this->impl_, - - [&](const llfs::PageIdSlot& page_id_slot [[maybe_unused]]) -> StatusOr { - return {batt::StatusCode::kUnimplemented}; - }, - - [&](const std::unique_ptr& leaf [[maybe_unused]]) -> StatusOr { - return {batt::StatusCode::kUnimplemented}; - }, - - [&](const std::unique_ptr& node) -> StatusOr { - BATT_CHECK(batt::is_case>(sibling.impl_)); - auto& sibling_node_ptr = std::get>(sibling.impl_); - BATT_CHECK(sibling_node_ptr); - - return node->try_borrow(context, *sibling_node_ptr); - }); -} - //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // Status Subtree::try_flush(BatchUpdateContext& context) diff --git a/src/turtle_kv/tree/subtree.hpp b/src/turtle_kv/tree/subtree.hpp index 31144ea..983aaf7 100644 --- a/src/turtle_kv/tree/subtree.hpp +++ b/src/turtle_kv/tree/subtree.hpp @@ -138,18 +138,13 @@ class Subtree */ StatusOr> try_split(BatchUpdateContext& context); - /** \brief Attempts to merge the given Subtree with one of its siblings. If successful, the - * newly merged Subtree is returned. - * - * If no merge, returns None. - */ - StatusOr> try_merge(BatchUpdateContext& context, Subtree& sibling) noexcept; - - /** \brief Attempts to make the Subtree viable by borrowing data from one of its siblings. - * Called when the Subtree needs a merge, but borrowing is the only option to make the tree - * viable. + /** \brief Attempts to merge the given Subtree in place with its right sibling. + * + * If the in place merge is successful, `sibling` is completely consumed and `None` is returned. + * + * If a borrow needs to occur, `this` is modified in place and the modified sibling is returned. */ - StatusOr try_borrow(BatchUpdateContext& context, Subtree& sibling) noexcept; + StatusOr> try_merge(BatchUpdateContext& context, Subtree&& sibling) noexcept; /** \brief Attempt to make the root viable by flushing a batch. */ From 9e36ed9bad195fe586ff6d7975655e15e8ec9afe Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 16 Dec 2025 07:51:51 -0500 Subject: [PATCH 13/48] More Subtree fixes --- src/turtle_kv/tree/in_memory_leaf.cpp | 97 ++++++++++++ src/turtle_kv/tree/in_memory_leaf.hpp | 10 ++ src/turtle_kv/tree/in_memory_node.cpp | 70 ++++----- src/turtle_kv/tree/in_memory_node.hpp | 10 +- src/turtle_kv/tree/subtree.cpp | 189 +++++++++++------------ src/turtle_kv/tree/subtree.hpp | 21 ++- src/turtle_kv/tree/subtree_viability.hpp | 15 ++ 7 files changed, 261 insertions(+), 151 deletions(-) diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index d6e84cf..829b373 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -8,6 +8,74 @@ namespace turtle_kv { +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +/*static*/ std::unique_ptr InMemoryLeaf::unpack( + llfs::PinnedPage&& pinned_leaf_page, + const TreeOptions& tree_options, + const PackedLeafPage& packed_leaf, + batt::WorkerPool& worker_pool) noexcept +{ + std::unique_ptr new_leaf = + std::make_unique(batt::make_copy(pinned_leaf_page), tree_options); + + const batt::TaskCount max_tasks{worker_pool.size() + 1}; + + Slice packed_items = packed_leaf.items_slice(); + std::vector buffer; + buffer.reserve(packed_items.size()); + + if (max_tasks == 1) { + for (const PackedKeyValue& pkv : packed_items) { + buffer.emplace_back(to_edit_view(pkv)); + } + } else { + const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); + + const auto src_begin = packed_items.begin(); + const auto src_end = packed_items.end(); + const auto dst_begin = buffer.begin(); + + const batt::WorkSlicePlan plan{batt::WorkSliceParams{ + algo_defaults.copy_edits.min_task_size, + max_tasks, + }, + src_begin, + src_end}; + + BATT_CHECK_GT(plan.n_tasks, 0); + + { + batt::ScopedWorkContext work_context{worker_pool}; + + BATT_CHECK_OK(slice_work(work_context, + plan, + /*gen_work_fn=*/ + [&](usize /*task_index*/, isize task_offset, isize task_size) { + return [src_begin, dst_begin, task_offset, task_size] { + auto task_src_begin = std::next(src_begin, task_offset); + auto task_src_end = std::next(task_src_begin, task_size); + auto task_dst_begin = std::next(dst_begin, task_offset); + + for (; task_src_begin != task_src_end; ++task_src_begin) { + *task_dst_begin = to_edit_view(*task_src_begin); + ++task_dst_begin; + } + }; + })) + << "work_context must not be closed!"; + } + } + + MergeCompactor::ResultSet result_set; + result_set.append(std::move(buffer)); + new_leaf->result_set = std::move(result_set); + + new_leaf->set_edit_size_totals(compute_running_total(worker_pool, new_leaf->result_set)); + + return {std::move(new_leaf)}; +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // SubtreeViability InMemoryLeaf::get_viability() @@ -179,6 +247,35 @@ StatusOr> InMemoryLeaf::try_merge( return nullptr; } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Status InMemoryLeaf::apply_batch_update(BatchUpdate& update, + Optional>&& current_result_set) noexcept +{ + if (current_result_set) { + // A valid BoxedSeq was passed in. Merge compact this sequence with the incoming + // update. + // + BATT_ASSIGN_OK_RESULT(this->result_set, + update.context.merge_compact_edits( + global_max_key(), + [&](MergeCompactor& compactor) -> Status { + compactor.push_level(update.result_set.live_edit_slices()); + compactor.push_level(std::move(*current_result_set)); + return OkStatus(); + })); + } else { + // If nothing was passed in, we have a new leaf being populated for the first time (empty tree). + // + this->result_set = update.context.decay_batch_to_items(update.result_set); + } + + this->result_set.update_has_page_refs(update.result_set.has_page_refs()); + this->set_edit_size_totals(update.context.compute_running_total(this->result_set)); + + return OkStatus(); +} + //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/in_memory_leaf.hpp b/src/turtle_kv/tree/in_memory_leaf.hpp index ca26bd7..5e24c8a 100644 --- a/src/turtle_kv/tree/in_memory_leaf.hpp +++ b/src/turtle_kv/tree/in_memory_leaf.hpp @@ -40,6 +40,13 @@ struct InMemoryLeaf { //+++++++++++-+-+--+----- --- -- - - - - + static std::unique_ptr unpack(llfs::PinnedPage&& pinned_leaf_page, + const TreeOptions& tree_options, + const PackedLeafPage& packed_leaf, + batt::WorkerPool& worker_pool) noexcept; + + //+++++++++++-+-+--+----- --- -- - - - - + explicit InMemoryLeaf(llfs::PinnedPage&& pinned_leaf_page, const TreeOptions& tree_options_arg) noexcept : pinned_leaf_page_{std::move(pinned_leaf_page)} @@ -95,6 +102,9 @@ struct InMemoryLeaf { StatusOr> try_merge(BatchUpdateContext& context, std::unique_ptr sibling) noexcept; + Status apply_batch_update(BatchUpdate& update, + Optional>&& current_result_set) noexcept; + Status start_serialize(TreeSerializeContext& context); StatusOr finish_serialize(TreeSerializeContext& context); diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 27307dd..00c399b 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -216,9 +216,12 @@ Status InMemoryNode::apply_batch_update(BatchUpdate& update, // BATT_REQUIRE_OK(this->update_buffer_insert(update)); - // Check for flush. + // Check for flush. If a flush is not necessary, batt::StatusCode::kUnavailable is returned. // - BATT_REQUIRE_OK(this->flush_if_necessary(update.context)); + Status flush_status = this->flush_if_necessary(update.context); + if (flush_status != OkStatus() && flush_status != batt::StatusCode::kUnavailable) { + return flush_status; + } // We don't need to check whether _this_ node needs to be split; the caller will take care of // that! @@ -352,6 +355,10 @@ Status InMemoryNode::flush_if_necessary(BatchUpdateContext& context, bool force_ // const MaxPendingBytes max_pending = this->find_max_pending(); + if (!max_pending.byte_count) { + return {batt::StatusCode::kUnavailable}; + } + const bool flush_needed = force_flush || // (max_pending.byte_count >= this->tree_options.min_flush_size()) || // this->has_too_many_tiers(); @@ -690,15 +697,25 @@ Status InMemoryNode::split_child(BatchUpdateContext& update_context, i32 pivot_i return OkStatus(); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Subtree InMemoryNode::try_shrink() +{ + BATT_CHECK_EQ(this->children.size(), 1); + BATT_CHECK_EQ(this->pending_bytes.size(), 1); + BATT_CHECK_EQ(this->pending_bytes[0], 0); + + return {std::move(this->children[0])}; +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i) noexcept { - // Special case: we have a tree composed of one node (root) and one leaf (its only child). - // In this case, don't progress with the rest of the function, as we are in the middle of a - // flush and shrink. + // If there are no siblings to merge with, we must be in the middle of collapsing the tree + // (flush and shrink). // - if (this->height == 2 && this->pivot_count() == 1) { + if (this->pivot_count() == 1) { return OkStatus(); } @@ -737,9 +754,10 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i need_update_buffer_compaction = true; } - BATT_REQUIRE_OK(this->children[sibling_i].to_in_memory_subtree(update_context, - this->tree_options, - this->height - 1)); + BATT_REQUIRE_OK(this->children[sibling_i].unpack_if_necessary(update_context.page_loader, + update_context.worker_pool, + this->tree_options, + this->height - 1)); // Erase rightmost of {child subtree, sibling} in all metadata of the parent. // @@ -860,38 +878,6 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i return OkStatus(); } -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -StatusOr> InMemoryNode::flush_and_shrink(BatchUpdateContext& context) noexcept -{ - // If more than one pivot exists, nothings needs to be done. - // - usize pivot_count = this->pivot_count(); - if (pivot_count > 1) { - return None; - } - - const i32 single_pivot_i = 0; - BATT_CHECK_EQ(this->pending_bytes.size(), 1); - usize pending_bytes_count = this->pending_bytes[single_pivot_i]; - - // Flush until we have nothing left in the update buffer or until we gain more pivots. - // - while (pivot_count == 1 && pending_bytes_count > 0) { - BATT_REQUIRE_OK(this->flush_to_pivot(context, single_pivot_i)); - pivot_count = this->pivot_count(); - pending_bytes_count = this->pending_bytes[single_pivot_i]; - } - - // If still only one pivot remains, return the child. - // - if (pivot_count == 1) { - return std::move(this->children[single_pivot_i]); - } else { - return None; - } -} - //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // StatusOr> InMemoryNode::try_merge( @@ -1244,7 +1230,7 @@ Status InMemoryNode::try_borrow(BatchUpdateContext& context, InMemoryNode& sibli for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { Segment& segment = segmented_level.get_segment(segment_i); for (usize j = insert_pivot_i; j < insert_pivot_i + num_pivots_to_borrow; ++j) { - segment.insert_pivot(j, /*is_active*/false); + segment.insert_pivot(j, /*is_active*/ false); } } }); diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 87d9f0f..15ad720 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -546,6 +546,10 @@ struct InMemoryNode { */ Status try_flush(BatchUpdateContext& context); + /** \brief Attempt to collapse one level of the tree. Returns the node's single pivot. + */ + Subtree try_shrink(); + /** \brief Merge the node in place with its right sibling. * * Returns nullptr if `sibling` is completely consumed; otherwise, returns the modified sibling @@ -568,12 +572,6 @@ struct InMemoryNode { */ Status merge_child(BatchUpdateContext& update_context, i32 pivot_i) noexcept; - /** \brief If the node has a single pivot, attempts to flush updates out of the update buffer - * to grow the number of pivots. If all the updates are flushed and still only a single pivot - * remains, the single pivot (child) is returned. - */ - StatusOr> flush_and_shrink(BatchUpdateContext& context) noexcept; - /** \brief Returns true iff there are no MergedLevels or unserialized Subtree children in this * node. */ diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 9b787a9..9650952 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -139,11 +139,7 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, auto new_leaf = std::make_unique(llfs::PinnedPage{}, tree_options); - new_leaf->result_set = std::move(update.context.decay_batch_to_items(update.result_set)); - - new_leaf->set_edit_size_totals( - update.context.compute_running_total(new_leaf->result_set)); - update.edit_size_totals = None; + BATT_REQUIRE_OK(new_leaf->apply_batch_update(update, /*current_result_set*/ None)); return Subtree{std::move(new_leaf)}; } @@ -172,18 +168,9 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, auto new_leaf = std::make_unique(batt::make_copy(pinned_page), tree_options); - BATT_ASSIGN_OK_RESULT( // - new_leaf->result_set, - update.context.merge_compact_edits( // - global_max_key(), - [&](MergeCompactor& compactor) -> Status { - compactor.push_level(update.result_set.live_edit_slices()); - compactor.push_level(packed_leaf.as_edit_slice_seq()); - return OkStatus(); - })); - - new_leaf->set_edit_size_totals( - update.context.compute_running_total(new_leaf->result_set)); + BATT_REQUIRE_OK( + new_leaf->apply_batch_update(update, + /*current_result_set*/ packed_leaf.as_edit_slice_seq())); return Subtree{std::move(new_leaf)}; @@ -211,19 +198,9 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, BATT_CHECK_EQ(parent_height, 2); - BATT_ASSIGN_OK_RESULT( - in_memory_leaf->result_set, - update.context.merge_compact_edits( - global_max_key(), - [&](MergeCompactor& compactor) -> Status { - compactor.push_level(update.result_set.live_edit_slices()); - compactor.push_level(in_memory_leaf->result_set.live_edit_slices()); - return OkStatus(); - })); - - in_memory_leaf->result_set.update_has_page_refs(update.result_set.has_page_refs()); - in_memory_leaf->set_edit_size_totals( - update.context.compute_running_total(in_memory_leaf->result_set)); + BATT_REQUIRE_OK(in_memory_leaf->apply_batch_update( + update, + /*current_result_set*/ in_memory_leaf->result_set.live_edit_slices())); return Subtree{std::move(in_memory_leaf)}; }, @@ -253,6 +230,8 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, return OkStatus(); }, [&](NeedsSplit needs_split) { + // TODO [vsilai 12-9-2025]: revist when VLDB changes are merged in. + // if (needs_split.too_many_segments && !needs_split.too_many_pivots && !needs_split.keys_too_large) { Status flush_status = new_subtree->try_flush(update.context); @@ -270,8 +249,12 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, return status; }, [&](const NeedsMerge& needs_merge) { - // Only perform a shrink if the root has a single pivot. + // Only perform a flush and shrink if the root has a single pivot. // + if (!needs_merge.single_pivot) { + return OkStatus(); + } + Status status = new_subtree->flush_and_shrink(update.context); if (!status.ok()) { @@ -322,44 +305,44 @@ Status Subtree::flush_and_shrink(BatchUpdateContext& context) noexcept { BATT_CHECK(!this->locked_.load()); - return batt::case_of( - this->impl_, - - [&](const llfs::PageIdSlot& page_id_slot [[maybe_unused]]) -> Status { - return {batt::StatusCode::kUnimplemented}; - }, - - [&](const std::unique_ptr& leaf [[maybe_unused]]) -> Status { - return OkStatus(); - }, - - [&](std::unique_ptr& node) -> Status { - StatusOr> status_or_new_root = node->flush_and_shrink(context); + BATT_CHECK(!this->is_serialized()); - BATT_REQUIRE_OK(status_or_new_root); + while (!is_root_viable(this->get_viability())) { + // First, try flushing. If flushing makes the root viable, return immediately. + // + Status flush_status = this->try_flush(context); + if (flush_status != OkStatus() && flush_status != batt::StatusCode::kUnavailable) { + return flush_status; + } - if (!*status_or_new_root) { - return OkStatus(); - } + SubtreeViability current_viability = this->get_viability(); + if (is_root_viable(current_viability)) { + break; + } - if (batt::is_case>((*status_or_new_root)->impl_)) { - const auto& leaf_ptr = - std::get>((*status_or_new_root)->impl_); - BATT_CHECK(leaf_ptr); + // Nothing was available to flush since the node's update buffer is empty. Try collapsing one + // level of the tree. + // + if (flush_status == batt::StatusCode::kUnavailable) { + // Note: At this point, we must have a node and not a leaf, since the `is_root_viable` check + // above will return Viable` for a leaf and we break out of the loop in that case. + // + BATT_REQUIRE_OK(this->try_shrink()); + } + } - // If the new root that is returned is an empty leaf, set the root to be an empty - // subtree. - // - if (!leaf_ptr->get_items_size()) { - this->impl_ = llfs::PageIdSlot::from_page_id(llfs::PageId{}); - return OkStatus(); - } - } + // If the root is a leaf and there are no items in the leaf, set the root to be an empty subtree. + // + if (batt::is_case>(this->impl_)) { + std::unique_ptr& root_leaf = std::get>(this->impl_); + BATT_CHECK(root_leaf); - this->impl_ = std::move((*status_or_new_root)->impl_); + if (!root_leaf->get_item_count()) { + this->impl_ = llfs::PageIdSlot::from_page_id(llfs::PageId{}); + } + } - return OkStatus(); - }); + return OkStatus(); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - @@ -598,32 +581,21 @@ StatusOr> Subtree::try_merge(BatchUpdateContext& context, return {batt::StatusCode::kUnimplemented}; }, - [&](std::unique_ptr& leaf) -> StatusOr> { - BATT_CHECK(batt::is_case>(sibling.impl_)); - auto& sibling_leaf_ptr = std::get>(sibling.impl_); - BATT_CHECK(sibling_leaf_ptr); - - BATT_ASSIGN_OK_RESULT(std::unique_ptr merged_leaf, // - leaf->try_merge(context, std::move(sibling_leaf_ptr))); + [&](auto& in_memory) -> StatusOr> { + using PtrT = std::decay_t; - BATT_CHECK_EQ(merged_leaf, nullptr); - - return Optional{None}; - }, + BATT_CHECK(batt::is_case(sibling.impl_)); + auto& sibling_ptr = std::get(sibling.impl_); + BATT_CHECK(sibling_ptr); - [&](std::unique_ptr& node) -> StatusOr> { - BATT_CHECK(batt::is_case>(sibling.impl_)); - auto& sibling_node_ptr = std::get>(sibling.impl_); - BATT_CHECK(sibling_node_ptr); + BATT_ASSIGN_OK_RESULT(PtrT merged_subtree, + in_memory->try_merge(context, std::move(sibling_ptr))); - BATT_ASSIGN_OK_RESULT(std::unique_ptr merged_node, // - node->try_merge(context, std::move(sibling_node_ptr))); - - if (merged_node == nullptr) { + if (merged_subtree == nullptr) { return Optional{None}; } - return {Subtree{std::move(merged_node)}}; + return {Subtree{std::move(merged_subtree)}}; }); } @@ -641,7 +613,7 @@ Status Subtree::try_flush(BatchUpdateContext& context) }, [&](const std::unique_ptr& leaf [[maybe_unused]]) -> Status { - return OkStatus(); + return {batt::StatusCode::kUnavailable}; }, [&](const std::unique_ptr& node) -> Status { @@ -649,6 +621,34 @@ Status Subtree::try_flush(BatchUpdateContext& context) }); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Status Subtree::try_shrink() noexcept +{ + BATT_CHECK(!this->locked_.load()); + + StatusOr new_root = batt::case_of( + this->impl_, + + [&](const llfs::PageIdSlot& page_id_slot [[maybe_unused]]) -> StatusOr { + return {batt::StatusCode::kUnimplemented}; + }, + + [&](const std::unique_ptr& leaf [[maybe_unused]]) -> StatusOr { + return {batt::StatusCode::kUnavailable}; + }, + + [&](const std::unique_ptr& node) -> StatusOr { + return node->try_shrink(); + }); + + BATT_REQUIRE_OK(new_root); + + this->impl_ = std::move(new_root->impl_); + + return OkStatus(); +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // llfs::PackedPageId Subtree::packed_page_id_or_panic() const @@ -748,9 +748,10 @@ void Subtree::lock() //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -Status Subtree::to_in_memory_subtree(BatchUpdateContext& context, - const TreeOptions& tree_options, - i32 height) noexcept +Status Subtree::unpack_if_necessary(llfs::PageLoader& page_loader, + batt::WorkerPool& worker_pool, + const TreeOptions& tree_options, + i32 height) noexcept { BATT_CHECK_GT(height, 0); @@ -762,7 +763,7 @@ Status Subtree::to_in_memory_subtree(BatchUpdateContext& context, const llfs::PageLayoutId expected_layout = Subtree::expected_layout_for_height(height); StatusOr status_or_pinned_page = page_id_slot.load_through( - context.page_loader, + page_loader, llfs::PageLoadOptions{ expected_layout, llfs::PinPageToJob::kDefault, @@ -775,16 +776,12 @@ Status Subtree::to_in_memory_subtree(BatchUpdateContext& context, llfs::PinnedPage& pinned_page = *status_or_pinned_page; if (height == 1) { - auto new_leaf = std::make_unique(batt::make_copy(pinned_page), tree_options); const PackedLeafPage& packed_leaf = PackedLeafPage::view_of(pinned_page); - std::vector items; - for (const PackedKeyValue& pkv : packed_leaf.items_slice()) { - items.emplace_back(to_edit_view(pkv)); - } - new_leaf->result_set.append(std::move(items)); - - new_leaf->set_edit_size_totals(context.compute_running_total(new_leaf->result_set)); + std::unique_ptr new_leaf = InMemoryLeaf::unpack(batt::make_copy(pinned_page), + tree_options, + packed_leaf, + worker_pool); this->impl_ = std::move(new_leaf); } else { diff --git a/src/turtle_kv/tree/subtree.hpp b/src/turtle_kv/tree/subtree.hpp index 983aaf7..1e3981c 100644 --- a/src/turtle_kv/tree/subtree.hpp +++ b/src/turtle_kv/tree/subtree.hpp @@ -139,17 +139,22 @@ class Subtree StatusOr> try_split(BatchUpdateContext& context); /** \brief Attempts to merge the given Subtree in place with its right sibling. - * + * * If the in place merge is successful, `sibling` is completely consumed and `None` is returned. - * + * * If a borrow needs to occur, `this` is modified in place and the modified sibling is returned. */ StatusOr> try_merge(BatchUpdateContext& context, Subtree&& sibling) noexcept; - /** \brief Attempt to make the root viable by flushing a batch. + /** \brief Attempt to make the root viable by flushing a batch. If nothing is available to + * flush, returns batt::StatusCode::kUnavailable. */ Status try_flush(BatchUpdateContext& context); + /** \brief Attempt to collapse a level of the tree. + */ + Status try_shrink() noexcept; + /** \brief Returns true iff this Subtree has no in-memory modifications. */ bool is_serialized() const; @@ -180,11 +185,13 @@ class Subtree */ bool is_locked() const; - /** \brief Converts a serialized Subtree to its in-memory equivalent. + /** \brief Converts a serialized Subtree to its in-memory equivalent, modifying the Subtree in + * place. If the Subtree is already an in-memory type, this function does nothing. */ - Status to_in_memory_subtree(BatchUpdateContext& context, - const TreeOptions& tree_options, - i32 height) noexcept; + Status unpack_if_necessary(llfs::PageLoader& page_loader, + batt::WorkerPool& worker_pool, + const TreeOptions& tree_options, + i32 height) noexcept; //+++++++++++-+-+--+----- --- -- - - - - private: diff --git a/src/turtle_kv/tree/subtree_viability.hpp b/src/turtle_kv/tree/subtree_viability.hpp index d2984e0..de00f60 100644 --- a/src/turtle_kv/tree/subtree_viability.hpp +++ b/src/turtle_kv/tree/subtree_viability.hpp @@ -108,4 +108,19 @@ inline bool compacting_levels_might_fix(const SubtreeViability& viability) }); } +inline bool is_root_viable(const SubtreeViability& viability) +{ + return batt::case_of( + viability, + [](const Viable&) { + return true; + }, + [](const NeedsSplit&) { + return false; + }, + [](const NeedsMerge& needs_merge) { + return !needs_merge.single_pivot; + }); +} + } // namespace turtle_kv From 3b0b16fd5ecc470c6e684c38fe727324708fede2 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 16 Dec 2025 09:23:57 -0500 Subject: [PATCH 14/48] Fix decay to items bug --- src/turtle_kv/tree/batch_update.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turtle_kv/tree/batch_update.hpp b/src/turtle_kv/tree/batch_update.hpp index 2b5150b..d76c0e6 100644 --- a/src/turtle_kv/tree/batch_update.hpp +++ b/src/turtle_kv/tree/batch_update.hpp @@ -129,7 +129,7 @@ inline MergeCompactor::ResultSet BatchUpdateContext::de decayed_items.emplace_back(EditView::from_item_view(*maybe_item)); } } - } else { + } else if (batch.size() > 0) { const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); auto actual_edits = batch.get(); From 238216913a3acae8627633be02a0c304441024de Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 6 Jan 2026 10:20:56 -0500 Subject: [PATCH 15/48] InMemoryLeaf updates --- src/turtle_kv/tree/batch_update.hpp | 120 ------------ src/turtle_kv/tree/in_memory_leaf.cpp | 258 ++++++++++++++++++-------- src/turtle_kv/tree/in_memory_leaf.hpp | 29 +-- src/turtle_kv/tree/subtree.cpp | 11 +- 4 files changed, 204 insertions(+), 214 deletions(-) diff --git a/src/turtle_kv/tree/batch_update.hpp b/src/turtle_kv/tree/batch_update.hpp index d76c0e6..838a72a 100644 --- a/src/turtle_kv/tree/batch_update.hpp +++ b/src/turtle_kv/tree/batch_update.hpp @@ -42,12 +42,6 @@ struct BatchUpdateContext { { return ::turtle_kv::compute_running_total(this->worker_pool, result_set); } - - /** \brief Returns a `ResultSet` with only the edits from the batch passed into the function - * that decay to base-level items (e.g., no tombstones). - */ - MergeCompactor::ResultSet decay_batch_to_items( - MergeCompactor::ResultSet& batch); }; //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- @@ -114,118 +108,4 @@ inline StatusOr> BatchUpdateContext::me return compactor.read(edit_buffer, max_key); } -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -inline MergeCompactor::ResultSet BatchUpdateContext::decay_batch_to_items( - MergeCompactor::ResultSet& batch) -{ - const batt::TaskCount max_tasks{this->worker_pool.size() + 1}; - std::vector decayed_items; - - if (max_tasks == 1) { - for (const EditView& edit : batch.get()) { - Optional maybe_item = to_item_view(edit); - if (maybe_item) { - decayed_items.emplace_back(EditView::from_item_view(*maybe_item)); - } - } - } else if (batch.size() > 0) { - const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); - - auto actual_edits = batch.get(); - const auto src_begin = actual_edits.begin(); - const auto src_end = actual_edits.end(); - - const batt::WorkSlicePlan plan{batt::WorkSliceParams{ - algo_defaults.copy_decayed_items.min_task_size, - max_tasks, - }, - src_begin, - src_end}; - - BATT_CHECK_GT(plan.n_tasks, 0); - - batt::SmallVec output_size_per_shard(plan.n_tasks); - BATT_CHECK_EQ(output_size_per_shard.size(), plan.n_tasks); - - // First count the number of non-decayed items in the output for each shard. - { - batt::ScopedWorkContext work_context{this->worker_pool}; - - BATT_CHECK_OK(batt::slice_work( - work_context, - plan, - /*gen_work_fn=*/ - [&](usize task_index, isize task_offset, isize task_size) { - return [src_begin, task_index, task_offset, task_size, &output_size_per_shard] { - BATT_CHECK_LT(task_index, output_size_per_shard.size()); - - auto task_src_begin = std::next(src_begin, task_offset); - const auto task_src_end = std::next(task_src_begin, task_size); - - usize output_size = 0; - - for (; task_src_begin != task_src_end; ++task_src_begin) { - if (decays_to_item(*task_src_begin)) { - output_size += 1; - } - } - output_size_per_shard[task_index] = output_size; - }; - })) - << "worker_pool must not be closed!"; - } - - // Change to a rolling sum and do the actual copy. - // - usize output_total_size = 0; - batt::SmallVec output_shard_offset; - for (usize output_shard_size : output_size_per_shard) { - output_shard_offset.emplace_back(output_total_size); - output_total_size += output_shard_size; - } - - decayed_items.resize(output_total_size); - { - this->worker_pool.reset(); - - batt::ScopedWorkContext work_context{this->worker_pool}; - - BATT_CHECK_OK( - batt::slice_work(work_context, - plan, - /*gen_work_fn=*/ - [&](usize task_index, isize task_offset, isize task_size) { - return [src_begin, - &output_shard_offset, - &output_size_per_shard, - task_index, - task_offset, - task_size, - &decayed_items] { - auto task_src_begin = std::next(src_begin, task_offset); - const auto task_src_end = std::next(task_src_begin, task_size); - - BATT_CHECK_LT(task_index, output_shard_offset.size()); - auto task_dst_begin = - std::next(decayed_items.data(), output_shard_offset[task_index]); - - for (; task_src_begin != task_src_end; ++task_src_begin) { - Optional maybe_item = to_item_view(*task_src_begin); - if (maybe_item) { - *task_dst_begin = EditView::from_item_view(*maybe_item); - ++task_dst_begin; - } - } - }; - })) - << "worker_pool must not be closed!"; - } - } - - MergeCompactor::ResultSet output_result_set; - output_result_set.append(std::move(decayed_items)); - return output_result_set; -} - } // namespace turtle_kv diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index 829b373..bd0dab0 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -6,6 +6,8 @@ #include #include +#include + namespace turtle_kv { //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - @@ -19,59 +21,34 @@ namespace turtle_kv { std::unique_ptr new_leaf = std::make_unique(batt::make_copy(pinned_leaf_page), tree_options); - const batt::TaskCount max_tasks{worker_pool.size() + 1}; - Slice packed_items = packed_leaf.items_slice(); std::vector buffer; buffer.reserve(packed_items.size()); - if (max_tasks == 1) { - for (const PackedKeyValue& pkv : packed_items) { - buffer.emplace_back(to_edit_view(pkv)); - } - } else { - const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); - - const auto src_begin = packed_items.begin(); - const auto src_end = packed_items.end(); - const auto dst_begin = buffer.begin(); - - const batt::WorkSlicePlan plan{batt::WorkSliceParams{ - algo_defaults.copy_edits.min_task_size, - max_tasks, - }, - src_begin, - src_end}; - - BATT_CHECK_GT(plan.n_tasks, 0); + { + batt::ScopedWorkContext context{worker_pool}; - { - batt::ScopedWorkContext work_context{worker_pool}; - - BATT_CHECK_OK(slice_work(work_context, - plan, - /*gen_work_fn=*/ - [&](usize /*task_index*/, isize task_offset, isize task_size) { - return [src_begin, dst_begin, task_offset, task_size] { - auto task_src_begin = std::next(src_begin, task_offset); - auto task_src_end = std::next(task_src_begin, task_size); - auto task_dst_begin = std::next(dst_begin, task_offset); - - for (; task_src_begin != task_src_end; ++task_src_begin) { - *task_dst_begin = to_edit_view(*task_src_begin); - ++task_dst_begin; - } - }; - })) - << "work_context must not be closed!"; - } + const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); + const batt::TaskCount max_tasks{worker_pool.size() + 1}; + + batt::parallel_transform( + context, + packed_items.begin(), + packed_items.end(), + buffer.data(), + [](const PackedKeyValue& pkv) -> EditView { + return to_edit_view(pkv); + }, + /*min_task_size = */ algo_defaults.copy_edits.min_task_size, + /*max_tasks = */ max_tasks); } MergeCompactor::ResultSet result_set; - result_set.append(std::move(buffer)); + const ItemView* first_edit = (const ItemView*)buffer.data(); + result_set.append(std::move(buffer), as_slice(first_edit, packed_items.size())); new_leaf->result_set = std::move(result_set); - new_leaf->set_edit_size_totals(compute_running_total(worker_pool, new_leaf->result_set)); + new_leaf->set_edit_size_totals(compute_running_total(worker_pool, *(new_leaf->result_set))); return {std::move(new_leaf)}; } @@ -97,7 +74,8 @@ StatusOr> InMemoryLeaf::try_split() { BATT_CHECK(this->edit_size_totals); BATT_CHECK(!this->edit_size_totals->empty()); - BATT_CHECK_EQ(this->result_set.size() + 1, // + BATT_CHECK(this->result_set); + BATT_CHECK_EQ(this->result_set->size() + 1, // this->edit_size_totals->size()); BATT_ASSIGN_OK_RESULT(SplitPlan plan, this->make_split_plan()); @@ -105,16 +83,16 @@ StatusOr> InMemoryLeaf::try_split() // Sanity checks. // BATT_CHECK_LT(0, plan.split_point); - BATT_CHECK_LT(plan.split_point, this->result_set.size()); + BATT_CHECK_LT(plan.split_point, this->result_set->size()); auto new_sibling = std::make_unique(batt::make_copy(this->pinned_leaf_page_), this->tree_options); new_sibling->result_set = this->result_set; { - const usize pre_drop_size = new_sibling->result_set.size(); - new_sibling->result_set.drop_before_n(plan.split_point); - const usize post_drop_size = new_sibling->result_set.size(); + const usize pre_drop_size = new_sibling->result_set->size(); + new_sibling->result_set->drop_before_n(plan.split_point); + const usize post_drop_size = new_sibling->result_set->size(); BATT_CHECK_EQ(post_drop_size, pre_drop_size - plan.split_point) << BATT_INSPECT(pre_drop_size) << BATT_INSPECT(plan); @@ -123,13 +101,13 @@ StatusOr> InMemoryLeaf::try_split() new_sibling->edit_size_totals = this->edit_size_totals; new_sibling->edit_size_totals->drop_front(plan.split_point); - this->result_set.drop_after_n(plan.split_point); + this->result_set->drop_after_n(plan.split_point); this->edit_size_totals->drop_back(this->edit_size_totals->size() - plan.split_point - 1); - BATT_CHECK_EQ(this->result_set.size() + 1, // + BATT_CHECK_EQ(this->result_set->size() + 1, // this->edit_size_totals->size()); - BATT_CHECK_EQ(new_sibling->result_set.size() + 1, // + BATT_CHECK_EQ(new_sibling->result_set->size() + 1, // new_sibling->edit_size_totals->size()); BATT_CHECK(!batt::is_case(this->get_viability())) @@ -223,12 +201,15 @@ StatusOr> InMemoryLeaf::try_merge( BatchUpdateContext& context, std::unique_ptr sibling) noexcept { - if (sibling->result_set.empty()) { + BATT_CHECK(this->result_set); + BATT_CHECK(sibling->result_set); + + if (sibling->result_set->empty()) { BATT_CHECK(batt::is_case(this->get_viability())); return nullptr; } - if (this->result_set.empty()) { + if (this->result_set->empty()) { BATT_CHECK(batt::is_case(sibling->get_viability())); this->pinned_leaf_page_ = std::move(sibling->pinned_leaf_page_); this->result_set = std::move(sibling->result_set); @@ -237,41 +218,166 @@ StatusOr> InMemoryLeaf::try_merge( return nullptr; } + if (this->get_items_size() + sibling->get_items_size() > this->tree_options.flush_size()) { + bool borrow_from_sibling = false; + if (batt::is_case(this->get_viability())) { + borrow_from_sibling = true; + } else { + BATT_CHECK(batt::is_case(sibling->get_viability())); + } + + Status borrow_status = borrow_from_sibling ? this->try_borrow(context, *sibling) + : sibling->try_borrow(context, *this); + BATT_REQUIRE_OK(borrow_status); + + return {std::move(sibling)}; + } + BATT_CHECK_LT(this->get_max_key(), sibling->get_min_key()); - this->result_set = MergeCompactor::ResultSet::concat(std::move(this->result_set), - std::move(sibling->result_set)); + this->result_set = MergeCompactor::ResultSet::concat(std::move(*this->result_set), + std::move(*(sibling->result_set))); - this->set_edit_size_totals(context.compute_running_total(this->result_set)); + this->set_edit_size_totals(context.compute_running_total(*this->result_set)); return nullptr; } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -Status InMemoryLeaf::apply_batch_update(BatchUpdate& update, - Optional>&& current_result_set) noexcept +Status InMemoryLeaf::try_borrow(BatchUpdateContext& context, InMemoryLeaf& sibling) noexcept { - if (current_result_set) { - // A valid BoxedSeq was passed in. Merge compact this sequence with the incoming - // update. + const usize orig_edit_count = sibling.result_set->size(); + BATT_CHECK_EQ(sibling.result_set->size() + 1, sibling.edit_size_totals->size()); + + // Calculate the minimum number of bytes we would need to borrow from the sibling to make this + // leaf viable. By definition, we need to get this leaf to be at least a quarter full. + // + const usize min_bytes_to_borrow = (this->tree_options.flush_size() / 4) - this->get_items_size(); + usize n_to_borrow = 0; + + const auto borrow_edits = [&context, &n_to_borrow]( + const auto& src_begin, + const auto& src_end, + MergeCompactor::ResultSet& dst_result_set) -> void { + std::vector buffer; + buffer.reserve(n_to_borrow); + { + batt::ScopedWorkContext work_context{context.worker_pool}; + + const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); + const batt::TaskCount max_tasks{context.worker_pool.size() + 1}; + + parallel_copy(work_context, + src_begin, + src_end, + buffer.data(), + /*min_task_size = */ algo_defaults.copy_edits.min_task_size, + /*max_tasks = */ max_tasks); + } + const ItemView* first_edit = (const ItemView*)buffer.data(); + dst_result_set.append(std::move(buffer), as_slice(first_edit, n_to_borrow)); + }; + + if (this->get_max_key() < sibling.get_min_key()) { + // If the sibling is the right sibling, we borrow from the front of the sibling. // - BATT_ASSIGN_OK_RESULT(this->result_set, - update.context.merge_compact_edits( - global_max_key(), - [&](MergeCompactor& compactor) -> Status { - compactor.push_level(update.result_set.live_edit_slices()); - compactor.push_level(std::move(*current_result_set)); - return OkStatus(); - })); + for (n_to_borrow = 1; n_to_borrow <= orig_edit_count; ++n_to_borrow) { + usize bytes = (*sibling.edit_size_totals)[n_to_borrow] - sibling.edit_size_totals->front(); + if (bytes >= min_bytes_to_borrow) { + break; + } + } + + // The number of edits being borrowed should always be less than the original edit count of + // the sibling, since borrowing everything is a full merge. + // + BATT_CHECK_LT(n_to_borrow, orig_edit_count); + + auto src_begin = sibling.result_set->get().begin(); + auto src_end = src_begin + n_to_borrow; + + // Copy over edits into this leaf's result_set. + // + borrow_edits(src_begin, src_end, *this->result_set); + + sibling.result_set->drop_before_n(n_to_borrow); + sibling.edit_size_totals->drop_front(n_to_borrow); } else { - // If nothing was passed in, we have a new leaf being populated for the first time (empty tree). + // If the sibling is the left sibling, we borrow from the back of the sibling. + // + for (n_to_borrow = 1; n_to_borrow <= orig_edit_count; ++n_to_borrow) { + usize bytes = sibling.edit_size_totals->back() - + (*sibling.edit_size_totals)[orig_edit_count - n_to_borrow]; + if (bytes >= min_bytes_to_borrow) { + break; + } + } + + BATT_CHECK_LT(n_to_borrow, orig_edit_count); + + usize new_edit_count = orig_edit_count - n_to_borrow; + + auto src_begin = sibling.result_set->get().begin() + new_edit_count; + auto src_end = sibling.result_set->get().end(); + + // Copy over the edits to be borrowed into an intermediary ResultSet and concat it with + // this leaf's current result_set. + // + MergeCompactor::ResultSet items_to_prepend; + + borrow_edits(src_begin, src_end, items_to_prepend); + + this->result_set = MergeCompactor::ResultSet::concat(std::move(items_to_prepend), + std::move(*this->result_set)); + + sibling.result_set->drop_after_n(new_edit_count); + sibling.edit_size_totals->drop_back(n_to_borrow); + } + + this->set_edit_size_totals(context.compute_running_total(*this->result_set)); + + BATT_CHECK(batt::is_case(sibling.get_viability())); + + return OkStatus(); +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Status InMemoryLeaf::apply_batch_update(BatchUpdate& update) noexcept +{ + Optional> current_result_set = None; + if (this->pinned_leaf_page_ && !this->result_set) { + // In this case, we have initialized a new InMemoryLeaf from a PackedLeaf. Use the + // items from the PackedLeaf to merge with the incoming update. // - this->result_set = update.context.decay_batch_to_items(update.result_set); + const PackedLeafPage& packed_leaf = PackedLeafPage::view_of(this->pinned_leaf_page_); + current_result_set = packed_leaf.as_edit_slice_seq(); + } else if (this->result_set) { + // In this case, we have an existing InMemoryLeaf that we are applying updates to. + // Use the existing ResultSet to merge with the incoming update. + // + current_result_set = this->result_set->live_edit_slices(); } - this->result_set.update_has_page_refs(update.result_set.has_page_refs()); - this->set_edit_size_totals(update.context.compute_running_total(this->result_set)); + // If we didn't enter either of the above two cases, we must have an empty tree that we are + // applying updates to. + // + BATT_CHECK_IMPLIES(!current_result_set, !this->pinned_leaf_page_ && !this->result_set); + + BATT_ASSIGN_OK_RESULT(this->result_set, + update.context.merge_compact_edits( + global_max_key(), + [&](MergeCompactor& compactor) -> Status { + compactor.push_level(update.result_set.live_edit_slices()); + if (current_result_set) { + compactor.push_level(std::move(*current_result_set)); + } + return OkStatus(); + })); + + this->result_set->update_has_page_refs(update.result_set.has_page_refs()); + this->set_edit_size_totals(update.context.compute_running_total(*this->result_set)); return OkStatus(); } @@ -282,6 +388,8 @@ Status InMemoryLeaf::apply_batch_update(BatchUpdate& update, // Status InMemoryLeaf::start_serialize(TreeSerializeContext& context) { + BATT_CHECK(this->result_set); + BATT_CHECK(!batt::is_case(this->get_viability())) << BATT_INSPECT(this->get_viability()) << BATT_INSPECT(this->get_items_size()) << BATT_INSPECT(this->tree_options.flush_size()); @@ -306,14 +414,14 @@ Status InMemoryLeaf::start_serialize(TreeSerializeContext& context) if (task_i == 0) { return build_leaf_page_in_job(this->tree_options.trie_index_reserve_size(), page_buffer, - this->result_set.get()); + this->result_set->get()); } BATT_CHECK_EQ(task_i, 1); return build_filter_for_leaf_in_job(page_cache, filter_bits_per_key, page_buffer.page_id(), - this->result_set.get()); + this->result_set->get()); })); BATT_CHECK_EQ(this->future_id_.exchange(future_id), ~u64{0}); diff --git a/src/turtle_kv/tree/in_memory_leaf.hpp b/src/turtle_kv/tree/in_memory_leaf.hpp index 5e24c8a..61df84a 100644 --- a/src/turtle_kv/tree/in_memory_leaf.hpp +++ b/src/turtle_kv/tree/in_memory_leaf.hpp @@ -33,7 +33,7 @@ struct InMemoryLeaf { llfs::PinnedPage pinned_leaf_page_; TreeOptions tree_options; - MergeCompactor::ResultSet result_set; + Optional> result_set; std::shared_ptr shared_edit_size_totals_; Optional edit_size_totals; mutable std::atomic future_id_{~u64{0}}; @@ -41,9 +41,9 @@ struct InMemoryLeaf { //+++++++++++-+-+--+----- --- -- - - - - static std::unique_ptr unpack(llfs::PinnedPage&& pinned_leaf_page, - const TreeOptions& tree_options, - const PackedLeafPage& packed_leaf, - batt::WorkerPool& worker_pool) noexcept; + const TreeOptions& tree_options, + const PackedLeafPage& packed_leaf, + batt::WorkerPool& worker_pool) noexcept; //+++++++++++-+-+--+----- --- -- - - - - @@ -51,6 +51,7 @@ struct InMemoryLeaf { const TreeOptions& tree_options_arg) noexcept : pinned_leaf_page_{std::move(pinned_leaf_page)} , tree_options{tree_options_arg} + , result_set{None} { } @@ -64,13 +65,15 @@ struct InMemoryLeaf { usize get_item_count() const { - return this->result_set.size(); + BATT_CHECK(this->result_set); + return this->result_set->size(); } usize get_items_size() const { BATT_CHECK(this->edit_size_totals); - BATT_CHECK_EQ(this->edit_size_totals->size(), this->result_set.size() + 1); + BATT_CHECK(this->result_set); + BATT_CHECK_EQ(this->edit_size_totals->size(), this->result_set->size() + 1); if (this->edit_size_totals->empty()) { return 0; @@ -80,17 +83,20 @@ struct InMemoryLeaf { KeyView get_min_key() const { - return this->result_set.get_min_key(); + BATT_CHECK(this->result_set); + return this->result_set->get_min_key(); } KeyView get_max_key() const { - return this->result_set.get_max_key(); + BATT_CHECK(this->result_set); + return this->result_set->get_max_key(); } StatusOr find_key(const KeyView& key) const { - return this->result_set.find_key(key); + BATT_CHECK(this->result_set); + return this->result_set->find_key(key); } SubtreeViability get_viability(); @@ -102,8 +108,9 @@ struct InMemoryLeaf { StatusOr> try_merge(BatchUpdateContext& context, std::unique_ptr sibling) noexcept; - Status apply_batch_update(BatchUpdate& update, - Optional>&& current_result_set) noexcept; + Status try_borrow(BatchUpdateContext& context, InMemoryLeaf& sibling) noexcept; + + Status apply_batch_update(BatchUpdate& update) noexcept; Status start_serialize(TreeSerializeContext& context); diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 9650952..aaca0b3 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -139,7 +139,7 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, auto new_leaf = std::make_unique(llfs::PinnedPage{}, tree_options); - BATT_REQUIRE_OK(new_leaf->apply_batch_update(update, /*current_result_set*/ None)); + BATT_REQUIRE_OK(new_leaf->apply_batch_update(update)); return Subtree{std::move(new_leaf)}; } @@ -164,13 +164,10 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, //+++++++++++-+-+--+----- --- -- - - - - // Case: {BatchUpdate} + {PackedLeafPage} => InMemoryLeaf - const PackedLeafPage& packed_leaf = PackedLeafPage::view_of(pinned_page); auto new_leaf = std::make_unique(batt::make_copy(pinned_page), tree_options); - BATT_REQUIRE_OK( - new_leaf->apply_batch_update(update, - /*current_result_set*/ packed_leaf.as_edit_slice_seq())); + BATT_REQUIRE_OK(new_leaf->apply_batch_update(update)); return Subtree{std::move(new_leaf)}; @@ -198,9 +195,7 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, BATT_CHECK_EQ(parent_height, 2); - BATT_REQUIRE_OK(in_memory_leaf->apply_batch_update( - update, - /*current_result_set*/ in_memory_leaf->result_set.live_edit_slices())); + BATT_REQUIRE_OK(in_memory_leaf->apply_batch_update(update)); return Subtree{std::move(in_memory_leaf)}; }, From 4b0f7927f9295f8090a7cfc4c86444dde4f7c2a0 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 6 Jan 2026 21:12:22 -0500 Subject: [PATCH 16/48] Fix node find key --- src/turtle_kv/tree/algo/nodes.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/turtle_kv/tree/algo/nodes.hpp b/src/turtle_kv/tree/algo/nodes.hpp index 50c066a..3cb5fbb 100644 --- a/src/turtle_kv/tree/algo/nodes.hpp +++ b/src/turtle_kv/tree/algo/nodes.hpp @@ -169,6 +169,9 @@ struct NodeAlgorithms { BATT_ASSIGN_OK_RESULT(const bool done, combine_in_place(&value, found_in_level)); if (done) { BATT_CHECK(value); + if (value->is_delete()) { + return {batt::StatusCode::kNotFound}; + } return *value; } } From ac7e0786785e17bc7456482d1d5f451d2b9d602c Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 24 Feb 2026 09:50:42 -0500 Subject: [PATCH 17/48] Initial commit, tests passing --- src/turtle_kv/tree/algo/segmented_levels.hpp | 7 +-- src/turtle_kv/tree/algo/segments.hpp | 28 +++++++-- src/turtle_kv/tree/in_memory_node.cpp | 51 +++++++++++++--- src/turtle_kv/tree/in_memory_node.hpp | 32 ++++++++-- src/turtle_kv/tree/packed_node_page.hpp | 7 ++- src/turtle_kv/tree/testing/fake_segment.hpp | 62 ++++++++++++++++---- 6 files changed, 150 insertions(+), 37 deletions(-) diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index b6c0c39..82fbb78 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -181,8 +181,7 @@ struct SegmentedLevelAlgorithms { // Skip this segment if the pivot is not active. // - const u64 active_pivots = segment.get_active_pivots(); - if (!get_bit(active_pivots, pivot_i)) { + if (!segment.is_pivot_active(pivot_i)) { ++segment_i; continue; } @@ -208,7 +207,7 @@ struct SegmentedLevelAlgorithms { // Drop the segment if it has become inactive due to the flush. // - if (segment.get_active_pivots() == 0) { + if (segment.is_inactive()) { this->level_.drop_segment(segment_i); } else { ++segment_i; @@ -236,7 +235,7 @@ struct SegmentedLevelAlgorithms { << batt::c_str_literal(old_pivot_key_range.upper_bound) << "), key=" << batt::c_str_literal(split_key) << ")"; - BATT_CHECK_LT(this->node_.pivot_count(), 64); + BATT_CHECK_LT(this->node_.pivot_count(), 67); const KeyView pivot_key = old_pivot_key_range.lower_bound; const usize segment_count = this->level_.segment_count(); diff --git a/src/turtle_kv/tree/algo/segments.hpp b/src/turtle_kv/tree/algo/segments.hpp index eaeca0e..69b85ee 100644 --- a/src/turtle_kv/tree/algo/segments.hpp +++ b/src/turtle_kv/tree/algo/segments.hpp @@ -47,7 +47,7 @@ struct SegmentAlgorithms { this->segment_.check_invariants(__FILE__, __LINE__); }); - BATT_CHECK_LT(pivot_i, 63); + BATT_CHECK_LT(pivot_i, 67); // Simplest case: pivot not active for this segment. // @@ -116,14 +116,32 @@ struct SegmentAlgorithms { // they were when this function was entered, regardless of what `fn` may do to change the state // of the segment. // - const u64 observed_active_pivots = this->segment_.get_active_pivots(); + const std::pair observed_active_pivots = + this->segment_.get_active_pivots_with_overflow(); + + const u64 first_active_bit = first_bit(observed_active_pivots.first); + const u64 first_active_overflow_bit = first_bit(observed_active_pivots.second); + const u64 first_active_pivot = + (first_active_bit != 64) ? first_active_bit : (first_active_overflow_bit + 64); const i32 first_pivot_i = std::max(pivot_range.lower_bound, // - first_bit(observed_active_pivots)); + first_active_pivot); - for (i32 pivot_i = first_pivot_i; pivot_i < pivot_range.upper_bound; - pivot_i = next_bit(observed_active_pivots, pivot_i)) { + for (i32 pivot_i = first_pivot_i; pivot_i < pivot_range.upper_bound;) { BATT_INVOKE_LOOP_FN((fn, this->segment_, pivot_i)); + + if (pivot_i < 64) { + pivot_i = next_bit(observed_active_pivots.first, pivot_i); + if (pivot_i == 64) { + pivot_i = first_bit(observed_active_pivots.second) + 64; + } + } else { + const i32 overflow_i = pivot_i - 64; + if (overflow_i >= 64) { + break; + } + pivot_i = next_bit(observed_active_pivots.second, overflow_i) + 64; + } } } diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index fd368dc..a9deaba 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -777,7 +777,7 @@ Status InMemoryNode::set_pivot_completely_flushed(usize pivot_i, segment.set_pivot_active(pivot_i, false); - if (segment.get_active_pivots() == 0) { + if (segment.is_inactive()) { segmented_level.drop_segment(segment_i); } else { ++segment_i; @@ -1118,8 +1118,27 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda BATT_CHECK_EQ(orig_pivot_count + 1, orig_pivot_keys.size()); u64 tried_already = 0; + u64 tried_already_overflow = 0; usize split_pivot_i = (orig_pivot_count + 1) / 2; + const auto get_tried_bit = [&tried_already, &tried_already_overflow](usize i) -> bool { + if (i < 64) { + return get_bit(tried_already, i); + } else { + const i32 overflow_i = i - 64; + return get_bit(tried_already_overflow, overflow_i); + } + }; + + auto set_tried_bit = [&tried_already, &tried_already_overflow](usize i) { + if (i < 64) { + tried_already = set_bit(tried_already, i, true); + } else { + const i32 overflow_i = i - 64; + tried_already_overflow = set_bit(tried_already_overflow, overflow_i, true); + } + }; + auto* node_lower_half = this; auto node_upper_half = std::make_unique(batt::make_copy(this->pinned_node_page_), this->tree_options, @@ -1132,10 +1151,10 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda for (;;) { // If we ever try the same split point a second time, fail. // - if (get_bit(tried_already, split_pivot_i)) { + if (get_tried_bit(split_pivot_i)) { return {batt::StatusCode::kInternal}; } - tried_already = set_bit(tried_already, split_pivot_i, true); + set_tried_bit(split_pivot_i); //+++++++++++-+-+--+----- --- -- - - - - @@ -1265,7 +1284,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda // If the upper half is too large, then move the split point up and retry if possible. // - if (split_pivot_i + 4 < 64 && batt::is_case(upper_viability) && + if (split_pivot_i + 4 < 68 && batt::is_case(upper_viability) && !batt::is_case(lower_viability)) { ++split_pivot_i; continue; @@ -1627,7 +1646,7 @@ void InMemoryNode::UpdateBuffer::SegmentedLevel::drop_after_pivot(i32 pivot_i, llfs::PageLoader& page_loader, const TreeOptions& tree_options) { - this->drop_pivot_range((Interval{pivot_i, 64}), + this->drop_pivot_range((Interval{pivot_i, 68}), (Interval{pivot_key, global_max_key()}), page_loader, tree_options); @@ -1701,7 +1720,17 @@ void InMemoryNode::UpdateBuffer::Segment::insert_pivot(i32 pivot_i, bool is_acti this->check_invariants(__FILE__, __LINE__); }); - this->active_pivots = insert_bit(this->active_pivots, pivot_i, is_active); + if (pivot_i < 64) { + // Insert the highest bit from active_pivots to the overflow bit set. + // + this->active_pivots_overflow = + (this->active_pivots_overflow << 1) | ((this->active_pivots >> 63) & u64{1}); + + this->active_pivots = insert_bit(this->active_pivots, pivot_i, is_active); + } else { + const i32 overflow_i = pivot_i - 64; + this->active_pivots_overflow = insert_bit(this->active_pivots_overflow, overflow_i, is_active); + } } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - @@ -1712,6 +1741,8 @@ void InMemoryNode::UpdateBuffer::Segment::pop_front_pivots(i32 count) return; } + BATT_CHECK_LT(count, 64); + // Before we modify the bit sets, make sure we aren't losing any active pivots. // const u64 mask = (u64{1} << count) - 1; @@ -1719,16 +1750,18 @@ void InMemoryNode::UpdateBuffer::Segment::pop_front_pivots(i32 count) BATT_CHECK_EQ(bit_count(mask), count); BATT_CHECK_EQ((this->active_pivots & mask), u64{0}); - // Shift the active pivot set down by count. + // Shift the active pivot sets down by count. // - this->active_pivots = (this->active_pivots >> count); + this->active_pivots = + (this->active_pivots >> count) | (this->active_pivots_overflow << (64 - count)); + this->active_pivots_overflow = (this->active_pivots_overflow >> count); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // bool InMemoryNode::UpdateBuffer::Segment::is_inactive() const { - const bool inactive = (this->active_pivots == 0); + const bool inactive = (this->active_pivots == 0 && this->active_pivots_overflow == 0); if (inactive) { Slice> filter_dropped_ranges = this->filter.dropped(); BATT_CHECK_EQ(filter_dropped_ranges.size(), 1); diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index e52d107..e3339ed 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -85,10 +85,16 @@ struct InMemoryNode { */ llfs::PageIdSlot page_id_slot; - /** \brief A bit set of pivots in whose key range this segment contains items. + /** \brief A bit set of pivots in whose key range this segment contains items. Used for + * pivots [0, 64). */ u64 active_pivots = 0; + /** \brief A bit set of pivots in whose key range this segment contains items. Used for + * pivots 64 and greater. + */ + u64 active_pivots_overflow = 0; + /** \brief A filter over the flushed items in this segment. */ PiecewiseFilter filter; @@ -110,18 +116,34 @@ struct InMemoryNode { return this->active_pivots; } + /** \brief Returns the active pivots bit set, as well as the overflow bit set. + */ + std::pair get_active_pivots_with_overflow() const + { + return std::make_pair(this->active_pivots, this->active_pivots_overflow); + } + /** \brief Marks this segment as containing (or not) active keys addressed to `pivot_i`. */ void set_pivot_active(i32 pivot_i, bool active) { - this->active_pivots = set_bit(this->active_pivots, pivot_i, active); + if (pivot_i < 64) { + this->active_pivots = set_bit(this->active_pivots, pivot_i, active); + } else { + this->active_pivots_overflow = + set_bit(this->active_pivots_overflow, pivot_i - 64, active); + } } /** \brief Returns true iff this segment has active keys addressed to `pivot_i`. */ bool is_pivot_active(i32 pivot_i) const { - return get_bit(this->active_pivots, pivot_i); + if (pivot_i < 64) { + return get_bit(this->active_pivots, pivot_i); + } else { + return get_bit(this->active_pivots_overflow, pivot_i - 64); + } } template @@ -460,12 +482,12 @@ struct InMemoryNode { usize max_pivot_count() const { - return this->is_size_tiered() ? (64 - 1) : (64 - 1); + return this->is_size_tiered() ? 64 : 64; } usize max_segment_count() const { - return this->is_size_tiered() ? (64 - 2) : (64 - 2); + return this->is_size_tiered() ? (64 - 1) : (64 - 1); } Slice get_pivot_keys() const diff --git a/src/turtle_kv/tree/packed_node_page.hpp b/src/turtle_kv/tree/packed_node_page.hpp index 3f12fcf..fb8cc13 100644 --- a/src/turtle_kv/tree/packed_node_page.hpp +++ b/src/turtle_kv/tree/packed_node_page.hpp @@ -49,7 +49,7 @@ struct PackedNodePage { kMaxPivots + 1 /*max_key*/ + 1 /*common_prefix*/ + 1 /*final_offset*/; static constexpr u8 kFlagSizeTiered = 0x80; - static constexpr u8 kPivotCountMask = 0x3f; + static constexpr u8 kPivotCountMask = 0x7f; static constexpr u16 kSegmentStartsFiltered = 0x8000; using Key = PackedNodePageKey; @@ -143,6 +143,11 @@ struct PackedNodePage { return this->active_pivots; } + std::pair get_active_pivots_with_overflow() const + { + return std::make_pair(this->active_pivots, u64{0}); + } + llfs::PageId get_leaf_page_id() const { return this->leaf_page_id.unpack(); diff --git a/src/turtle_kv/tree/testing/fake_segment.hpp b/src/turtle_kv/tree/testing/fake_segment.hpp index a38adc3..aa0d261 100644 --- a/src/turtle_kv/tree/testing/fake_segment.hpp +++ b/src/turtle_kv/tree/testing/fake_segment.hpp @@ -25,7 +25,8 @@ struct FakeLevel; struct FakeSegment { llfs::PageId page_id_; u64 active_pivots_ = 0; - PiecewiseFilter filter; + u64 active_pivots_overflow_ = 0; + PiecewiseFilter filter_; std::map pivot_items_count_; //+++++++++++-+-+--+----- --- -- - - - - @@ -47,24 +48,48 @@ struct FakeSegment { return this->active_pivots_; } - bool is_pivot_active(usize pivot_i) const + std::pair get_active_pivots_with_overflow() const { - return get_bit(this->active_pivots_, pivot_i); + return std::make_pair(this->active_pivots_, this->active_pivots_overflow_); } - void set_pivot_active(usize pivot_i, bool active) + bool is_pivot_active(i32 pivot_i) const { - this->active_pivots_ = set_bit(this->active_pivots_, pivot_i, active); + if (pivot_i < 64) { + return get_bit(this->active_pivots_, pivot_i); + } else { + return get_bit(this->active_pivots_overflow_, pivot_i - 64); + } + } + + void set_pivot_active(i32 pivot_i, bool active) + { + if (pivot_i < 64) { + this->active_pivots_ = set_bit(this->active_pivots_, pivot_i, active); + } else { + this->active_pivots_overflow_ = set_bit(this->active_pivots_overflow_, pivot_i - 64, active); + } } void insert_active_pivot(usize pivot_i, bool is_active = true) { - this->active_pivots_ = insert_bit(this->active_pivots_, pivot_i, is_active); + if (pivot_i < 64) { + // Insert the highest bit from active_pivots to the overflow bit set. + // + this->active_pivots_overflow_ = + (this->active_pivots_overflow_ << 1) | ((this->active_pivots_ >> 63) & u64{1}); + + this->active_pivots_ = insert_bit(this->active_pivots_, pivot_i, is_active); + } else { + const i32 overflow_i = pivot_i - 64; + this->active_pivots_overflow_ = + insert_bit(this->active_pivots_overflow_, overflow_i, is_active); + } } void set_pivot_items_count(usize pivot_i, usize count) { - this->active_pivots_ = set_bit(this->active_pivots_, pivot_i, (count > 0)); + this->set_pivot_active(pivot_i, (count > 0)); if (count > 0) { this->pivot_items_count_[pivot_i] = count; } else { @@ -88,36 +113,47 @@ struct FakeSegment { this->pivot_items_count_.clear(); } + bool is_inactive() const + { + const bool inactive = (this->active_pivots_ == 0 && this->active_pivots_overflow_ == 0); + if (inactive) { + Slice> filter_dropped_ranges = this->filter_.dropped(); + BATT_CHECK_EQ(filter_dropped_ranges.size(), 1); + BATT_CHECK_EQ(filter_dropped_ranges[0].lower_bound, 0); + } + return inactive; + } + template void drop_key_range(const BasicInterval& key_range, const Slice& items) { - drop_item_range(this->filter, items, key_range, llfs::KeyRangeOrder{}); + drop_item_range(this->filter_, items, key_range, llfs::KeyRangeOrder{}); } void drop_index_range(Interval i) { - this->filter.drop_index_range(i); + this->filter_.drop_index_range(i); } bool is_index_filtered(const FakeLevel&, u32 index) const { - return !this->filter.live_at_index(index); + return !this->filter_.live_at_index(index); } bool is_unfiltered() const { - return !this->filter.dropped_total(); + return !this->filter_.dropped_total(); } u32 live_lower_bound(const FakeLevel&, u32 item_i) const { - return this->filter.live_lower_bound(item_i); + return this->filter_.live_lower_bound(item_i); } Interval get_live_item_range(const FakeLevel&, Interval i) const { - return this->filter.find_live_range(i); + return this->filter_.find_live_range(i); } }; From 4c4476799f689e91b78630bae5e8fd37a65f572e Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 25 Feb 2026 09:40:45 -0500 Subject: [PATCH 18/48] Updated filter test --- src/turtle_kv/config.hpp | 2 + src/turtle_kv/tree/algo/segmented_levels.hpp | 2 +- src/turtle_kv/tree/algo/segments.hpp | 3 +- src/turtle_kv/tree/in_memory_node.cpp | 5 +- src/turtle_kv/tree/in_memory_node.hpp | 7 +- src/turtle_kv/tree/packed_node_page.cpp | 2 +- src/turtle_kv/tree/packed_node_page.hpp | 1 - src/turtle_kv/util/piecewise_filter.test.cpp | 105 +++++++++++-------- 8 files changed, 73 insertions(+), 54 deletions(-) diff --git a/src/turtle_kv/config.hpp b/src/turtle_kv/config.hpp index b8a49a5..ec4b104 100644 --- a/src/turtle_kv/config.hpp +++ b/src/turtle_kv/config.hpp @@ -91,4 +91,6 @@ constexpr i64 kNewLeafLruPriority = kLeafLruPriority + kNewPagePriorityBoost; constexpr u32 kDefaultLeafShardedViewSize = 4096; +constexpr usize kMaxPivots = 64; + } // namespace turtle_kv diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index 82fbb78..bc1d434 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -235,7 +235,7 @@ struct SegmentedLevelAlgorithms { << batt::c_str_literal(old_pivot_key_range.upper_bound) << "), key=" << batt::c_str_literal(split_key) << ")"; - BATT_CHECK_LT(this->node_.pivot_count(), 67); + BATT_CHECK_LT(this->node_.pivot_count(), InMemoryNode::kMaxTempPivots); const KeyView pivot_key = old_pivot_key_range.lower_bound; const usize segment_count = this->level_.segment_count(); diff --git a/src/turtle_kv/tree/algo/segments.hpp b/src/turtle_kv/tree/algo/segments.hpp index 69b85ee..f44af5b 100644 --- a/src/turtle_kv/tree/algo/segments.hpp +++ b/src/turtle_kv/tree/algo/segments.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -47,7 +48,7 @@ struct SegmentAlgorithms { this->segment_.check_invariants(__FILE__, __LINE__); }); - BATT_CHECK_LT(pivot_i, 67); + BATT_CHECK_LT(pivot_i, InMemoryNode::kMaxTempPivots - 1); // Simplest case: pivot not active for this segment. // diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index a9deaba..467a41c 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -1284,7 +1285,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda // If the upper half is too large, then move the split point up and retry if possible. // - if (split_pivot_i + 4 < 68 && batt::is_case(upper_viability) && + if (split_pivot_i + 4 < orig_pivot_count && batt::is_case(upper_viability) && !batt::is_case(lower_viability)) { ++split_pivot_i; continue; @@ -1646,7 +1647,7 @@ void InMemoryNode::UpdateBuffer::SegmentedLevel::drop_after_pivot(i32 pivot_i, llfs::PageLoader& page_loader, const TreeOptions& tree_options) { - this->drop_pivot_range((Interval{pivot_i, 68}), + this->drop_pivot_range((Interval{pivot_i, InMemoryNode::kMaxTempPivots}), (Interval{pivot_key, global_max_key()}), page_loader, tree_options); diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index e3339ed..196c711 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -65,6 +64,8 @@ struct InMemoryNode { //+++++++++++-+-+--+----- --- -- - - - - + static constexpr usize kMaxTempPivots = 128; + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // /** \brief Mutable, in-memory representation of a node update buffer. @@ -482,12 +483,12 @@ struct InMemoryNode { usize max_pivot_count() const { - return this->is_size_tiered() ? 64 : 64; + return this->is_size_tiered() ? kMaxPivots : kMaxPivots; } usize max_segment_count() const { - return this->is_size_tiered() ? (64 - 1) : (64 - 1); + return this->is_size_tiered() ? (kMaxPivots - 1) : (kMaxPivots - 1); } Slice get_pivot_keys() const diff --git a/src/turtle_kv/tree/packed_node_page.cpp b/src/turtle_kv/tree/packed_node_page.cpp index 6515a71..09f1423 100644 --- a/src/turtle_kv/tree/packed_node_page.cpp +++ b/src/turtle_kv/tree/packed_node_page.cpp @@ -68,7 +68,7 @@ PackedNodePage* build_node_page(const MutableBuffer& buffer, const InMemoryNode& const usize pivot_count = src_node.pivot_count(); - BATT_CHECK_LE(pivot_count, PackedNodePage::kMaxPivots); + BATT_CHECK_LE(pivot_count, kMaxPivots); BATT_CHECK_EQ(src_node.pivot_keys_.size(), pivot_count + 1); BATT_CHECK_EQ(src_node.children.size(), pivot_count); BATT_CHECK_EQ(src_node.pending_bytes.size(), pivot_count); diff --git a/src/turtle_kv/tree/packed_node_page.hpp b/src/turtle_kv/tree/packed_node_page.hpp index fb8cc13..15e2c9b 100644 --- a/src/turtle_kv/tree/packed_node_page.hpp +++ b/src/turtle_kv/tree/packed_node_page.hpp @@ -42,7 +42,6 @@ struct Subtree; //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- struct PackedNodePage { - static constexpr usize kMaxPivots = 64; static constexpr usize kMaxSegments = kMaxPivots - 1; static constexpr usize kMaxLevels = batt::log2_ceil(kMaxPivots); static constexpr usize kPivotKeysSize = diff --git a/src/turtle_kv/util/piecewise_filter.test.cpp b/src/turtle_kv/util/piecewise_filter.test.cpp index 696c600..72c8f7a 100644 --- a/src/turtle_kv/util/piecewise_filter.test.cpp +++ b/src/turtle_kv/util/piecewise_filter.test.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -63,68 +64,82 @@ TEST(PiecewiseFilterTest, InvalidFilterTest) // TEST(PiecewiseFilterTest, QueryTest) { + const usize num_items = 1000000; + for (usize i = 0; i < 100; ++i) { std::default_random_engine rng{i}; - std::uniform_int_distribution pick_num_items{10, ~usize{0} - 1}; - const usize num_items = pick_num_items(rng); PiecewiseFilter filter; + EXPECT_TRUE(filter.check_invariants()); - // First verify that everything should be live since nothing has been dropped yet. + // All items start unfiltered. // - EXPECT_EQ(filter.dropped_total(), 0); - - Interval next_live_interval = filter.find_live_range(Interval{0, num_items}); - EXPECT_EQ(next_live_interval, (Interval{0, num_items})); + std::set live_items; + for (usize i = 0; i < num_items; ++i) { + live_items.insert(i); + } - // Drop some items from the start. + // Drop random intervals. // - filter.drop_index_range(Interval{0, num_items / 10}); + std::uniform_int_distribution pick_num_dropped{100, num_items / 2}; + usize num_intervals_dropped = pick_num_dropped(rng); + for (usize i = 0; i < num_intervals_dropped; ++i) { + std::uniform_int_distribution pick_interval_start{0, num_items - 1}; + usize start_i = pick_interval_start(rng); - EXPECT_EQ(filter.dropped_total(), num_items / 10); + std::uniform_int_distribution pick_interval_end{start_i, num_items}; + usize end_i = pick_interval_end(rng); - EXPECT_FALSE(filter.live_at_index(0)); - EXPECT_TRUE(filter.live_at_index(num_items / 10)); + for (usize j = start_i; j < end_i; ++j) { + live_items.erase(j); + } - EXPECT_EQ(filter.live_lower_bound(0), num_items / 10); - EXPECT_EQ(filter.live_lower_bound(num_items / 10), num_items / 10); + filter.drop_index_range(Interval{start_i, end_i}); + } - EXPECT_EQ(filter.find_live_range(Interval{0, num_items}), - (Interval{num_items / 10, num_items})); + EXPECT_TRUE(filter.check_invariants()); - // Drop some more items from the middle of the item range (not overlapping with the previous - // interval). + // Test live_at_index // - filter.drop_index_range(Interval{num_items / 2, (num_items / 5) * 3}); - - usize dropped_item_count = (num_items / 10) + ((num_items / 5) * 3) - (num_items / 2); - EXPECT_EQ(filter.dropped_total(), dropped_item_count); - - EXPECT_FALSE(filter.live_at_index(num_items / 2)); - EXPECT_TRUE(filter.live_at_index(num_items / 5)); - EXPECT_TRUE(filter.live_at_index((num_items / 5) * 4)); - - EXPECT_EQ(filter.live_lower_bound(num_items / 10), num_items / 10); - EXPECT_EQ(filter.live_lower_bound(num_items / 5), num_items / 5); - EXPECT_EQ(filter.live_lower_bound((num_items / 5) * 4), (num_items / 5) * 4); - - EXPECT_EQ(filter.find_live_range(Interval{0, num_items}), - (Interval{num_items / 10, num_items / 2})); - - EXPECT_EQ(filter.find_live_range(Interval{num_items / 10, (num_items / 5) * 2}), - (Interval{num_items / 10, (num_items / 5) * 2})); + for (usize i = 0; i < num_items; ++i) { + EXPECT_EQ(filter.live_at_index(i), live_items.count(i) > 0); + } - // Drop a range with some overlap with the previous ranges. + // Test live_lower_bound // - filter.drop_index_range(Interval{num_items / 10, (num_items / 5) * 3}); - - EXPECT_FALSE(filter.live_at_index(num_items / 5)); - EXPECT_TRUE(filter.live_at_index((num_items / 5) * 3)); - - EXPECT_EQ(filter.find_live_range(Interval{0, num_items}), - (Interval{(num_items / 5) * 3, num_items})); + for (usize i = 0; i < num_items; ++i) { + auto iter = live_items.lower_bound(i); + usize expected = (iter != live_items.end()) ? *iter : num_items; + EXPECT_EQ(filter.live_lower_bound(i), expected); + } - EXPECT_TRUE(filter.check_invariants()); + // Test find_live_range + // + for (usize i = 0; i < 100; ++i) { + std::uniform_int_distribution pick_interval_start{0, num_items - 1}; + usize start_i = pick_interval_start(rng); + + std::uniform_int_distribution pick_interval_end{start_i, num_items}; + usize end_i = pick_interval_end(rng); + + auto iter = live_items.lower_bound(start_i); + if (iter == live_items.end() || *iter >= end_i) { + EXPECT_EQ(filter.find_live_range(Interval{start_i, end_i}), + (Interval{end_i, end_i})); + } else { + usize first = *iter; + usize last = first; + auto next = iter; + + while (next != live_items.end() && *next < end_i && *next == last) { + ++last; + ++next; + } + + EXPECT_EQ(filter.find_live_range(Interval{start_i, end_i}), + (Interval{first, last})); + } + } } } From c398037b58d7bc8d6bdaa098a96580de9b31df1a Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 2 Mar 2026 10:18:47 -0500 Subject: [PATCH 19/48] New batteries functions --- src/turtle_kv/tree/algo/segmented_levels.hpp | 9 +++---- src/turtle_kv/tree/algo/segments.hpp | 25 ++++---------------- src/turtle_kv/tree/in_memory_node.cpp | 15 ++++-------- src/turtle_kv/tree/in_memory_node.hpp | 20 ++++++---------- src/turtle_kv/tree/packed_node_page.hpp | 4 ++-- 5 files changed, 23 insertions(+), 50 deletions(-) diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index bc1d434..f0ed498 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -91,21 +91,22 @@ inline i32 get_last_active_pivot(const CInterval& pivot_range) //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - template -using EnableIfHasActivePivotsBitset = - std::enable_if_t().get_active_pivots()), u64>>; +using EnableIfHasActivePivotsBitset = std::enable_if_t< + std::is_convertible_v().get_active_pivots_with_overflow()), + std::array>>; //----- --- -- - - - - template > inline i32 get_first_active_pivot(T&& segment) { - return first_bit(segment.get_active_pivots()); + return first_bit(segment.get_active_pivots_with_overflow()); } template > inline i32 get_last_active_pivot(T&& segment) { - return last_bit(segment.get_active_pivots()); + return last_bit(segment.get_active_pivots_with_overflow()); } //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- diff --git a/src/turtle_kv/tree/algo/segments.hpp b/src/turtle_kv/tree/algo/segments.hpp index f44af5b..5db1e0b 100644 --- a/src/turtle_kv/tree/algo/segments.hpp +++ b/src/turtle_kv/tree/algo/segments.hpp @@ -1,7 +1,7 @@ #pragma once -#include #include +#include #include #include @@ -117,32 +117,17 @@ struct SegmentAlgorithms { // they were when this function was entered, regardless of what `fn` may do to change the state // of the segment. // - const std::pair observed_active_pivots = + const std::array observed_active_pivots = this->segment_.get_active_pivots_with_overflow(); - const u64 first_active_bit = first_bit(observed_active_pivots.first); - const u64 first_active_overflow_bit = first_bit(observed_active_pivots.second); - const u64 first_active_pivot = - (first_active_bit != 64) ? first_active_bit : (first_active_overflow_bit + 64); + const i32 first_active_pivot = first_bit(observed_active_pivots); const i32 first_pivot_i = std::max(pivot_range.lower_bound, // first_active_pivot); - for (i32 pivot_i = first_pivot_i; pivot_i < pivot_range.upper_bound;) { + for (i32 pivot_i = first_pivot_i; pivot_i < pivot_range.upper_bound; + pivot_i = next_bit(observed_active_pivots, pivot_i)) { BATT_INVOKE_LOOP_FN((fn, this->segment_, pivot_i)); - - if (pivot_i < 64) { - pivot_i = next_bit(observed_active_pivots.first, pivot_i); - if (pivot_i == 64) { - pivot_i = first_bit(observed_active_pivots.second) + 64; - } - } else { - const i32 overflow_i = pivot_i - 64; - if (overflow_i >= 64) { - break; - } - pivot_i = next_bit(observed_active_pivots.second, overflow_i) + 64; - } } } diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 467a41c..e6adba9 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -1721,17 +1721,10 @@ void InMemoryNode::UpdateBuffer::Segment::insert_pivot(i32 pivot_i, bool is_acti this->check_invariants(__FILE__, __LINE__); }); - if (pivot_i < 64) { - // Insert the highest bit from active_pivots to the overflow bit set. - // - this->active_pivots_overflow = - (this->active_pivots_overflow << 1) | ((this->active_pivots >> 63) & u64{1}); - - this->active_pivots = insert_bit(this->active_pivots, pivot_i, is_active); - } else { - const i32 overflow_i = pivot_i - 64; - this->active_pivots_overflow = insert_bit(this->active_pivots_overflow, overflow_i, is_active); - } + std::array active_pivots_out = + insert_bit({this->active_pivots, this->active_pivots_overflow}, pivot_i, is_active); + this->active_pivots = active_pivots_out[0]; + this->active_pivots_overflow = active_pivots_out[1]; } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 196c711..b3992de 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -119,32 +119,26 @@ struct InMemoryNode { /** \brief Returns the active pivots bit set, as well as the overflow bit set. */ - std::pair get_active_pivots_with_overflow() const + std::array get_active_pivots_with_overflow() const { - return std::make_pair(this->active_pivots, this->active_pivots_overflow); + return {this->active_pivots, this->active_pivots_overflow}; } /** \brief Marks this segment as containing (or not) active keys addressed to `pivot_i`. */ void set_pivot_active(i32 pivot_i, bool active) { - if (pivot_i < 64) { - this->active_pivots = set_bit(this->active_pivots, pivot_i, active); - } else { - this->active_pivots_overflow = - set_bit(this->active_pivots_overflow, pivot_i - 64, active); - } + std::array active_pivots_out = + set_bit({this->active_pivots, this->active_pivots_overflow}, pivot_i, active); + this->active_pivots = active_pivots_out[0]; + this->active_pivots_overflow = active_pivots_out[1]; } /** \brief Returns true iff this segment has active keys addressed to `pivot_i`. */ bool is_pivot_active(i32 pivot_i) const { - if (pivot_i < 64) { - return get_bit(this->active_pivots, pivot_i); - } else { - return get_bit(this->active_pivots_overflow, pivot_i - 64); - } + return get_bit({this->active_pivots, this->active_pivots_overflow}, pivot_i); } template diff --git a/src/turtle_kv/tree/packed_node_page.hpp b/src/turtle_kv/tree/packed_node_page.hpp index 15e2c9b..0c2513b 100644 --- a/src/turtle_kv/tree/packed_node_page.hpp +++ b/src/turtle_kv/tree/packed_node_page.hpp @@ -142,9 +142,9 @@ struct PackedNodePage { return this->active_pivots; } - std::pair get_active_pivots_with_overflow() const + std::array get_active_pivots_with_overflow() const { - return std::make_pair(this->active_pivots, u64{0}); + return {this->active_pivots, u64{0}}; } llfs::PageId get_leaf_page_id() const From f19922b17e2ad82cefd1ff68a9df1426dfa252c6 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 2 Mar 2026 12:20:06 -0500 Subject: [PATCH 20/48] Start refactor of merge functions --- src/turtle_kv/tree/in_memory_leaf.cpp | 123 +-------- src/turtle_kv/tree/in_memory_leaf.hpp | 5 +- src/turtle_kv/tree/in_memory_node.cpp | 354 ++------------------------ src/turtle_kv/tree/in_memory_node.hpp | 9 +- src/turtle_kv/tree/subtree.cpp | 32 +-- src/turtle_kv/tree/subtree.hpp | 3 +- 6 files changed, 44 insertions(+), 482 deletions(-) diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index 3f0b186..84ea245 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -197,16 +197,15 @@ auto InMemoryLeaf::make_split_plan() const -> StatusOr //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr> InMemoryLeaf::try_merge( - BatchUpdateContext& context, - std::unique_ptr sibling) noexcept +Status InMemoryLeaf::try_merge(BatchUpdateContext& context, + std::unique_ptr sibling) noexcept { BATT_CHECK(this->result_set); BATT_CHECK(sibling->result_set); if (sibling->result_set->empty()) { BATT_CHECK(batt::is_case(this->get_viability())); - return nullptr; + return OkStatus(); } if (this->result_set->empty()) { @@ -215,22 +214,7 @@ StatusOr> InMemoryLeaf::try_merge( this->result_set = std::move(sibling->result_set); this->shared_edit_size_totals_ = sibling->shared_edit_size_totals_; this->edit_size_totals = std::move(sibling->edit_size_totals); - return nullptr; - } - - if (this->get_items_size() + sibling->get_items_size() > this->tree_options.flush_size()) { - bool borrow_from_sibling = false; - if (batt::is_case(this->get_viability())) { - borrow_from_sibling = true; - } else { - BATT_CHECK(batt::is_case(sibling->get_viability())); - } - - Status borrow_status = borrow_from_sibling ? this->try_borrow(context, *sibling) - : sibling->try_borrow(context, *this); - BATT_REQUIRE_OK(borrow_status); - - return {std::move(sibling)}; + return OkStatus(); } BATT_CHECK_LT(this->get_max_key(), sibling->get_min_key()); @@ -240,105 +224,6 @@ StatusOr> InMemoryLeaf::try_merge( this->set_edit_size_totals(context.compute_running_total(*this->result_set)); - return nullptr; -} - -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -Status InMemoryLeaf::try_borrow(BatchUpdateContext& context, InMemoryLeaf& sibling) noexcept -{ - const usize orig_edit_count = sibling.result_set->size(); - BATT_CHECK_EQ(sibling.result_set->size() + 1, sibling.edit_size_totals->size()); - - // Calculate the minimum number of bytes we would need to borrow from the sibling to make this - // leaf viable. By definition, we need to get this leaf to be at least a quarter full. - // - const usize min_bytes_to_borrow = (this->tree_options.flush_size() / 4) - this->get_items_size(); - usize n_to_borrow = 0; - - const auto borrow_edits = [&context, &n_to_borrow]( - const auto& src_begin, - const auto& src_end, - MergeCompactor::ResultSet& dst_result_set) -> void { - std::vector buffer; - buffer.reserve(n_to_borrow); - { - batt::ScopedWorkContext work_context{context.worker_pool}; - - const ParallelAlgoDefaults& algo_defaults = parallel_algo_defaults(); - const batt::TaskCount max_tasks{context.worker_pool.size() + 1}; - - parallel_copy(work_context, - src_begin, - src_end, - buffer.data(), - /*min_task_size = */ algo_defaults.copy_edits.min_task_size, - /*max_tasks = */ max_tasks); - } - const ItemView* first_edit = (const ItemView*)buffer.data(); - dst_result_set.append(std::move(buffer), as_slice(first_edit, n_to_borrow)); - }; - - if (this->get_max_key() < sibling.get_min_key()) { - // If the sibling is the right sibling, we borrow from the front of the sibling. - // - for (n_to_borrow = 1; n_to_borrow <= orig_edit_count; ++n_to_borrow) { - usize bytes = (*sibling.edit_size_totals)[n_to_borrow] - sibling.edit_size_totals->front(); - if (bytes >= min_bytes_to_borrow) { - break; - } - } - - // The number of edits being borrowed should always be less than the original edit count of - // the sibling, since borrowing everything is a full merge. - // - BATT_CHECK_LT(n_to_borrow, orig_edit_count); - - auto src_begin = sibling.result_set->get().begin(); - auto src_end = src_begin + n_to_borrow; - - // Copy over edits into this leaf's result_set. - // - borrow_edits(src_begin, src_end, *this->result_set); - - sibling.result_set->drop_before_n(n_to_borrow); - sibling.edit_size_totals->drop_front(n_to_borrow); - } else { - // If the sibling is the left sibling, we borrow from the back of the sibling. - // - for (n_to_borrow = 1; n_to_borrow <= orig_edit_count; ++n_to_borrow) { - usize bytes = sibling.edit_size_totals->back() - - (*sibling.edit_size_totals)[orig_edit_count - n_to_borrow]; - if (bytes >= min_bytes_to_borrow) { - break; - } - } - - BATT_CHECK_LT(n_to_borrow, orig_edit_count); - - usize new_edit_count = orig_edit_count - n_to_borrow; - - auto src_begin = sibling.result_set->get().begin() + new_edit_count; - auto src_end = sibling.result_set->get().end(); - - // Copy over the edits to be borrowed into an intermediary ResultSet and concat it with - // this leaf's current result_set. - // - MergeCompactor::ResultSet items_to_prepend; - - borrow_edits(src_begin, src_end, items_to_prepend); - - this->result_set = MergeCompactor::ResultSet::concat(std::move(items_to_prepend), - std::move(*this->result_set)); - - sibling.result_set->drop_after_n(new_edit_count); - sibling.edit_size_totals->drop_back(n_to_borrow); - } - - this->set_edit_size_totals(context.compute_running_total(*this->result_set)); - - BATT_CHECK(batt::is_case(sibling.get_viability())); - return OkStatus(); } diff --git a/src/turtle_kv/tree/in_memory_leaf.hpp b/src/turtle_kv/tree/in_memory_leaf.hpp index 61df84a..0df356c 100644 --- a/src/turtle_kv/tree/in_memory_leaf.hpp +++ b/src/turtle_kv/tree/in_memory_leaf.hpp @@ -105,10 +105,7 @@ struct InMemoryLeaf { StatusOr make_split_plan() const; - StatusOr> try_merge(BatchUpdateContext& context, - std::unique_ptr sibling) noexcept; - - Status try_borrow(BatchUpdateContext& context, InMemoryLeaf& sibling) noexcept; + Status try_merge(BatchUpdateContext& context, std::unique_ptr sibling) noexcept; Status apply_batch_update(BatchUpdate& update) noexcept; diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 726d801..7aa4706 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -737,38 +737,17 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // rightmost child in the node. // i32 sibling_i = pivot_i; - i32 right_sibling = pivot_i + 1; - i32 left_sibling = pivot_i - 1; - bool need_update_buffer_compaction = false; - u64 active_segmented_levels = this->update_buffer.compute_active_pivots(); if (pivot_i == 0) { - sibling_i = right_sibling; - } else if ((usize)pivot_i == this->pivot_count() - 1) { - sibling_i = left_sibling; + sibling_i = pivot_i + 1; + } else if (pivot_i == this->pivot_count() - 1) { + sibling_i = pivot_i - 1; } else { - // If we don't have one of the edge cases, try and pick the sibling where the leftmost of - // {child, sibling} is inactive in all segmented levels. This way, the final merged pivot - // won't have on/off flushed ranges in segments. If this is not possible, pick the right - // sibling. - // - if (!get_bit(active_segmented_levels, pivot_i)) { - sibling_i = right_sibling; - } else { - if (!get_bit(active_segmented_levels, left_sibling)) { - sibling_i = left_sibling; - } else { - sibling_i = right_sibling; - } - } - } - - BATT_CHECK_NE(pivot_i, sibling_i); - if (get_bit(active_segmented_levels, std::min(pivot_i, sibling_i))) { - need_update_buffer_compaction = true; + sibling_i = pivot_i + 1; } BATT_REQUIRE_OK(this->children[sibling_i].unpack_if_necessary(update_context.page_loader, + update_context.overcommit, update_context.worker_pool, this->tree_options, this->height - 1)); @@ -781,76 +760,19 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // Call Subtree::try_merge. // - StatusOr> status_or_merged = - this->children[left_pivot_i].try_merge(update_context, - std::move(this->children[right_pivot_i])); - if (!status_or_merged.ok()) { - LOG(ERROR) << BATT_INSPECT(status_or_merged.status()); - } - BATT_REQUIRE_OK(status_or_merged); - - if (*status_or_merged) { - // If try_merge returned a Subtree, a borrow occurred. - // - this->child_pages[left_pivot_i] = llfs::PinnedPage{}; - this->child_pages[right_pivot_i] = llfs::PinnedPage{}; - - // A borrow would have returned the updated right sibling (left sibling was updated in place), - // so overwrite what is currently in this->children. - // - this->children[right_pivot_i] = std::move(**status_or_merged); - - if ((usize)right_pivot_i == old_pivot_count - 1) { - BATT_ASSIGN_OK_RESULT( - this->max_key_, - this->children.back().get_max_key(update_context.page_loader, this->child_pages.back())); - } - - // Compute and store the new pivot key. - // - StatusOr right_child_min_key = - this->children[right_pivot_i].get_min_key(update_context.page_loader, - this->child_pages[right_pivot_i]); - BATT_REQUIRE_OK(right_child_min_key); - StatusOr left_child_max_key = - this->children[left_pivot_i].get_max_key(update_context.page_loader, - this->child_pages[left_pivot_i]); - BATT_REQUIRE_OK(left_child_max_key); - - const KeyView prefix = llfs::find_common_prefix(0, *left_child_max_key, *right_child_min_key); - const KeyView new_pivot_key = right_child_min_key->substr(0, prefix.size() + 1); - this->pivot_keys_[right_pivot_i] = new_pivot_key; - - // Compact the update buffer levels and recompute pending byte counts. - // - BATT_REQUIRE_OK(this->compact_update_buffer_levels(update_context)); - - BATT_CHECK_EQ(this->update_buffer.levels.size(), 1); - BATT_CHECK(batt::is_case(this->update_buffer.levels[0])); - MergedLevel& merged_edits = std::get(this->update_buffer.levels[0]); - - std::fill(this->pending_bytes.begin(), this->pending_bytes.end(), 0); - in_node(*this).update_pending_bytes(update_context.worker_pool, - merged_edits.result_set.get(), - PackedSizeOfEdit{}); - - return OkStatus(); - } + BATT_REQUIRE_OK(this->children[left_pivot_i].try_merge(update_context, + std::move(this->children[right_pivot_i]))); this->child_pages[left_pivot_i] = llfs::PinnedPage{}; this->child_pages.erase(this->child_pages.begin() + right_pivot_i); // Update the update_buffer levels. // - if (need_update_buffer_compaction) { - BATT_REQUIRE_OK(this->compact_update_buffer_levels(update_context)); - } else { - for (Level& level : this->update_buffer.levels) { - if (batt::is_case(level)) { - SegmentedLevel& segmented_level = std::get(level); - in_segmented_level(*this, segmented_level, update_context.page_loader) - .merge_pivots(left_pivot_i, right_pivot_i); - } + for (Level& level : this->update_buffer.levels) { + if (batt::is_case(level)) { + SegmentedLevel& segmented_level = std::get(level); + in_segmented_level(*this, segmented_level, update_context.page_loader) + .merge_pivots(left_pivot_i, right_pivot_i); } } @@ -894,29 +816,9 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr> InMemoryNode::try_merge( - BatchUpdateContext& context, - std::unique_ptr sibling) noexcept +Status InMemoryNode::try_merge(BatchUpdateContext& context, + std::unique_ptr sibling) noexcept { - //----- --- -- - - - - - // If merging both full nodes will cause the merged node's pivot count to exceed the max - // possible pivot count, try a borrow. - // - if (this->pivot_count() + sibling->pivot_count() > this->max_pivot_count()) { - bool borrow_from_sibling = false; - if (batt::is_case(this->get_viability())) { - borrow_from_sibling = true; - } else { - BATT_CHECK(batt::is_case(sibling->get_viability())); - } - - Status borrow_status = borrow_from_sibling ? this->try_borrow(context, *sibling) - : sibling->try_borrow(context, *this); - BATT_REQUIRE_OK(borrow_status); - - return {std::move(sibling)}; - } - BATT_CHECK_LT(this->get_max_key(), sibling->get_min_key()); //----- --- -- - - - - @@ -994,7 +896,6 @@ StatusOr> InMemoryNode::try_merge( for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); ++segment_i) { Segment& segment = right_segmented_level.get_segment(segment_i); - segment.flushed_pivots <<= left_node_pivot_count; segment.active_pivots <<= left_node_pivot_count; } @@ -1003,6 +904,14 @@ StatusOr> InMemoryNode::try_merge( std::make_move_iterator(right_segmented_level.segments.begin()), std::make_move_iterator(right_segmented_level.segments.end())); + left_segmented_level.segments.erase( + std::unique(left_segmented_level.segments.begin(), + left_segmented_level.segments.end(), + [](const Segment& l, const Segment& r) { + return l.page_id_slot.page_id == r.page_id_slot.page_id; + }), + left_segmented_level.segments.end()); + return OkStatus(); })); } @@ -1027,7 +936,6 @@ StatusOr> InMemoryNode::try_merge( for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); ++segment_i) { Segment& segment = right_segmented_level.get_segment(segment_i); - segment.flushed_pivots <<= left_node_pivot_count; segment.active_pivots <<= left_node_pivot_count; } @@ -1065,215 +973,6 @@ StatusOr> InMemoryNode::try_merge( sibling->pivot_keys_.begin(), sibling->pivot_keys_.end()); - return nullptr; -} - -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -Status InMemoryNode::try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) noexcept -{ - BATT_CHECK(batt::is_case(sibling.get_viability())); - - bool right_sibling = this->get_max_key() < sibling.get_min_key(); - - BATT_CHECK_LT(this->pivot_count(), 4); - u32 num_pivots_to_borrow = 4 - this->pivot_count(); - - //----- --- -- - - - - - // Borrow node metadata. Modify all metadata right now except this->children, since modifying it - // will change the pivot count. - // - if (right_sibling) { - this->pending_bytes.insert(this->pending_bytes.end(), - sibling.pending_bytes.begin(), - sibling.pending_bytes.begin() + num_pivots_to_borrow); - sibling.pending_bytes.erase(sibling.pending_bytes.begin(), - sibling.pending_bytes.begin() + num_pivots_to_borrow); - - // Update this->pending_bytes_is_exact by placing the borrowed pending bytes bits from the - // right sibling directly after the pending bytes bits for this node. - // - u64 borrowed_bits = sibling.pending_bytes_is_exact & ((u64{1} << num_pivots_to_borrow) - 1); - u64 mask = ((u64{1} << num_pivots_to_borrow) - 1) << this->pivot_count(); - this->pending_bytes_is_exact = - (this->pending_bytes_is_exact & ~mask) | (borrowed_bits << this->pivot_count()); - sibling.pending_bytes_is_exact >>= num_pivots_to_borrow; - - // Get rid of the key upper bound in this node and insert the borrowed pivot keys, including - // one past num_pivots_to_borrow, to set the new key upper bound. - // - this->pivot_keys_.pop_back(); - this->pivot_keys_.insert(this->pivot_keys_.end(), - sibling.pivot_keys_.begin(), - sibling.pivot_keys_.begin() + num_pivots_to_borrow + 1); - sibling.pivot_keys_.erase(sibling.pivot_keys_.begin(), - sibling.pivot_keys_.begin() + num_pivots_to_borrow); - - this->child_pages.insert( - this->child_pages.end(), - std::make_move_iterator(sibling.child_pages.begin()), - std::make_move_iterator(sibling.child_pages.begin() + num_pivots_to_borrow)); - sibling.child_pages.erase(sibling.child_pages.begin(), - sibling.child_pages.begin() + num_pivots_to_borrow); - } else { - this->pending_bytes.insert(this->pending_bytes.begin(), - sibling.pending_bytes.end() - num_pivots_to_borrow, - sibling.pending_bytes.end()); - sibling.pending_bytes.erase(sibling.pending_bytes.end() - num_pivots_to_borrow, - sibling.pending_bytes.end()); - - // Shift this->pending_bytes_is_exact up by num_pivots_to_borrow, and place the borrowed - // pending bytes bits at the lowest order bits. - // - u64 borrowed_bits = - (sibling.pending_bytes_is_exact >> (sibling.pivot_count() - num_pivots_to_borrow)) & - ((u64{1} << num_pivots_to_borrow) - 1); - this->pending_bytes_is_exact <<= num_pivots_to_borrow; - this->pending_bytes_is_exact |= borrowed_bits; - u64 mask = ((u64{1} << num_pivots_to_borrow) - 1) - << (sibling.pivot_count() - num_pivots_to_borrow); - sibling.pending_bytes_is_exact &= ~mask; - - sibling.pivot_keys_.pop_back(); - this->pivot_keys_.insert(this->pivot_keys_.begin(), - sibling.pivot_keys_.end() - num_pivots_to_borrow, - sibling.pivot_keys_.end()); - sibling.pivot_keys_.erase(sibling.pivot_keys_.end() - num_pivots_to_borrow + 1, - sibling.pivot_keys_.end()); - - this->child_pages.insert( - this->child_pages.begin(), - std::make_move_iterator(sibling.child_pages.end() - num_pivots_to_borrow), - std::make_move_iterator(sibling.child_pages.end())); - sibling.child_pages.erase(sibling.child_pages.end() - num_pivots_to_borrow, - sibling.child_pages.end()); - } - - //----- --- -- - - - - - // Modify the update buffers of both `this` and `sibling`. - // Calculate the pivot range to borrow from the sibling, and then extract updates from the - // sibling's update buffer that contain this range. - // - i32 borrowed_min_pivot_i = -1; - KeyView borrowed_max_pivot_key; - if (right_sibling) { - borrowed_min_pivot_i = 0; - borrowed_max_pivot_key = sibling.get_pivot_key(num_pivots_to_borrow); - } else { - borrowed_min_pivot_i = sibling.pivot_count() - num_pivots_to_borrow; - borrowed_max_pivot_key = sibling.get_pivot_key(sibling.pivot_count()); - } - Interval borrowed_pivot_range{sibling.get_pivot_key(borrowed_min_pivot_i), - borrowed_max_pivot_key}; - - BatchUpdate borrowed_pivot_batch{ - .context = context, - .result_set = {}, - .edit_size_totals = None, - }; - - Status segment_load_status; - HasPageRefs has_page_refs{false}; - - BATT_ASSIGN_OK_RESULT( // - borrowed_pivot_batch.result_set, // - context.merge_compact_edits( // - /*max_key=*/borrowed_max_pivot_key, // - [&](MergeCompactor& compactor) -> Status { - sibling.push_levels_to_merge(compactor, - context.page_loader, - segment_load_status, - has_page_refs, - as_slice(sibling.update_buffer.levels), - /*min_pivot_i=*/borrowed_min_pivot_i, - /*only_pivot=*/false); - return OkStatus(); - })); - - BATT_REQUIRE_OK(segment_load_status); - - borrowed_pivot_batch.result_set.drop_key_range_half_open(Interval{ - borrowed_max_pivot_key, - sibling.key_upper_bound(), - }); - - borrowed_pivot_batch.edit_size_totals = None; - - // Adjust the update buffer levels metadata in the sibling now that the borrowed updates have - // been extracted. - // - usize remove_pivot_i = right_sibling ? 0 : sibling.pivot_count() - num_pivots_to_borrow; - for (Level& level : sibling.update_buffer.levels) { - batt::case_of( // - level, // - [](EmptyLevel&) { - // nothing to do - }, - [&](MergedLevel& merged_level) { - merged_level.result_set.drop_key_range_half_open(borrowed_pivot_range); - }, - [&](SegmentedLevel& segmented_level) { - for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { - Segment& segment = segmented_level.get_segment(segment_i); - // Iterate backwards, since calling `remove_bit` will shift the bitset. - // TODO [vsilai 12-6-2025]: consider writing a `remove_bits` function to modify the bit - // sets more efficiently? This would only be for active_pivots. - // - for (usize j = remove_pivot_i + num_pivots_to_borrow - 1; j >= remove_pivot_i; --j) { - segment.remove_pivot(j); - } - } - }); - } - - // Insert the borrowed updates into the update buffer. - // - BATT_REQUIRE_OK(this->update_buffer_insert(borrowed_pivot_batch)); - - usize insert_pivot_i = right_sibling ? this->pivot_count() : 0; - for (Level& level : this->update_buffer.levels) { - batt::case_of( // - level, // - [](EmptyLevel&) { - // nothing to do - }, - [&](MergedLevel& merged_level) { - // nothing to do - }, - [&](SegmentedLevel& segmented_level) { - for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { - Segment& segment = segmented_level.get_segment(segment_i); - for (usize j = insert_pivot_i; j < insert_pivot_i + num_pivots_to_borrow; ++j) { - segment.insert_pivot(j, /*is_active*/ false); - } - } - }); - } - - //----- --- -- - - - - - // Finally, update the children Subtree vector for both nodes. - // - if (right_sibling) { - this->children.insert(this->children.end(), - std::make_move_iterator(sibling.children.begin()), - std::make_move_iterator(sibling.children.begin() + num_pivots_to_borrow)); - sibling.children.erase(sibling.children.begin(), - sibling.children.begin() + num_pivots_to_borrow); - - BATT_ASSIGN_OK_RESULT( - this->max_key_, - this->children.back().get_max_key(context.page_loader, this->child_pages.back())); - } else { - this->children.insert(this->children.begin(), - std::make_move_iterator(sibling.children.end() - num_pivots_to_borrow), - std::make_move_iterator(sibling.children.end())); - sibling.children.erase(sibling.children.end() - num_pivots_to_borrow, sibling.children.end()); - - BATT_ASSIGN_OK_RESULT( - sibling.max_key_, - sibling.children.back().get_max_key(context.page_loader, sibling.child_pages.back())); - } - return OkStatus(); } @@ -2322,16 +2021,7 @@ void InMemoryNode::UpdateBuffer::Segment::remove_pivot(i32 pivot_i) this->check_invariants(__FILE__, __LINE__); }); - if (get_bit(this->flushed_pivots, pivot_i)) { - const i32 index = bit_rank(this->flushed_pivots, pivot_i); - BATT_ASSERT_GE(index, 0); - BATT_ASSERT_LT(index, this->flushed_item_upper_bound_.size()); - - this->flushed_item_upper_bound_.erase(this->flushed_item_upper_bound_.begin() + index); - } - this->active_pivots = remove_bit(this->active_pivots, pivot_i); - this->flushed_pivots = remove_bit(this->flushed_pivots, pivot_i); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 4c282d9..bb2b99f 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -623,14 +623,7 @@ struct InMemoryNode { * Returns nullptr if `sibling` is completely consumed; otherwise, returns the modified sibling * since a borrow occurred. */ - StatusOr> try_merge(BatchUpdateContext& context, - std::unique_ptr sibling) noexcept; - - /** \brief Attempts to make `this` (which needs a merge) viable by borrowing data - * from one of its siblings. Note that for this function, `sibling` does not have to be the right - * sibling. Both `this` and `sibling` are modified in place. - */ - Status try_borrow(BatchUpdateContext& context, InMemoryNode& sibling) noexcept; + Status try_merge(BatchUpdateContext& context, std::unique_ptr sibling) noexcept; /** \brief Splits the specified child, inserting a new pivot immediately after `pivot_i`. */ diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 49fe455..7d53d70 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -263,17 +263,17 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, }, [&](const NeedsMerge& needs_merge) { // Only perform a flush and shrink if the root has a single pivot. - // - if (!needs_merge.single_pivot) { - return OkStatus(); - } + // + if (!needs_merge.single_pivot) { + return OkStatus(); + } - Status status = new_subtree->flush_and_shrink(update.context); + Status status = new_subtree->flush_and_shrink(update.context); - if (!status.ok()) { - LOG(INFO) << "flush_and_shrink failed;" << BATT_INSPECT(needs_merge); - } - return status; + if (!status.ok()) { + LOG(INFO) << "flush_and_shrink failed;" << BATT_INSPECT(needs_merge); + } + return status; })); return OkStatus(); @@ -584,8 +584,7 @@ StatusOr> Subtree::try_split(BatchUpdateContext& context) //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // -StatusOr> Subtree::try_merge(BatchUpdateContext& context, - Subtree&& sibling) noexcept +Status Subtree::try_merge(BatchUpdateContext& context, Subtree&& sibling) noexcept { BATT_CHECK(!this->locked_.load()); @@ -605,14 +604,9 @@ StatusOr> Subtree::try_merge(BatchUpdateContext& context, auto& sibling_ptr = std::get(sibling.impl_); BATT_CHECK(sibling_ptr); - BATT_ASSIGN_OK_RESULT(PtrT merged_subtree, - in_memory->try_merge(context, std::move(sibling_ptr))); - - if (merged_subtree == nullptr) { - return Optional{None}; - } + BATT_REQUIRE_OK(in_memory->try_merge(context, std::move(sibling_ptr))); - return {Subtree{std::move(merged_subtree)}}; + return OkStatus(); }); } @@ -766,6 +760,7 @@ void Subtree::lock() //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // Status Subtree::unpack_if_necessary(llfs::PageLoader& page_loader, + llfs::PageCacheOvercommit& overcommit, batt::WorkerPool& worker_pool, const TreeOptions& tree_options, i32 height) noexcept @@ -786,6 +781,7 @@ Status Subtree::unpack_if_necessary(llfs::PageLoader& page_loader, llfs::PinPageToJob::kDefault, llfs::OkIfNotFound{false}, llfs::LruPriority{(height > 2) ? kNodeLruPriority : kLeafLruPriority}, + overcommit, }); BATT_REQUIRE_OK(status_or_pinned_page) << BATT_INSPECT(height); diff --git a/src/turtle_kv/tree/subtree.hpp b/src/turtle_kv/tree/subtree.hpp index 623d65d..4c16dbd 100644 --- a/src/turtle_kv/tree/subtree.hpp +++ b/src/turtle_kv/tree/subtree.hpp @@ -165,7 +165,7 @@ class Subtree * * If a borrow needs to occur, `this` is modified in place and the modified sibling is returned. */ - StatusOr> try_merge(BatchUpdateContext& context, Subtree&& sibling) noexcept; + Status try_merge(BatchUpdateContext& context, Subtree&& sibling) noexcept; /** \brief Attempt to make the root viable by flushing a batch. If nothing is available to * flush, returns batt::StatusCode::kUnavailable. @@ -210,6 +210,7 @@ class Subtree * place. If the Subtree is already an in-memory type, this function does nothing. */ Status unpack_if_necessary(llfs::PageLoader& page_loader, + llfs::PageCacheOvercommit& overcommit, batt::WorkerPool& worker_pool, const TreeOptions& tree_options, i32 height) noexcept; From 8fbfc0542bdcba70ed22fdd98104455cc6eca277 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 4 Mar 2026 09:56:00 -0500 Subject: [PATCH 21/48] More merge function refactoring --- src/turtle_kv/tree/algo/segments.hpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/turtle_kv/tree/algo/segments.hpp b/src/turtle_kv/tree/algo/segments.hpp index f264122..5bac28d 100644 --- a/src/turtle_kv/tree/algo/segments.hpp +++ b/src/turtle_kv/tree/algo/segments.hpp @@ -111,13 +111,9 @@ struct SegmentAlgorithms { template [[nodiscard]] void merge_pivots(i32 left_pivot, i32 right_pivot, const LevelT& level) { - BATT_CHECK(!this->segment_.is_pivot_active(left_pivot)); - - u32 new_flushed_upper_bound = this->segment_.get_flushed_item_upper_bound(level, right_pivot); - bool new_is_active = this->segment_.is_pivot_active(right_pivot); - - this->segment_.set_pivot_active(left_pivot, new_is_active); - this->segment_.set_flushed_item_upper_bound(left_pivot, new_flushed_upper_bound); + bool left_is_active = this->segment_.is_pivot_active(left_pivot); + bool right_is_active = this->segment_.is_pivot_active(right_pivot); + this->segment_.set_pivot_active(left_pivot, left_is_active | right_is_active); this->segment_.remove_pivot(right_pivot); } From 06ae727e4d1aad399aafda59ec4344065cc8dda2 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 9 Mar 2026 09:08:40 -0400 Subject: [PATCH 22/48] Fix compiler errors --- src/turtle_kv/change_log_writer.cpp | 2 +- src/turtle_kv/tree/in_memory_node.cpp | 4 +++- src/turtle_kv/tree/in_memory_node.hpp | 7 +++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/turtle_kv/change_log_writer.cpp b/src/turtle_kv/change_log_writer.cpp index 66f5ca0..1d01a4c 100644 --- a/src/turtle_kv/change_log_writer.cpp +++ b/src/turtle_kv/change_log_writer.cpp @@ -255,7 +255,7 @@ void ChangeLogWriter::writer_task_main() noexcept // appending; in this case, enter our timed polling loop. // const i64 delay_usec = pick_delay_usec(rng); - batt::Task::sleep(boost::posix_time::microseconds(delay_usec)); + batt::Task::sleep(std::chrono::microseconds(delay_usec)); if (this->halt_requested_.load()) { return batt::OkStatus(); diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index e6adba9..c94794f 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -1722,7 +1722,9 @@ void InMemoryNode::UpdateBuffer::Segment::insert_pivot(i32 pivot_i, bool is_acti }); std::array active_pivots_out = - insert_bit({this->active_pivots, this->active_pivots_overflow}, pivot_i, is_active); + insert_bit(std::array{this->active_pivots, this->active_pivots_overflow}, + pivot_i, + is_active); this->active_pivots = active_pivots_out[0]; this->active_pivots_overflow = active_pivots_out[1]; } diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index b3992de..0f54681 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -129,7 +129,9 @@ struct InMemoryNode { void set_pivot_active(i32 pivot_i, bool active) { std::array active_pivots_out = - set_bit({this->active_pivots, this->active_pivots_overflow}, pivot_i, active); + set_bit(std::array{this->active_pivots, this->active_pivots_overflow}, + pivot_i, + active); this->active_pivots = active_pivots_out[0]; this->active_pivots_overflow = active_pivots_out[1]; } @@ -138,7 +140,8 @@ struct InMemoryNode { */ bool is_pivot_active(i32 pivot_i) const { - return get_bit({this->active_pivots, this->active_pivots_overflow}, pivot_i); + return get_bit(std::array{this->active_pivots, this->active_pivots_overflow}, + pivot_i); } template From 31b8118c9d93030ecee784bb51e10cb2e7bbf692 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 10 Mar 2026 09:39:23 -0400 Subject: [PATCH 23/48] Upgrade batteries and llfs --- conanfile.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conanfile.py b/conanfile.py index df05462..41e98e6 100644 --- a/conanfile.py +++ b/conanfile.py @@ -56,10 +56,10 @@ def requirements(self): } self.requires("abseil/20250127.0", **VISIBLE, **OVERRIDE) - self.requires("batteries/[>=0.61.0 <1]", **VISIBLE, **OVERRIDE) + self.requires("batteries/[>=0.65.0 <1]", **VISIBLE, **OVERRIDE) self.requires("boost/1.88.0", **VISIBLE, **OVERRIDE) self.requires("glog/0.7.1", **VISIBLE) - self.requires("llfs/[>=0.43.3 <1]", **VISIBLE) + self.requires("llfs/[>=0.43.4 <1]", **VISIBLE) self.requires("pcg-cpp/cci.20220409", **VISIBLE) self.requires("zlib/1.3.1", **OVERRIDE) From 54e1a5c984453f9a0dcdc61785758ae809f76ecc Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 10 Mar 2026 10:09:22 -0400 Subject: [PATCH 24/48] Fix try_split code to use new batteries overloads --- src/turtle_kv/tree/in_memory_node.cpp | 25 +++---------------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index c94794f..180ebe3 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -1118,28 +1118,9 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda BATT_CHECK_EQ(orig_pivot_count + 1, orig_pivot_keys.size()); - u64 tried_already = 0; - u64 tried_already_overflow = 0; + std::array tried_already = {0, 0}; usize split_pivot_i = (orig_pivot_count + 1) / 2; - const auto get_tried_bit = [&tried_already, &tried_already_overflow](usize i) -> bool { - if (i < 64) { - return get_bit(tried_already, i); - } else { - const i32 overflow_i = i - 64; - return get_bit(tried_already_overflow, overflow_i); - } - }; - - auto set_tried_bit = [&tried_already, &tried_already_overflow](usize i) { - if (i < 64) { - tried_already = set_bit(tried_already, i, true); - } else { - const i32 overflow_i = i - 64; - tried_already_overflow = set_bit(tried_already_overflow, overflow_i, true); - } - }; - auto* node_lower_half = this; auto node_upper_half = std::make_unique(batt::make_copy(this->pinned_node_page_), this->tree_options, @@ -1152,10 +1133,10 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda for (;;) { // If we ever try the same split point a second time, fail. // - if (get_tried_bit(split_pivot_i)) { + if (get_bit(tried_already, split_pivot_i)) { return {batt::StatusCode::kInternal}; } - set_tried_bit(split_pivot_i); + tried_already = set_bit(tried_already, split_pivot_i, true); //+++++++++++-+-+--+----- --- -- - - - - From 0a6d0054c7dc276dd973682303b35e5006efde94 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 10 Mar 2026 10:28:10 -0400 Subject: [PATCH 25/48] Fix test code to use new batteries overloads --- src/turtle_kv/tree/testing/fake_segment.hpp | 36 ++++++++------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/src/turtle_kv/tree/testing/fake_segment.hpp b/src/turtle_kv/tree/testing/fake_segment.hpp index aa0d261..54f201e 100644 --- a/src/turtle_kv/tree/testing/fake_segment.hpp +++ b/src/turtle_kv/tree/testing/fake_segment.hpp @@ -55,36 +55,28 @@ struct FakeSegment { bool is_pivot_active(i32 pivot_i) const { - if (pivot_i < 64) { - return get_bit(this->active_pivots_, pivot_i); - } else { - return get_bit(this->active_pivots_overflow_, pivot_i - 64); - } + return get_bit(std::array{this->active_pivots_, this->active_pivots_overflow_}, + pivot_i); } void set_pivot_active(i32 pivot_i, bool active) { - if (pivot_i < 64) { - this->active_pivots_ = set_bit(this->active_pivots_, pivot_i, active); - } else { - this->active_pivots_overflow_ = set_bit(this->active_pivots_overflow_, pivot_i - 64, active); - } + std::array active_pivots_out = + set_bit(std::array{this->active_pivots_, this->active_pivots_overflow_}, + pivot_i, + active); + this->active_pivots_ = active_pivots_out[0]; + this->active_pivots_overflow_ = active_pivots_out[1]; } void insert_active_pivot(usize pivot_i, bool is_active = true) { - if (pivot_i < 64) { - // Insert the highest bit from active_pivots to the overflow bit set. - // - this->active_pivots_overflow_ = - (this->active_pivots_overflow_ << 1) | ((this->active_pivots_ >> 63) & u64{1}); - - this->active_pivots_ = insert_bit(this->active_pivots_, pivot_i, is_active); - } else { - const i32 overflow_i = pivot_i - 64; - this->active_pivots_overflow_ = - insert_bit(this->active_pivots_overflow_, overflow_i, is_active); - } + std::array active_pivots_out = + insert_bit(std::array{this->active_pivots_, this->active_pivots_overflow_}, + pivot_i, + is_active); + this->active_pivots_ = active_pivots_out[0]; + this->active_pivots_overflow_ = active_pivots_out[1]; } void set_pivot_items_count(usize pivot_i, usize count) From 67a5c46ffd1cf27a7134e40fdf3bf87e731fee35 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Tue, 10 Mar 2026 12:44:53 -0400 Subject: [PATCH 26/48] Start adding ActivePivotSet concept. --- src/turtle_kv/tree/active_pivots_set.hpp | 148 +++++++++++++++++++ src/turtle_kv/tree/algo/segmented_levels.hpp | 15 +- src/turtle_kv/tree/in_memory_node.cpp | 2 +- src/turtle_kv/tree/in_memory_node.hpp | 20 ++- src/turtle_kv/tree/packed_node_page.cpp | 5 +- src/turtle_kv/tree/packed_node_page.hpp | 13 +- 6 files changed, 175 insertions(+), 28 deletions(-) create mode 100644 src/turtle_kv/tree/active_pivots_set.hpp diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp new file mode 100644 index 0000000..2869cb7 --- /dev/null +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -0,0 +1,148 @@ +#pragma once +#define TURTLE_KV_TREE_ACTIVE_PIVOTS_SET_HPP + +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace turtle_kv { + +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// +template +concept ActivePivotsSet = requires(T pivots) { + // count + { std::declval().count() } -> std::convertible_to; + + // set + { pivots.set(std::declval(), std::declval()) } -> std::same_as; + + // get + { std::declval().get(std::declval()) } -> std::same_as; + + // first + { std::declval().first() } -> std::convertible_to; + + // last + { std::declval().last() } -> std::convertible_to; + + // printable + { + std::declval() << std::declval().printable() + } -> std::convertible_to; +}; + +template +concept HasActivePivotsSet = requires(const T& obj) { + { obj.get_active_pivots() } -> ActivePivotsSet; +}; + +class PackedActivePivotsSet64; + +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// +class ActivePivotsSet128 +{ + friend class PackedActivePivotsSet64; + + public: + BATT_ALWAYS_INLINE usize count() const + { + return bit_count(this->bit_set_); + } + + BATT_ALWAYS_INLINE void set(i32 i, bool v) + { + this->bit_set_ = set_bit(this->bit_set_, i, v); + } + + BATT_ALWAYS_INLINE bool get(i32 i) const + { + return get_bit(this->bit_set_, i); + } + + BATT_ALWAYS_INLINE i32 first() const + { + return first_bit(this->bit_set_); + } + + BATT_ALWAYS_INLINE i32 last() const + { + return last_bit(this->bit_set_); + } + + auto printable() const + { + return [this](std::ostream& out) { + out << std::bitset<64>{this->bit_set_[1]} << "," << std::bitset<64>{this->bit_set_[1]}; + }; + } + + private: + std::array bit_set_ = {0, 0}; +}; + +static_assert(ActivePivotsSet); + +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// +class PackedActivePivotsSet64 +{ + public: + /*implicit*/ PackedActivePivotsSet64(const ActivePivotsSet128& src) noexcept + : bit_set_{src.bit_set_[0]} + { + BATT_CHECK_EQ(src.bit_set_[1], 0); + } + + ActivePivotsSet128 unpack() const + { + ActivePivotsSet128 unpacked; + unpacked.bit_set_[0] = this->bit_set_; + return unpacked; + } + + BATT_ALWAYS_INLINE usize count() const + { + return bit_count(this->bit_set_); + } + + BATT_ALWAYS_INLINE void set(i32 i, bool v) + { + this->bit_set_ = set_bit(this->bit_set_, i, v); + } + + BATT_ALWAYS_INLINE bool get(i32 i) const + { + return get_bit(this->bit_set_, i); + } + + BATT_ALWAYS_INLINE i32 first() const + { + return first_bit(this->bit_set_); + } + + BATT_ALWAYS_INLINE i32 last() const + { + return last_bit(this->bit_set_); + } + + auto printable() const + { + return std::bitset<64>{this->bit_set_.value()}; + } + + private: + little_u64 bit_set_; +}; + +static_assert(ActivePivotsSet); + +} // namespace turtle_kv diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index f0ed498..7ba4f5d 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -92,21 +92,20 @@ inline i32 get_last_active_pivot(const CInterval& pivot_range) template using EnableIfHasActivePivotsBitset = std::enable_if_t< - std::is_convertible_v().get_active_pivots_with_overflow()), - std::array>>; + ActivePivotsSet>().get_active_pivots())>>; //----- --- -- - - - - -template > -inline i32 get_first_active_pivot(T&& segment) +template +inline i32 get_first_active_pivot(const T& segment) { - return first_bit(segment.get_active_pivots_with_overflow()); + return segment.get_active_pivots().first(); } -template > -inline i32 get_last_active_pivot(T&& segment) +template +inline i32 get_last_active_pivot(const T& segment) { - return last_bit(segment.get_active_pivots_with_overflow()); + return segment.get_active_pivots().last(); } //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 180ebe3..038104a 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -109,7 +109,7 @@ using PackedSegment = PackedUpdateBuffer::Segment; Segment& segment = segmented_level.segments[segment_i]; segment.page_id_slot = llfs::PageIdSlot::from_page_id(packed_segment.leaf_page_id.unpack()); - segment.active_pivots = packed_segment.active_pivots; + segment.active_pivots = packed_segment.active_pivots.unpack(); BATT_ASSIGN_OK_RESULT(segment.filter, packed_node.create_piecewise_filter(level_i, segment_i)); diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 0f54681..21282be 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -89,12 +90,13 @@ struct InMemoryNode { /** \brief A bit set of pivots in whose key range this segment contains items. Used for * pivots [0, 64). */ - u64 active_pivots = 0; + // u64 active_pivots = 0; + ActivePivotsSet128 active_pivots; /** \brief A bit set of pivots in whose key range this segment contains items. Used for * pivots 64 and greater. */ - u64 active_pivots_overflow = 0; + // u64 active_pivots_overflow = 0; /** \brief A filter over the flushed items in this segment. */ @@ -112,36 +114,32 @@ struct InMemoryNode { /** \brief Returns the active pivots bit set. */ - u64 get_active_pivots() const + auto get_active_pivots() const { return this->active_pivots; } /** \brief Returns the active pivots bit set, as well as the overflow bit set. */ +#if 0 std::array get_active_pivots_with_overflow() const { return {this->active_pivots, this->active_pivots_overflow}; } +#endif /** \brief Marks this segment as containing (or not) active keys addressed to `pivot_i`. */ void set_pivot_active(i32 pivot_i, bool active) { - std::array active_pivots_out = - set_bit(std::array{this->active_pivots, this->active_pivots_overflow}, - pivot_i, - active); - this->active_pivots = active_pivots_out[0]; - this->active_pivots_overflow = active_pivots_out[1]; + this->active_pivots.set(pivot_i, active); } /** \brief Returns true iff this segment has active keys addressed to `pivot_i`. */ bool is_pivot_active(i32 pivot_i) const { - return get_bit(std::array{this->active_pivots, this->active_pivots_overflow}, - pivot_i); + return this->active_pivots.get(pivot_i); } template diff --git a/src/turtle_kv/tree/packed_node_page.cpp b/src/turtle_kv/tree/packed_node_page.cpp index 09f1423..ae9d08d 100644 --- a/src/turtle_kv/tree/packed_node_page.cpp +++ b/src/turtle_kv/tree/packed_node_page.cpp @@ -146,8 +146,7 @@ PackedNodePage* build_node_page(const MutableBuffer& buffer, const InMemoryNode& dst_segment.leaf_page_id = llfs::PackedPageId::from(src_segment.page_id_slot.page_id); dst_segment.active_pivots = src_segment.get_active_pivots(); - BATT_CHECK_EQ(bit_count(src_segment.get_active_pivots()), - bit_count(dst_segment.active_pivots)); + BATT_CHECK_EQ(src_segment.get_active_pivots().count(), dst_segment.active_pivots.count()); dst_segment.filter_start = BATT_CHECKED_CAST(u16, segment_filters_offset); @@ -494,7 +493,7 @@ std::function PackedNodePage::dump() const (segment.filter_start.value() & PackedNodePage::kSegmentStartsFiltered) != 0; out << " - [" << std::setw(2) << std::setfill(' ') << i << "]:" << std::endl << " leaf_page_id: " << segment.leaf_page_id.unpack() << std::endl - << " active_pivots: " << std::bitset<64>{segment.active_pivots.value()} << std::endl + << " active_pivots: " << segment.active_pivots.printable() << std::endl << " filter_start: " << filter_start_i << std::endl << " starts_filtered: " << start_filtered << std::endl << std::endl; diff --git a/src/turtle_kv/tree/packed_node_page.hpp b/src/turtle_kv/tree/packed_node_page.hpp index 0c2513b..2fe0b3b 100644 --- a/src/turtle_kv/tree/packed_node_page.hpp +++ b/src/turtle_kv/tree/packed_node_page.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -126,26 +127,28 @@ struct PackedNodePage { }; struct Segment { - llfs::PackedPageId leaf_page_id; // +8 -> 8 - little_u64 active_pivots; // +8 -> 16 - little_u16 filter_start; // +2 -> 18 + llfs::PackedPageId leaf_page_id; // +8 -> 8 + PackedActivePivotsSet64 active_pivots; // +8 -> 16 + little_u16 filter_start; // +2 -> 18 //+++++++++++-+-+--+----- --- -- - - - - bool is_pivot_active(i32 pivot_i) const { - return get_bit(this->active_pivots, pivot_i); + return this->active_pivots.get(pivot_i); } - u64 get_active_pivots() const + PackedActivePivotsSet64 get_active_pivots() const { return this->active_pivots; } +#if 0 std::array get_active_pivots_with_overflow() const { return {this->active_pivots, u64{0}}; } +#endif llfs::PageId get_leaf_page_id() const { From ee66720dc27eb44a8220010e2c775c44d91a667f Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 10 Mar 2026 19:50:02 -0400 Subject: [PATCH 27/48] Finish writing ActivePivotsSet concept --- src/turtle_kv/tree/active_pivots_set.hpp | 107 +++++++++++------- src/turtle_kv/tree/algo/segmented_levels.hpp | 6 - src/turtle_kv/tree/algo/segments.hpp | 9 +- src/turtle_kv/tree/in_memory_node.cpp | 32 ++---- src/turtle_kv/tree/in_memory_node.hpp | 18 +-- .../tree/segmented_level_scanner.hpp | 12 +- .../tree/segmented_level_scanner.test.cpp | 6 +- src/turtle_kv/tree/sharded_level_scanner.hpp | 20 ++-- src/turtle_kv/tree/testing/fake_segment.hpp | 33 ++---- 9 files changed, 104 insertions(+), 139 deletions(-) diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index 2869cb7..50661fc 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -33,10 +33,16 @@ concept ActivePivotsSet = requires(T pivots) { // last { std::declval().last() } -> std::convertible_to; + // next + { std::declval().next(std::declval()) } -> std::convertible_to; + // printable { std::declval() << std::declval().printable() } -> std::convertible_to; + + // is_inactive + { std::declval().is_inactive() } -> std::same_as; }; template @@ -44,15 +50,20 @@ concept HasActivePivotsSet = requires(const T& obj) { { obj.get_active_pivots() } -> ActivePivotsSet; }; -class PackedActivePivotsSet64; - //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- // -class ActivePivotsSet128 +/** \brief Base class for active pivot bit set representation. Implements basic bit operations. + */ +template +class ActivePivotsSetBase { - friend class PackedActivePivotsSet64; - public: + BATT_ALWAYS_INLINE ActivePivotsSetBase() noexcept : bit_set_{} {} + + BATT_ALWAYS_INLINE explicit ActivePivotsSetBase(const Bitset& bit_set) : bit_set_{bit_set} + { + } + BATT_ALWAYS_INLINE usize count() const { return bit_count(this->bit_set_); @@ -78,69 +89,83 @@ class ActivePivotsSet128 return last_bit(this->bit_set_); } - auto printable() const + BATT_ALWAYS_INLINE i32 next(i32 i) const { - return [this](std::ostream& out) { - out << std::bitset<64>{this->bit_set_[1]} << "," << std::bitset<64>{this->bit_set_[1]}; - }; + return next_bit(this->bit_set_, i); } - private: - std::array bit_set_ = {0, 0}; + BATT_ALWAYS_INLINE void insert(i32 i, bool v) + { + this->bit_set_ = insert_bit(this->bit_set_, i, v); + } + + BATT_ALWAYS_INLINE bool is_inactive() const + { + return this->bit_set_ == Bitset{}; + } + + protected: + Bitset bit_set_; }; -static_assert(ActivePivotsSet); +class PackedActivePivotsSet64; //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- // -class PackedActivePivotsSet64 +/** \brief Representation of a 128-bit bit set of active pivots. + */ +class ActivePivotsSet128 : public ActivePivotsSetBase> { - public: - /*implicit*/ PackedActivePivotsSet64(const ActivePivotsSet128& src) noexcept - : bit_set_{src.bit_set_[0]} - { - BATT_CHECK_EQ(src.bit_set_[1], 0); - } + friend class PackedActivePivotsSet64; - ActivePivotsSet128 unpack() const + public: + BATT_ALWAYS_INLINE auto printable() const { - ActivePivotsSet128 unpacked; - unpacked.bit_set_[0] = this->bit_set_; - return unpacked; + return [this](std::ostream& out) { + out << std::bitset<64>{this->bit_set_[1]} << "," << std::bitset<64>{this->bit_set_[1]}; + }; } - BATT_ALWAYS_INLINE usize count() const + /** \brief Removes the specified number (`count`) pivots from the bit set. + */ + BATT_ALWAYS_INLINE void pop_front_pivots(i32 count) { - return bit_count(this->bit_set_); - } + if (count < 1) { + return; + } - BATT_ALWAYS_INLINE void set(i32 i, bool v) - { - this->bit_set_ = set_bit(this->bit_set_, i, v); + this->bit_set_[0] = (this->bit_set_[0] >> count) | (this->bit_set_[1] << (64 - count)); + this->bit_set_[1] >>= count; } +}; - BATT_ALWAYS_INLINE bool get(i32 i) const - { - return get_bit(this->bit_set_, i); - } +static_assert(ActivePivotsSet); - BATT_ALWAYS_INLINE i32 first() const +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// +/** \brief A packed representation of a 64-bit bit set of active pivots. + */ +class PackedActivePivotsSet64 : public ActivePivotsSetBase +{ + public: + BATT_ALWAYS_INLINE /*implicit*/ PackedActivePivotsSet64(const ActivePivotsSet128& src) noexcept + : ActivePivotsSetBase{little_u64{src.bit_set_[0]}} { - return first_bit(this->bit_set_); + BATT_CHECK_EQ(src.bit_set_[1], 0); } - BATT_ALWAYS_INLINE i32 last() const + BATT_ALWAYS_INLINE ActivePivotsSet128 unpack() const { - return last_bit(this->bit_set_); + ActivePivotsSet128 unpacked; + unpacked.bit_set_[0] = this->bit_set_; + unpacked.bit_set_[1] = 0; + return unpacked; } - auto printable() const + BATT_ALWAYS_INLINE auto printable() const { return std::bitset<64>{this->bit_set_.value()}; } - - private: - little_u64 bit_set_; }; static_assert(ActivePivotsSet); diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index 7ba4f5d..293f95d 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -88,12 +88,6 @@ inline i32 get_last_active_pivot(const CInterval& pivot_range) return pivot_range.upper_bound; } -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - - -template -using EnableIfHasActivePivotsBitset = std::enable_if_t< - ActivePivotsSet>().get_active_pivots())>>; - //----- --- -- - - - - template diff --git a/src/turtle_kv/tree/algo/segments.hpp b/src/turtle_kv/tree/algo/segments.hpp index 5db1e0b..68ba93e 100644 --- a/src/turtle_kv/tree/algo/segments.hpp +++ b/src/turtle_kv/tree/algo/segments.hpp @@ -117,16 +117,13 @@ struct SegmentAlgorithms { // they were when this function was entered, regardless of what `fn` may do to change the state // of the segment. // - const std::array observed_active_pivots = - this->segment_.get_active_pivots_with_overflow(); - - const i32 first_active_pivot = first_bit(observed_active_pivots); + const auto observed_active_pivots = this->segment_.get_active_pivots(); const i32 first_pivot_i = std::max(pivot_range.lower_bound, // - first_active_pivot); + observed_active_pivots.first()); for (i32 pivot_i = first_pivot_i; pivot_i < pivot_range.upper_bound; - pivot_i = next_bit(observed_active_pivots, pivot_i)) { + pivot_i = observed_active_pivots.next(pivot_i)) { BATT_INVOKE_LOOP_FN((fn, this->segment_, pivot_i)); } } diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 038104a..7d3f495 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -1553,7 +1553,7 @@ StatusOr MergedLevel::finish_serialize(const InMemoryNode& node, context.get_build_page_result(this->segment_future_ids_[segment_i])); segment.page_id_slot.page_id = pinned_leaf_page.page_id(); - segment.active_pivots = 0; + segment.active_pivots = {}; const PackedLeafPage& leaf_page = PackedLeafPage::view_of(pinned_leaf_page); @@ -1702,43 +1702,25 @@ void InMemoryNode::UpdateBuffer::Segment::insert_pivot(i32 pivot_i, bool is_acti this->check_invariants(__FILE__, __LINE__); }); - std::array active_pivots_out = - insert_bit(std::array{this->active_pivots, this->active_pivots_overflow}, - pivot_i, - is_active); - this->active_pivots = active_pivots_out[0]; - this->active_pivots_overflow = active_pivots_out[1]; + this->active_pivots.insert(pivot_i, is_active); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // void InMemoryNode::UpdateBuffer::Segment::pop_front_pivots(i32 count) { - if (count < 1) { - return; - } - BATT_CHECK_LT(count, 64); - // Before we modify the bit sets, make sure we aren't losing any active pivots. - // - const u64 mask = (u64{1} << count) - 1; - - BATT_CHECK_EQ(bit_count(mask), count); - BATT_CHECK_EQ((this->active_pivots & mask), u64{0}); - // Shift the active pivot sets down by count. // - this->active_pivots = - (this->active_pivots >> count) | (this->active_pivots_overflow << (64 - count)); - this->active_pivots_overflow = (this->active_pivots_overflow >> count); + this->active_pivots.pop_front_pivots(count); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // bool InMemoryNode::UpdateBuffer::Segment::is_inactive() const { - const bool inactive = (this->active_pivots == 0 && this->active_pivots_overflow == 0); + const bool inactive = this->active_pivots.is_inactive(); if (inactive) { Slice> filter_dropped_ranges = this->filter.dropped(); BATT_CHECK_EQ(filter_dropped_ranges.size(), 1); @@ -1800,14 +1782,14 @@ SmallFn InMemoryNode::UpdateBuffer::SegmentedLevel::dump() SmallFn InMemoryNode::UpdateBuffer::Segment::dump(bool multi_line) const { return [this, multi_line](std::ostream& out) { - auto active = std::bitset<64>{this->active_pivots}; if (multi_line) { out << "Segment:" << std::endl - << " active=" << active << std::endl + << " active=" << this->active_pivots.printable() << std::endl << " filter=" << this->filter.dump() << std::endl << std::endl; } else { - out << "Segment{.active=" << active << ", .filter=" << this->filter.dump() << ",}"; + out << "Segment{.active=" << this->active_pivots.printable() + << ", .filter=" << this->filter.dump() << ",}"; } }; } diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 21282be..461465d 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -87,17 +87,10 @@ struct InMemoryNode { */ llfs::PageIdSlot page_id_slot; - /** \brief A bit set of pivots in whose key range this segment contains items. Used for - * pivots [0, 64). + /** \brief A bit set of pivots in whose key range this segment contains items. */ - // u64 active_pivots = 0; ActivePivotsSet128 active_pivots; - /** \brief A bit set of pivots in whose key range this segment contains items. Used for - * pivots 64 and greater. - */ - // u64 active_pivots_overflow = 0; - /** \brief A filter over the flushed items in this segment. */ PiecewiseFilter filter; @@ -119,15 +112,6 @@ struct InMemoryNode { return this->active_pivots; } - /** \brief Returns the active pivots bit set, as well as the overflow bit set. - */ -#if 0 - std::array get_active_pivots_with_overflow() const - { - return {this->active_pivots, this->active_pivots_overflow}; - } -#endif - /** \brief Marks this segment as containing (or not) active keys addressed to `pivot_i`. */ void set_pivot_active(i32 pivot_i, bool active) diff --git a/src/turtle_kv/tree/segmented_level_scanner.hpp b/src/turtle_kv/tree/segmented_level_scanner.hpp index 3145447..4c3dd20 100644 --- a/src/turtle_kv/tree/segmented_level_scanner.hpp +++ b/src/turtle_kv/tree/segmented_level_scanner.hpp @@ -53,6 +53,7 @@ class SegmentedLevelScanner : private SegmentedLevelScannerBase using PageLoader = PageLoaderT; using PinnedPageT = typename PageLoader::PinnedPageT; using Segment = typename Level::Segment; + using ActivePivotsSetT = decltype(std::declval().get_active_pivots()); using Item = EditSlice; @@ -205,8 +206,8 @@ inline auto SegmentedLevelScanner::peek_next_impl(bo const Segment* segment = std::addressof(this->level_->get_segment(this->segment_i_)); - u64 active_pivots = segment->get_active_pivots(); - BATT_CHECK_NE(active_pivots, 0) << "This segment should have been dropped!"; + ActivePivotsSetT active_pivots = segment->get_active_pivots(); + BATT_CHECK(!active_pivots.is_inactive()) << "This segment should have been dropped!"; // Make sure we have a leaf page loaded. // @@ -215,7 +216,7 @@ inline auto SegmentedLevelScanner::peek_next_impl(bo // Skip ahead to the next segment that is active at or past the minimum pivot. // - while (last_bit(active_pivots) < this->min_pivot_i_) { + while (active_pivots.last() < this->min_pivot_i_) { ++this->segment_i_; if (this->segment_i_ == this->level_->segment_count()) { return None; @@ -238,9 +239,8 @@ inline auto SegmentedLevelScanner::peek_next_impl(bo this->pinned_leaf_ = std::move(*loaded_page); - i32 target_pivot_i = std::max(first_bit(active_pivots), this->min_pivot_i_); - while (target_pivot_i < (i32)this->node_->pivot_count() && - !get_bit(active_pivots, target_pivot_i)) { + i32 target_pivot_i = std::max(active_pivots.first(), this->min_pivot_i_); + while (target_pivot_i < (i32)this->node_->pivot_count() && !active_pivots.get(target_pivot_i)) { ++target_pivot_i; } diff --git a/src/turtle_kv/tree/segmented_level_scanner.test.cpp b/src/turtle_kv/tree/segmented_level_scanner.test.cpp index 9f499fa..1ffa411 100644 --- a/src/turtle_kv/tree/segmented_level_scanner.test.cpp +++ b/src/turtle_kv/tree/segmented_level_scanner.test.cpp @@ -281,8 +281,6 @@ TEST_F(SegmentedLevelScannerTest, Test) void SegmentedLevelScannerTest::Scenario::run_with_pivot_count(usize pivot_count) { - constexpr usize kMaxPivotCount = 64; - // Configure the test. // const bool debug_output = false; @@ -350,7 +348,7 @@ void SegmentedLevelScannerTest::Scenario::run_with_pivot_count(usize pivot_count if (debug_output) { std::cout << BATT_INSPECT(segment_i) << BATT_INSPECT(leaf_view.get_key_crange()) << "\t" - << std::bitset{fake_segment.get_active_pivots()} << " " + << fake_segment.get_active_pivots().printable() << " " << batt::dump_range(fake_segment.pivot_items_count_) << std::endl; } } @@ -424,7 +422,7 @@ void SegmentedLevelScannerTest::Scenario::run_with_pivot_count(usize pivot_count if (debug_output) { std::cout << std::setw(3) << segment_i - << ": active=" << std::bitset{segment.get_active_pivots()} + << ": active=" << segment.get_active_pivots().printable() << std::endl; } } diff --git a/src/turtle_kv/tree/sharded_level_scanner.hpp b/src/turtle_kv/tree/sharded_level_scanner.hpp index 0652efb..bdf3c65 100644 --- a/src/turtle_kv/tree/sharded_level_scanner.hpp +++ b/src/turtle_kv/tree/sharded_level_scanner.hpp @@ -67,6 +67,7 @@ class ShardedLevelScanner : private SegmentedLevelScannerBase using PageLoader = PageLoaderT; using PinnedPageT = typename PageLoader::PinnedPageT; using Segment = typename Level::Segment; + using ActivePivotsSetT = decltype(std::declval().get_active_pivots()); using Item = ShardedKeyValueSlice; @@ -143,11 +144,11 @@ class ShardedLevelScanner : private SegmentedLevelScannerBase void update_cached_items(usize prev_item_i); Optional continue_full_leaf_after_segment_check(bool advance, - u64 active_pivots, + ActivePivotsSetT active_pivots, const Segment* segment) noexcept; Optional continue_sharded_after_segment_check(bool advance, - u64 active_pivots, + ActivePivotsSetT active_pivots, const Segment* segment) noexcept; //+++++++++++-+-+--+----- --- -- - - - - @@ -241,9 +242,9 @@ inline auto ShardedLevelScanner::peek_next_impl(bool const Segment* segment = std::addressof(this->level_->get_segment(this->segment_i_)); - u64 active_pivots = segment->get_active_pivots(); + ActivePivotsSetT active_pivots = segment->get_active_pivots(); { - BATT_CHECK_NE(active_pivots, 0) << "This segment should have been dropped!"; + BATT_CHECK(!active_pivots.is_inactive()) << "This segment should have been dropped!"; } // Check for need to load new segment. @@ -253,7 +254,7 @@ inline auto ShardedLevelScanner::peek_next_impl(bool // Skip ahead to the next segment that is active at or past the minimum pivot. // - while (last_bit(active_pivots) < this->min_pivot_i_) { + while (active_pivots.last() < this->min_pivot_i_) { ++this->segment_i_; if (this->segment_i_ == this->level_->segment_count()) { return None; @@ -262,9 +263,8 @@ inline auto ShardedLevelScanner::peek_next_impl(bool active_pivots = segment->get_active_pivots(); } - i32 target_pivot_i = std::max(first_bit(active_pivots), this->min_pivot_i_); - while (target_pivot_i < (i32)this->node_->pivot_count() && - !get_bit(active_pivots, target_pivot_i)) { + i32 target_pivot_i = std::max(active_pivots.first(), this->min_pivot_i_); + while (target_pivot_i < (i32)this->node_->pivot_count() && !active_pivots.get(target_pivot_i)) { ++target_pivot_i; } @@ -342,7 +342,7 @@ inline auto ShardedLevelScanner::peek_next_impl(bool template inline auto ShardedLevelScanner::continue_sharded_after_segment_check( bool advance, - u64 active_pivots, + ActivePivotsSetT active_pivots, const Segment* segment) noexcept -> Optional { const void* page_start = this->head_shard_slice_.data(); @@ -417,7 +417,7 @@ inline auto ShardedLevelScanner::continue_sharded_af template inline auto ShardedLevelScanner::continue_full_leaf_after_segment_check( bool advance, - u64 active_pivots, + ActivePivotsSetT active_pivots, const Segment* segment) noexcept -> Optional { const PackedLeafPage& leaf_page = diff --git a/src/turtle_kv/tree/testing/fake_segment.hpp b/src/turtle_kv/tree/testing/fake_segment.hpp index 54f201e..1c6f1db 100644 --- a/src/turtle_kv/tree/testing/fake_segment.hpp +++ b/src/turtle_kv/tree/testing/fake_segment.hpp @@ -2,6 +2,8 @@ #include +#include + #include #include @@ -24,8 +26,7 @@ struct FakeLevel; // struct FakeSegment { llfs::PageId page_id_; - u64 active_pivots_ = 0; - u64 active_pivots_overflow_ = 0; + ActivePivotsSet128 active_pivots_ = {}; PiecewiseFilter filter_; std::map pivot_items_count_; @@ -43,40 +44,24 @@ struct FakeSegment { }); } - u64 get_active_pivots() const + auto get_active_pivots() const { return this->active_pivots_; } - std::pair get_active_pivots_with_overflow() const - { - return std::make_pair(this->active_pivots_, this->active_pivots_overflow_); - } - bool is_pivot_active(i32 pivot_i) const { - return get_bit(std::array{this->active_pivots_, this->active_pivots_overflow_}, - pivot_i); + return this->active_pivots_.get(pivot_i); } void set_pivot_active(i32 pivot_i, bool active) { - std::array active_pivots_out = - set_bit(std::array{this->active_pivots_, this->active_pivots_overflow_}, - pivot_i, - active); - this->active_pivots_ = active_pivots_out[0]; - this->active_pivots_overflow_ = active_pivots_out[1]; + this->active_pivots_.set(pivot_i, active); } void insert_active_pivot(usize pivot_i, bool is_active = true) { - std::array active_pivots_out = - insert_bit(std::array{this->active_pivots_, this->active_pivots_overflow_}, - pivot_i, - is_active); - this->active_pivots_ = active_pivots_out[0]; - this->active_pivots_overflow_ = active_pivots_out[1]; + this->active_pivots_.insert(pivot_i, is_active); } void set_pivot_items_count(usize pivot_i, usize count) @@ -101,13 +86,13 @@ struct FakeSegment { void clear_active_pivots() { - this->active_pivots_ = 0; + this->active_pivots_ = {}; this->pivot_items_count_.clear(); } bool is_inactive() const { - const bool inactive = (this->active_pivots_ == 0 && this->active_pivots_overflow_ == 0); + const bool inactive = this->active_pivots_.is_inactive(); if (inactive) { Slice> filter_dropped_ranges = this->filter_.dropped(); BATT_CHECK_EQ(filter_dropped_ranges.size(), 1); From 3872edf585c2360b72b422d2dbce97c0b2f00561 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 10 Mar 2026 20:01:41 -0400 Subject: [PATCH 28/48] Remove old code --- src/turtle_kv/tree/packed_node_page.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/turtle_kv/tree/packed_node_page.hpp b/src/turtle_kv/tree/packed_node_page.hpp index 2fe0b3b..0329c8e 100644 --- a/src/turtle_kv/tree/packed_node_page.hpp +++ b/src/turtle_kv/tree/packed_node_page.hpp @@ -143,13 +143,6 @@ struct PackedNodePage { return this->active_pivots; } -#if 0 - std::array get_active_pivots_with_overflow() const - { - return {this->active_pivots, u64{0}}; - } -#endif - llfs::PageId get_leaf_page_id() const { return this->leaf_page_id.unpack(); From ace4445288f94f8f3344309e9fb6d5b5ed9f4424 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 11 Mar 2026 10:54:05 -0400 Subject: [PATCH 29/48] Fix build issues --- src/turtle_kv/tree/active_pivots_set.hpp | 29 +++++++++++- src/turtle_kv/tree/in_memory_node.cpp | 41 ++++++----------- src/turtle_kv/tree/in_memory_node.hpp | 2 - src/turtle_kv/tree/in_memory_node.test.cpp | 51 +++++++++++++--------- src/turtle_kv/tree/subtree.cpp | 6 +-- 5 files changed, 75 insertions(+), 54 deletions(-) diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index 50661fc..a1f33e2 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -99,6 +99,11 @@ class ActivePivotsSetBase this->bit_set_ = insert_bit(this->bit_set_, i, v); } + BATT_ALWAYS_INLINE void remove(i32 i) + { + this->bit_set_ = remove_bit(this->bit_set_, i); + } + BATT_ALWAYS_INLINE bool is_inactive() const { return this->bit_set_ == Bitset{}; @@ -134,8 +139,28 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> return; } - this->bit_set_[0] = (this->bit_set_[0] >> count) | (this->bit_set_[1] << (64 - count)); - this->bit_set_[1] >>= count; + if (count < 64) { + this->bit_set_[0] = (this->bit_set_[0] >> count) | (this->bit_set_[1] << (64 - count)); + this->bit_set_[1] >>= count; + } else { + this->bit_set_[0] = this->bit_set_[1] >> (count - 64); + this->bit_set_[1] = 0; + } + } + + BATT_ALWAYS_INLINE void pop_back_pivots(i32 count) + { + if (count < 1) { + return; + } + + if (count < 64) { + this->bit_set_[1] = (this->bit_set_[1] << count) | (this->bit_set_[0] >> (64 - count)); + this->bit_set_[0] <<= count; + } else { + this->bit_set_[1] = this->bit_set_[0] << (count - 64); + this->bit_set_[0] = 0; + } } }; diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 12c56d3..d509bf3 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -741,7 +741,7 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i if (pivot_i == 0) { sibling_i = pivot_i + 1; - } else if (pivot_i == this->pivot_count() - 1) { + } else if ((usize)pivot_i == this->pivot_count() - 1) { sibling_i = pivot_i - 1; } else { sibling_i = pivot_i + 1; @@ -772,7 +772,10 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i for (Level& level : this->update_buffer.levels) { if (batt::is_case(level)) { SegmentedLevel& segmented_level = std::get(level); - in_segmented_level(*this, segmented_level, update_context.page_loader) + in_segmented_level(*this, + segmented_level, + update_context.page_loader, + update_context.overcommit) .merge_pivots(left_pivot_i, right_pivot_i); } } @@ -798,9 +801,10 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i this->pivot_keys_.erase(this->pivot_keys_.begin() + right_pivot_i); if ((usize)right_pivot_i == old_pivot_count - 1) { - BATT_ASSIGN_OK_RESULT( - this->max_key_, - this->children.back().get_max_key(update_context.page_loader, this->child_pages.back())); + BATT_ASSIGN_OK_RESULT(this->max_key_, + this->children.back().get_max_key(update_context.page_loader, + update_context.overcommit, + this->child_pages.back())); } // Finally, split the newly merged child if needed. @@ -897,8 +901,8 @@ Status InMemoryNode::try_merge(BatchUpdateContext& context, for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); ++segment_i) { Segment& segment = right_segmented_level.get_segment(segment_i); - segment.active_pivots <<= left_node_pivot_count; - } + segment.active_pivots.pop_back_pivots(left_node_pivot_count); + }; left_segmented_level.segments.insert( left_segmented_level.segments.end(), @@ -937,7 +941,7 @@ Status InMemoryNode::try_merge(BatchUpdateContext& context, for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); ++segment_i) { Segment& segment = right_segmented_level.get_segment(segment_i); - segment.active_pivots <<= left_node_pivot_count; + segment.active_pivots.pop_back_pivots(left_node_pivot_count); } this->update_buffer.levels.emplace_back(right_segmented_level); @@ -1973,6 +1977,7 @@ void InMemoryNode::UpdateBuffer::SegmentedLevel::check_items_sorted( segmented_level, context.page_loader, llfs::PinPageToJob::kDefault, + context.overcommit, segment_load_status, /*min_pivot_i=*/0} // | seq::boxed(); @@ -2022,7 +2027,7 @@ void InMemoryNode::UpdateBuffer::Segment::remove_pivot(i32 pivot_i) this->check_invariants(__FILE__, __LINE__); }); - this->active_pivots = remove_bit(this->active_pivots, pivot_i); + this->active_pivots.remove(pivot_i); } //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - @@ -2066,24 +2071,6 @@ SmallFn InMemoryNode::UpdateBuffer::dump() const }; } -//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - -// -u64 InMemoryNode::UpdateBuffer::compute_active_pivots() const -{ - u64 active_pivots = 0; - for (const Level& level : this->levels) { - if (batt::is_case(level)) { - const SegmentedLevel& segmented_level = std::get(level); - for (usize segment_i = 0; segment_i < segmented_level.segment_count(); ++segment_i) { - const Segment& segment = segmented_level.get_segment(segment_i); - active_pivots |= segment.get_active_pivots(); - } - } - } - - return active_pivots; -} - //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // SmallFn InMemoryNode::UpdateBuffer::EmptyLevel::dump() const diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index de5e31e..a5d7750 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -420,8 +420,6 @@ struct InMemoryNode { SmallFn dump() const; - u64 compute_active_pivots() const; - usize count_non_empty_levels() const { usize count = 0; diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index b5d03d4..4f8fbc1 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -135,16 +135,6 @@ void verify_table_point_queries(Table& expected_table, Table& actual_table, Rng& } } -void verify_deleted_point_queries(Table& expected_table, - Table& actual_table, - const std::vector& deleted_keys) -{ - for (const KeyView& key : deleted_keys) { - EXPECT_EQ(expected_table.get(key).status(), batt::StatusCode::kNotFound); - EXPECT_EQ(actual_table.get(key).status(), batt::StatusCode::kNotFound); - } -} - void verify_range_scan(LatencyMetric* scan_latency, Table& expected_table, const Slice>& actual_read_items, @@ -234,6 +224,14 @@ struct BatchUpdateGenerator { return result_set; } + + void verify_deleted_point_queries(Table& expected_table, Table& actual_table) + { + for (const KeyView& key : this->pending_deletes) { + EXPECT_EQ(expected_table.get(key).status(), batt::StatusCode::kNotFound); + EXPECT_EQ(actual_table.get(key).status(), batt::StatusCode::kNotFound); + } + } }; //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - @@ -348,7 +346,7 @@ void SubtreeBatchUpdateScenario::run() }, // TODO [vsilai 2026-01-09] Enable delete support for batch generation. // - .result_set = update_generator.next_batch(i, rng, /*update_pending_deletes=*/false), + .result_set = update_generator.next_batch(i, rng, /*update_pending_deletes=*/true), .edit_size_totals = None, }; update.update_edit_size_totals(); @@ -386,8 +384,8 @@ void SubtreeBatchUpdateScenario::run() << BATT_INSPECT(this->seed) << BATT_INSPECT(i); ASSERT_NO_FATAL_FAILURE( - verify_deleted_point_queries(expected_table, actual_table, pending_deletes)) - << BATT_INSPECT(this->seed) << BATT_INSPECT(i); + update_generator.verify_deleted_point_queries(expected_table, actual_table)) + << BATT_INSPECT(this->seed) << BATT_INSPECT(i); if (((i + 1) % chi) == 0) { if (my_id == 0) { @@ -424,7 +422,7 @@ void SubtreeBatchUpdateScenario::run() << BATT_INSPECT(this->seed) << BATT_INSPECT(i); ASSERT_NO_FATAL_FAILURE( - verify_deleted_point_queries(expected_table, actual_table, pending_deletes)) + update_generator.verify_deleted_point_queries(expected_table, actual_table)) << BATT_INSPECT(this->seed) << BATT_INSPECT(i); { @@ -533,6 +531,7 @@ TEST(InMemoryNodeTest, SubtreeDeletions) SubtreeTable actual_table{*page_cache, tree_options, tree}; batt::WorkerPool& worker_pool = batt::WorkerPool::null_pool(); + turtle_kv::BatchUpdateMetrics metrics; Optional page_loader{*page_cache}; @@ -580,6 +579,8 @@ TEST(InMemoryNodeTest, SubtreeDeletions) .worker_pool = worker_pool, .page_loader = *page_loader, .cancel_token = batt::CancelToken{}, + .metrics = metrics, + .overcommit = llfs::PageCacheOvercommit::not_allowed(), }, .result_set = std::move(result), .edit_size_totals = None, @@ -589,7 +590,8 @@ TEST(InMemoryNodeTest, SubtreeDeletions) Status table_update_status = update_table(expected_table, update.result_set); ASSERT_TRUE(table_update_status.ok()) << BATT_INSPECT(table_update_status); - StatusOr tree_height_before = tree.get_height(*page_loader); + StatusOr tree_height_before = + tree.get_height(*page_loader, llfs::PageCacheOvercommit::not_allowed()); ASSERT_TRUE(tree_height_before.ok()) << BATT_INSPECT(tree_height_before); Status status = // @@ -601,7 +603,8 @@ TEST(InMemoryNodeTest, SubtreeDeletions) ASSERT_TRUE(status.ok()) << BATT_INSPECT(status) << BATT_INSPECT(i); - StatusOr tree_height_after = tree.get_height(*page_loader); + StatusOr tree_height_after = + tree.get_height(*page_loader, llfs::PageCacheOvercommit::not_allowed()); ASSERT_TRUE(tree_height_after.ok()) << BATT_INSPECT(tree_height_after); if (*tree_height_after == 0) { @@ -620,7 +623,10 @@ TEST(InMemoryNodeTest, SubtreeDeletions) if (((i + 1) % chi) == 0) { std::unique_ptr page_job = page_cache->new_job(); - TreeSerializeContext context{tree_options, *page_job, worker_pool}; + TreeSerializeContext context{tree_options, + *page_job, + worker_pool, + llfs::PageCacheOvercommit::not_allowed()}; Status start_status = tree.start_serialize(context); ASSERT_TRUE(start_status.ok()) << BATT_INSPECT(start_status); @@ -647,12 +653,16 @@ TEST(InMemoryNodeTest, SubtreeDeletions) std::array, kMaxScanSize> scan_items_buffer; KeyView min_key = update.result_set.get_min_key(); + PageSliceStorage page_slice_storage; + KVStoreScanner kv_scanner{*page_loader, root_ptr->page_id_slot_or_panic(), - BATT_OK_RESULT_OR_PANIC(root_ptr->get_height(*page_loader)), + BATT_OK_RESULT_OR_PANIC(root_ptr->get_height( + *page_loader, // + llfs::PageCacheOvercommit::not_allowed())), min_key, tree_options.trie_index_sharded_view_size(), - None}; + &page_slice_storage}; usize n_read = 0; { @@ -689,7 +699,8 @@ TEST(InMemoryNodeTest, SubtreeDeletions) LOG(INFO) << "Deleting key/value pairs from tree..."; for (usize i = 0; i < total_batches; ++i) { bool perform_scan = i == 0 ? true : false; - StatusOr tree_height = tree.get_height(*page_loader); + StatusOr tree_height = + tree.get_height(*page_loader, llfs::PageCacheOvercommit::not_allowed()); ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); if (*tree_height > 0) { apply_tree_updates(create_deletion_batch, perform_scan); diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 7d53d70..3811465 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -268,7 +268,7 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, return OkStatus(); } - Status status = new_subtree->flush_and_shrink(update.context); + Status status = new_subtree.flush_and_shrink(update_context); if (!status.ok()) { LOG(INFO) << "flush_and_shrink failed;" << BATT_INSPECT(needs_merge); @@ -591,13 +591,13 @@ Status Subtree::try_merge(BatchUpdateContext& context, Subtree&& sibling) noexce return batt::case_of( this->impl_, - [&](const llfs::PageIdSlot& page_id_slot) -> StatusOr> { + [&](const llfs::PageIdSlot& page_id_slot) -> Status { BATT_PANIC() << "Cannot try merging a serialized subtree!"; return {batt::StatusCode::kUnimplemented}; }, - [&](auto& in_memory) -> StatusOr> { + [&](auto& in_memory) -> Status { using PtrT = std::decay_t; BATT_CHECK(batt::is_case(sibling.impl_)); From 424b4d72565a07e38e1b9dcfee4122df80952b30 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 11 Mar 2026 11:08:05 -0400 Subject: [PATCH 30/48] Fix pop_front_pivots function --- src/turtle_kv/tree/active_pivots_set.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index 50661fc..55065f7 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -134,8 +134,13 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> return; } - this->bit_set_[0] = (this->bit_set_[0] >> count) | (this->bit_set_[1] << (64 - count)); - this->bit_set_[1] >>= count; + if (count < 64) { + this->bit_set_[0] = (this->bit_set_[0] >> count) | (this->bit_set_[1] << (64 - count)); + this->bit_set_[1] >>= count; + } else { + this->bit_set_[0] = this->bit_set_[1] >> (count - 64); + this->bit_set_[1] = 0; + } } }; From c8950faf80347c88f387d040fb97cc5cb745ab33 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 13:26:30 -0400 Subject: [PATCH 31/48] Modify the ActivePivotsSet concepts a bit. --- src/turtle_kv/tree/active_pivots_set.hpp | 50 ++++++++++++------- src/turtle_kv/tree/algo/segmented_levels.hpp | 4 +- src/turtle_kv/tree/in_memory_node.cpp | 2 +- src/turtle_kv/tree/in_memory_node.hpp | 2 +- .../tree/segmented_level_scanner.hpp | 2 +- src/turtle_kv/tree/sharded_level_scanner.hpp | 2 +- src/turtle_kv/tree/testing/fake_segment.hpp | 4 +- 7 files changed, 39 insertions(+), 27 deletions(-) diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index 55065f7..e3cf986 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -17,37 +17,47 @@ namespace turtle_kv { //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- // template -concept ActivePivotsSet = requires(T pivots) { +concept ConstActivePivotsSet = requires(const T& obj, i32 index, bool value, std::ostream& out) { // count - { std::declval().count() } -> std::convertible_to; - - // set - { pivots.set(std::declval(), std::declval()) } -> std::same_as; + { obj.count() } -> std::convertible_to; // get - { std::declval().get(std::declval()) } -> std::same_as; + { obj.get(index) } -> std::convertible_to; // first - { std::declval().first() } -> std::convertible_to; + { obj.first() } -> std::convertible_to; // last - { std::declval().last() } -> std::convertible_to; + { obj.last() } -> std::convertible_to; // next - { std::declval().next(std::declval()) } -> std::convertible_to; + { obj.next(index) } -> std::convertible_to; // printable - { - std::declval() << std::declval().printable() - } -> std::convertible_to; + { out << obj.printable() } -> std::convertible_to; + + // is_empty + { obj.is_empty() } -> std::convertible_to; +}; - // is_inactive - { std::declval().is_inactive() } -> std::same_as; +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// +template +concept MutableActivePivotsSet = requires(T& obj, i32 index, bool value) { + // set + { obj.set(index, value) }; }; +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// template -concept HasActivePivotsSet = requires(const T& obj) { - { obj.get_active_pivots() } -> ActivePivotsSet; +concept ActivePivotsSet = ConstActivePivotsSet && MutableActivePivotsSet; + +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// +template +concept HasConstActivePivotsSet = requires(const T& obj) { + { obj.get_active_pivots() } -> ConstActivePivotsSet; }; //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- @@ -58,7 +68,9 @@ template class ActivePivotsSetBase { public: - BATT_ALWAYS_INLINE ActivePivotsSetBase() noexcept : bit_set_{} {} + BATT_ALWAYS_INLINE ActivePivotsSetBase() noexcept : bit_set_{} + { + } BATT_ALWAYS_INLINE explicit ActivePivotsSetBase(const Bitset& bit_set) : bit_set_{bit_set} { @@ -99,7 +111,7 @@ class ActivePivotsSetBase this->bit_set_ = insert_bit(this->bit_set_, i, v); } - BATT_ALWAYS_INLINE bool is_inactive() const + BATT_ALWAYS_INLINE bool is_empty() const { return this->bit_set_ == Bitset{}; } @@ -127,7 +139,7 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> } /** \brief Removes the specified number (`count`) pivots from the bit set. - */ + */ BATT_ALWAYS_INLINE void pop_front_pivots(i32 count) { if (count < 1) { diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index 293f95d..cf6ab8f 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -90,13 +90,13 @@ inline i32 get_last_active_pivot(const CInterval& pivot_range) //----- --- -- - - - - -template +template inline i32 get_first_active_pivot(const T& segment) { return segment.get_active_pivots().first(); } -template +template inline i32 get_last_active_pivot(const T& segment) { return segment.get_active_pivots().last(); diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 7d3f495..2dab177 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -1720,7 +1720,7 @@ void InMemoryNode::UpdateBuffer::Segment::pop_front_pivots(i32 count) // bool InMemoryNode::UpdateBuffer::Segment::is_inactive() const { - const bool inactive = this->active_pivots.is_inactive(); + const bool inactive = this->active_pivots.is_empty(); if (inactive) { Slice> filter_dropped_ranges = this->filter.dropped(); BATT_CHECK_EQ(filter_dropped_ranges.size(), 1); diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 461465d..8f5fbe8 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -107,7 +107,7 @@ struct InMemoryNode { /** \brief Returns the active pivots bit set. */ - auto get_active_pivots() const + ActivePivotsSet128 get_active_pivots() const { return this->active_pivots; } diff --git a/src/turtle_kv/tree/segmented_level_scanner.hpp b/src/turtle_kv/tree/segmented_level_scanner.hpp index 4c3dd20..f747d30 100644 --- a/src/turtle_kv/tree/segmented_level_scanner.hpp +++ b/src/turtle_kv/tree/segmented_level_scanner.hpp @@ -207,7 +207,7 @@ inline auto SegmentedLevelScanner::peek_next_impl(bo const Segment* segment = std::addressof(this->level_->get_segment(this->segment_i_)); ActivePivotsSetT active_pivots = segment->get_active_pivots(); - BATT_CHECK(!active_pivots.is_inactive()) << "This segment should have been dropped!"; + BATT_CHECK(!active_pivots.is_empty()) << "This segment should have been dropped!"; // Make sure we have a leaf page loaded. // diff --git a/src/turtle_kv/tree/sharded_level_scanner.hpp b/src/turtle_kv/tree/sharded_level_scanner.hpp index bdf3c65..d22e558 100644 --- a/src/turtle_kv/tree/sharded_level_scanner.hpp +++ b/src/turtle_kv/tree/sharded_level_scanner.hpp @@ -244,7 +244,7 @@ inline auto ShardedLevelScanner::peek_next_impl(bool ActivePivotsSetT active_pivots = segment->get_active_pivots(); { - BATT_CHECK(!active_pivots.is_inactive()) << "This segment should have been dropped!"; + BATT_CHECK(!active_pivots.is_empty()) << "This segment should have been dropped!"; } // Check for need to load new segment. diff --git a/src/turtle_kv/tree/testing/fake_segment.hpp b/src/turtle_kv/tree/testing/fake_segment.hpp index 1c6f1db..805034c 100644 --- a/src/turtle_kv/tree/testing/fake_segment.hpp +++ b/src/turtle_kv/tree/testing/fake_segment.hpp @@ -44,7 +44,7 @@ struct FakeSegment { }); } - auto get_active_pivots() const + ActivePivotsSet128 get_active_pivots() const { return this->active_pivots_; } @@ -92,7 +92,7 @@ struct FakeSegment { bool is_inactive() const { - const bool inactive = this->active_pivots_.is_inactive(); + const bool inactive = this->active_pivots_.is_empty(); if (inactive) { Slice> filter_dropped_ranges = this->filter_.dropped(); BATT_CHECK_EQ(filter_dropped_ranges.size(), 1); From 6a097d0d4aa558b57065592ba5350af3410fb2d8 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 14:20:03 -0400 Subject: [PATCH 32/48] Fix split_pivot bug re: new piecewise filters. --- src/turtle_kv/tree/algo/segmented_levels.hpp | 26 ++-- src/turtle_kv/tree/algo/segments.hpp | 118 ++++++++++++------- src/turtle_kv/util/piecewise_filter.ipp | 4 +- 3 files changed, 97 insertions(+), 51 deletions(-) diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index cf6ab8f..045a0c8 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -242,7 +242,7 @@ struct SegmentedLevelAlgorithms { // If we can split the pivot without loading the leaf, great! // - if (in_segment(segment).split_pivot(pivot_i, None, this->level_)) { + if (in_segment(segment).split_pivot(pivot_i, /*split_indices=*/None, this->level_)) { continue; } @@ -255,17 +255,29 @@ struct SegmentedLevelAlgorithms { const PackedLeafPage& leaf_page = PackedLeafPage::view_of(segment_pinned_leaf); - const usize pivot_offset_in_leaf = - std::distance(leaf_page.items_begin(), leaf_page.lower_bound(pivot_key)); + const auto first_item_in_leaf = leaf_page.items_begin(); + + const usize pivot_begin_in_leaf = + std::distance(first_item_in_leaf, leaf_page.lower_bound(pivot_key)); const usize split_offset_in_leaf = - std::distance(leaf_page.items_begin(), leaf_page.lower_bound(split_key)); + std::distance(first_item_in_leaf, leaf_page.lower_bound(split_key)); + + const usize pivot_end_in_leaf = + std::distance(first_item_in_leaf, leaf_page.lower_bound(old_pivot_key_range.upper_bound)); - VLOG(1) << " --" << BATT_INSPECT(split_offset_in_leaf) << BATT_INSPECT(pivot_offset_in_leaf); + VLOG(1) << " --" << BATT_INSPECT(split_offset_in_leaf) << BATT_INSPECT(pivot_begin_in_leaf); - BATT_CHECK_LE(pivot_offset_in_leaf, split_offset_in_leaf); + BATT_CHECK_LE(pivot_begin_in_leaf, split_offset_in_leaf); + BATT_CHECK_LE(split_offset_in_leaf, pivot_end_in_leaf); - BATT_CHECK(in_segment(segment).split_pivot(pivot_i, split_offset_in_leaf, this->level_)); + BATT_CHECK(in_segment(segment).split_pivot(pivot_i, + SegmentPivotSplitIndices{ + pivot_begin_in_leaf, + split_offset_in_leaf, + pivot_end_in_leaf, + }, + this->level_)); } return OkStatus(); diff --git a/src/turtle_kv/tree/algo/segments.hpp b/src/turtle_kv/tree/algo/segments.hpp index 68ba93e..6d82bea 100644 --- a/src/turtle_kv/tree/algo/segments.hpp +++ b/src/turtle_kv/tree/algo/segments.hpp @@ -11,12 +11,72 @@ #include #include +#include #include #include namespace turtle_kv { +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// +/** \brief The set of indices involved in a pivot split; each index is a 0-based item index within + * the leaf page. + */ +struct SegmentPivotSplitIndices { + /** \brief The lower bound (inclusive) of the lower-half of the split. This is also the lower + * bound of the pre-split region. + */ + u32 lower_bound; + + /** \brief The index of the first item belonging to the right-hand-side after the split. + * This is the upper bound (non-inclusive) of the new lower-half, and the lower bound + * (inclusive) of the new upper-half. + */ + u32 split_point; + + /** \brief The uppwer bound (non-inclusive) of the upper-half of the split. This is also the + * upper bound of the pre-split region. + */ + u32 upper_bound; + + //+++++++++++-+-+--+----- --- -- - - - - + + SegmentPivotSplitIndices() = delete; + + explicit SegmentPivotSplitIndices(u32 lower, u32 middle, u32 upper) noexcept + : lower_bound{lower} + , split_point{middle} + , upper_bound{upper} + { + } + + explicit SegmentPivotSplitIndices(usize lower, usize middle, usize upper) noexcept + : SegmentPivotSplitIndices{ + BATT_CHECKED_CAST(u32, lower), + BATT_CHECKED_CAST(u32, middle), + BATT_CHECKED_CAST(u32, upper), + } + { + } + + /** \brief Returns the lower half index range for the split. + */ + Interval lower_range() const + { + return {this->lower_bound, this->split_point}; + } + + /** \brief Returns the upper half index range for the split. + */ + Interval upper_range() const + { + return {this->split_point, this->upper_bound}; + } +}; + +//=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- +// template struct SegmentAlgorithms { //+++++++++++-+-+--+----- --- -- - - - - @@ -38,7 +98,7 @@ struct SegmentAlgorithms { */ template [[nodiscard]] bool split_pivot(i32 pivot_i, - Optional split_offset_in_leaf, + Optional split_indices, const LevelT& level) const { using batt::BoolStatus; @@ -48,7 +108,7 @@ struct SegmentAlgorithms { this->segment_.check_invariants(__FILE__, __LINE__); }); - BATT_CHECK_LT(pivot_i, InMemoryNode::kMaxTempPivots - 1); + BATT_CHECK_LT(pivot_i, (i32)InMemoryNode::kMaxTempPivots - 1); // Simplest case: pivot not active for this segment. // @@ -57,52 +117,26 @@ struct SegmentAlgorithms { return true; } - const BoolStatus old_pivot_becomes_inactive = [&] { - if (!split_offset_in_leaf) { - return BoolStatus::kUnknown; - } - - if (*split_offset_in_leaf == 0) { - return BoolStatus::kTrue; - } - - return batt::bool_status_from( - this->segment_.is_index_filtered(level, *split_offset_in_leaf - 1)); - }(); - - const BoolStatus new_pivot_has_flushed_items = [&] { - if (old_pivot_becomes_inactive == BoolStatus::kUnknown) { - return BoolStatus::kUnknown; - } - - return batt::bool_status_from(old_pivot_becomes_inactive == BoolStatus::kTrue && - this->segment_.is_index_filtered(level, *split_offset_in_leaf)); - }(); - - // Next simplest: pivot active, but flush count is zero for pivot. + // If the pivot we are splitting is currently active, then we need to know the split_indices + // before we can accurately compute whether the lower/upper ranges of the split are active. // - if (old_pivot_becomes_inactive == BoolStatus::kFalse) { - BATT_CHECK_EQ(new_pivot_has_flushed_items, BoolStatus::kFalse); - this->segment_.insert_pivot(pivot_i + 1, true); - return true; + if (!split_indices) { + return false; } - // At this point we can only proceed if we know the item count of the split position relative to - // the pivot key range start. + // Ask the segment filter whether the lower/upper ranges of the split have live items. // - if (old_pivot_becomes_inactive == BoolStatus::kUnknown || - new_pivot_has_flushed_items == BoolStatus::kUnknown) { - return false; - } + const Interval lower_live_range = + this->segment_.get_live_item_range(level, split_indices->lower_range()); - BATT_CHECK_EQ(old_pivot_becomes_inactive, BoolStatus::kTrue); - BATT_CHECK(split_offset_in_leaf); + const Interval upper_live_range = + this->segment_.get_live_item_range(level, split_indices->upper_range()); - // If the split is not after the last flushed item, then the lower pivot (in the split) is now - // inactive and the upper one is active, possibly with some flushed items. - // - this->segment_.set_pivot_active(pivot_i, false); - this->segment_.insert_pivot(pivot_i + 1, true); + const bool lower_pivot_active = !lower_live_range.empty(); + const bool upper_pivot_active = !upper_live_range.empty(); + + this->segment_.set_pivot_active(pivot_i, lower_pivot_active); + this->segment_.insert_pivot(pivot_i + 1, upper_pivot_active); return true; } diff --git a/src/turtle_kv/util/piecewise_filter.ipp b/src/turtle_kv/util/piecewise_filter.ipp index 8b7449b..3c619a8 100644 --- a/src/turtle_kv/util/piecewise_filter.ipp +++ b/src/turtle_kv/util/piecewise_filter.ipp @@ -154,7 +154,7 @@ Interval PiecewiseFilter::find_live_range(Interval i) OffsetT start_i = i.lower_bound; OffsetT end_i = i.upper_bound; - BATT_CHECK_LT(start_i, end_i); + BATT_CHECK_LE(start_i, end_i); auto iter = std::lower_bound(this->dropped_.begin(), this->dropped_.end(), @@ -183,7 +183,7 @@ Interval PiecewiseFilter::find_live_range(Interval i) end_i = std::min(end_i, iter->lower_bound); } - BATT_CHECK_LT(start_i, end_i) << BATT_INSPECT(i); + BATT_CHECK_LE(start_i, end_i) << BATT_INSPECT(i); return Interval{start_i, end_i}; } From 016764918b62b49237a80c83f0c0fc5cf63ad2a3 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 14:52:42 -0400 Subject: [PATCH 33/48] Make sure InMemoryNode::Segmment::active_pivots is initialized empty. --- src/turtle_kv/tree/active_pivots_set.hpp | 19 +++++++++++++++++++ src/turtle_kv/tree/in_memory_node.cpp | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index e3cf986..c74ac80 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -131,6 +131,25 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> friend class PackedActivePivotsSet64; public: + using Super = ActivePivotsSetBase>; + using Self = ActivePivotsSet128; + + BATT_ALWAYS_INLINE ActivePivotsSet128() noexcept : Super{} + { + // Make sure the bit set starts out empty. + // + this->clear(); + } + + /** \brief Removes all pivots from the set. + */ + BATT_ALWAYS_INLINE void clear() + { + this->bit_set_.fill(0); + } + + /** \brief Returns a printable representation of this object. + */ BATT_ALWAYS_INLINE auto printable() const { return [this](std::ostream& out) { diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 2dab177..0cfd6b7 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -1553,7 +1553,7 @@ StatusOr MergedLevel::finish_serialize(const InMemoryNode& node, context.get_build_page_result(this->segment_future_ids_[segment_i])); segment.page_id_slot.page_id = pinned_leaf_page.page_id(); - segment.active_pivots = {}; + segment.active_pivots.clear(); const PackedLeafPage& leaf_page = PackedLeafPage::view_of(pinned_leaf_page); From fe715a8f17e58ccfed2de7f4c6578c6629b432a0 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 15:04:44 -0400 Subject: [PATCH 34/48] Replace magic number `4` with kMinPivotCount constant. --- src/turtle_kv/tree/in_memory_node.cpp | 15 ++++++++------- src/turtle_kv/tree/subtree_viability.hpp | 22 ++++++++++++++-------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 0cfd6b7..152ec25 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -979,7 +980,7 @@ SubtreeViability InMemoryNode::get_viability() const NeedsMerge needs_merge; needs_merge.single_pivot = (this->pivot_count() == 1); - needs_merge.too_few_pivots = (this->pivot_count() < 4); + needs_merge.too_few_pivots = (this->pivot_count() < kMinPivotCount); if (needs_merge) { return needs_merge; @@ -1118,7 +1119,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda BATT_CHECK_EQ(orig_pivot_count + 1, orig_pivot_keys.size()); - std::array tried_already = {0, 0}; + ActivePivotsSet128 tried_already; usize split_pivot_i = (orig_pivot_count + 1) / 2; auto* node_lower_half = this; @@ -1133,10 +1134,10 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda for (;;) { // If we ever try the same split point a second time, fail. // - if (get_bit(tried_already, split_pivot_i)) { + if (tried_already.get(split_pivot_i)) { return {batt::StatusCode::kInternal}; } - tried_already = set_bit(tried_already, split_pivot_i, true); + tried_already.set(split_pivot_i, true); //+++++++++++-+-+--+----- --- -- - - - - @@ -1258,7 +1259,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda // If the lower half is too large, then move the split point down and retry if possible. // - if (split_pivot_i > 4 && batt::is_case(lower_viability) && + if (split_pivot_i > kMinPivotCount && batt::is_case(lower_viability) && !batt::is_case(upper_viability)) { --split_pivot_i; continue; @@ -1266,8 +1267,8 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda // If the upper half is too large, then move the split point up and retry if possible. // - if (split_pivot_i + 4 < orig_pivot_count && batt::is_case(upper_viability) && - !batt::is_case(lower_viability)) { + if (split_pivot_i + kMinPivotCount < orig_pivot_count && + batt::is_case(upper_viability) && !batt::is_case(lower_viability)) { ++split_pivot_i; continue; } diff --git a/src/turtle_kv/tree/subtree_viability.hpp b/src/turtle_kv/tree/subtree_viability.hpp index 08ae2d8..0ae893b 100644 --- a/src/turtle_kv/tree/subtree_viability.hpp +++ b/src/turtle_kv/tree/subtree_viability.hpp @@ -1,5 +1,7 @@ #pragma once +#include + #include #include @@ -8,6 +10,10 @@ namespace turtle_kv { +/** \brief The minimum number of pivots allowed in a non-root node. + */ +inline constexpr u16 kMinPivotCount = 4; + //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- // struct Viable { @@ -107,9 +113,9 @@ inline bool compacting_levels_might_fix(const SubtreeViability& viability) }, [](const NeedsSplit& needs_split) { return (needs_split.segment_filters_too_large || // - needs_split.too_many_segments) && // - !needs_split.items_too_large && // - !needs_split.keys_too_large && // + needs_split.too_many_segments) && // + !needs_split.items_too_large && // + !needs_split.keys_too_large && // !needs_split.too_many_pivots; }); } @@ -130,11 +136,11 @@ inline bool normal_flush_might_fix(const SubtreeViability& viability) return false; }, [](const NeedsSplit& needs_split) { - return needs_split.height == 2 && // - (needs_split.segment_filters_too_large // - || needs_split.too_many_segments) && // - !needs_split.items_too_large && // - !needs_split.keys_too_large && // + return needs_split.height == 2 && // + (needs_split.segment_filters_too_large // + || needs_split.too_many_segments) && // + !needs_split.items_too_large && // + !needs_split.keys_too_large && // !needs_split.too_many_pivots; }); } From 203c1e04f20c841f5efee9697c0e6800d06fa80c Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 15:11:12 -0400 Subject: [PATCH 35/48] Move the minimum pivots constant next to the max one (config.hpp); remove weird ternaries in InMemoryNode::max_pivot_count()/max_segment_count(). --- src/turtle_kv/config.hpp | 8 +++++++- src/turtle_kv/tree/in_memory_node.cpp | 6 +++--- src/turtle_kv/tree/in_memory_node.hpp | 4 ++-- src/turtle_kv/tree/subtree_viability.hpp | 7 +++---- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/turtle_kv/config.hpp b/src/turtle_kv/config.hpp index ec4b104..4cc1a56 100644 --- a/src/turtle_kv/config.hpp +++ b/src/turtle_kv/config.hpp @@ -91,6 +91,12 @@ constexpr i64 kNewLeafLruPriority = kLeafLruPriority + kNewPagePriorityBoost; constexpr u32 kDefaultLeafShardedViewSize = 4096; -constexpr usize kMaxPivots = 64; +/** \brief The minimum number of pivots allowed in a non-root node. + */ +inline constexpr usize kMinPivots = 4; + +/** \brief The maximum number of pivots allowed in a packed node. + */ +inline constexpr usize kMaxPivots = 64; } // namespace turtle_kv diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 152ec25..723d393 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -980,7 +980,7 @@ SubtreeViability InMemoryNode::get_viability() const NeedsMerge needs_merge; needs_merge.single_pivot = (this->pivot_count() == 1); - needs_merge.too_few_pivots = (this->pivot_count() < kMinPivotCount); + needs_merge.too_few_pivots = (this->pivot_count() < kMinPivots); if (needs_merge) { return needs_merge; @@ -1259,7 +1259,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda // If the lower half is too large, then move the split point down and retry if possible. // - if (split_pivot_i > kMinPivotCount && batt::is_case(lower_viability) && + if (split_pivot_i > kMinPivots && batt::is_case(lower_viability) && !batt::is_case(upper_viability)) { --split_pivot_i; continue; @@ -1267,7 +1267,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda // If the upper half is too large, then move the split point up and retry if possible. // - if (split_pivot_i + kMinPivotCount < orig_pivot_count && + if (split_pivot_i + kMinPivots < orig_pivot_count && batt::is_case(upper_viability) && !batt::is_case(lower_viability)) { ++split_pivot_i; continue; diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 8f5fbe8..09604d6 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -462,12 +462,12 @@ struct InMemoryNode { usize max_pivot_count() const { - return this->is_size_tiered() ? kMaxPivots : kMaxPivots; + return kMaxPivots; } usize max_segment_count() const { - return this->is_size_tiered() ? (kMaxPivots - 1) : (kMaxPivots - 1); + return kMaxPivots - 1; } Slice get_pivot_keys() const diff --git a/src/turtle_kv/tree/subtree_viability.hpp b/src/turtle_kv/tree/subtree_viability.hpp index 0ae893b..4d4add5 100644 --- a/src/turtle_kv/tree/subtree_viability.hpp +++ b/src/turtle_kv/tree/subtree_viability.hpp @@ -1,5 +1,8 @@ #pragma once +#include +// + #include #include @@ -10,10 +13,6 @@ namespace turtle_kv { -/** \brief The minimum number of pivots allowed in a non-root node. - */ -inline constexpr u16 kMinPivotCount = 4; - //=#=#==#==#===============+=+=+=+=++=++++++++++++++-++-+--+-+----+--------------- // struct Viable { From b9a533f9834ab75c1d3e74d90b1af375a1a227e5 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 15:25:29 -0400 Subject: [PATCH 36/48] Add some more defensive checks that we never exceed the maximum number of temporary pivots. --- src/turtle_kv/tree/in_memory_node.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 723d393..102e785 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace turtle_kv { @@ -47,6 +48,8 @@ using PackedSegment = PackedUpdateBuffer::Segment; const usize pivot_count = packed_node.pivot_count(); + BATT_REQUIRE_LE(pivot_count, kMaxPivots); + node->tree_options = tree_options; node->height = packed_node.height; node->children.resize(pivot_count); @@ -609,6 +612,13 @@ Status InMemoryNode::split_child(BatchUpdateContext& update_context, i32 pivot_i update_context.metrics.split_count.add(1); #endif + // Make sure we don't exceed the temporary pivot count limit. + // + BATT_CHECK_LT(this->pivot_count(), (i32)InMemoryNode::kMaxTempPivots); + auto on_scope_exit = batt::finally([&] { + BATT_CHECK_LE(this->pivot_count(), (i32)InMemoryNode::kMaxTempPivots); + }); + Subtree& child = this->children[pivot_i]; StatusOr> status_or_sibling = child.try_split(update_context); From e9eadbd5f6a707807bf2f9349fee6241c243a4fc Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 15:34:05 -0400 Subject: [PATCH 37/48] Add comment about new kPivotCountMask. --- src/turtle_kv/tree/packed_node_page.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turtle_kv/tree/packed_node_page.hpp b/src/turtle_kv/tree/packed_node_page.hpp index 0329c8e..325da00 100644 --- a/src/turtle_kv/tree/packed_node_page.hpp +++ b/src/turtle_kv/tree/packed_node_page.hpp @@ -49,7 +49,7 @@ struct PackedNodePage { kMaxPivots + 1 /*max_key*/ + 1 /*common_prefix*/ + 1 /*final_offset*/; static constexpr u8 kFlagSizeTiered = 0x80; - static constexpr u8 kPivotCountMask = 0x7f; + static constexpr u8 kPivotCountMask = 0x7f; // Needs to be at least 64 static constexpr u16 kSegmentStartsFiltered = 0x8000; using Key = PackedNodePageKey; From 6e01a5040ab63fdf68c37c1114e266ac44461e98 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 15:35:28 -0400 Subject: [PATCH 38/48] assert size of packed active pivots set. --- src/turtle_kv/tree/active_pivots_set.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index c74ac80..b7ec9c1 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -205,5 +205,6 @@ class PackedActivePivotsSet64 : public ActivePivotsSetBase }; static_assert(ActivePivotsSet); +static_assert(sizeof(PackedActivePivotsSet64) == 8); } // namespace turtle_kv From dde70180b194a642512875f827e56485c7d2cc5e Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 15:39:32 -0400 Subject: [PATCH 39/48] Static asserts: ActivePivotsSet. --- src/turtle_kv/tree/segmented_level_scanner.hpp | 3 +++ src/turtle_kv/tree/sharded_level_scanner.hpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/turtle_kv/tree/segmented_level_scanner.hpp b/src/turtle_kv/tree/segmented_level_scanner.hpp index f747d30..462c9eb 100644 --- a/src/turtle_kv/tree/segmented_level_scanner.hpp +++ b/src/turtle_kv/tree/segmented_level_scanner.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -55,6 +56,8 @@ class SegmentedLevelScanner : private SegmentedLevelScannerBase using Segment = typename Level::Segment; using ActivePivotsSetT = decltype(std::declval().get_active_pivots()); + static_assert(ActivePivotsSet); + using Item = EditSlice; //+++++++++++-+-+--+----- --- -- - - - - diff --git a/src/turtle_kv/tree/sharded_level_scanner.hpp b/src/turtle_kv/tree/sharded_level_scanner.hpp index d22e558..5352ab2 100644 --- a/src/turtle_kv/tree/sharded_level_scanner.hpp +++ b/src/turtle_kv/tree/sharded_level_scanner.hpp @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -69,6 +70,8 @@ class ShardedLevelScanner : private SegmentedLevelScannerBase using Segment = typename Level::Segment; using ActivePivotsSetT = decltype(std::declval().get_active_pivots()); + static_assert(ActivePivotsSet); + using Item = ShardedKeyValueSlice; static constexpr u64 kDefaultLeafShardedViewSize = 4096; From 1c7efe130a8ee7f7d72362d7125d32f55479bf74 Mon Sep 17 00:00:00 2001 From: Anthony Astolfi Date: Mon, 16 Mar 2026 15:42:10 -0400 Subject: [PATCH 40/48] Revert change. --- src/turtle_kv/tree/subtree_viability.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/turtle_kv/tree/subtree_viability.hpp b/src/turtle_kv/tree/subtree_viability.hpp index 4d4add5..0f6416d 100644 --- a/src/turtle_kv/tree/subtree_viability.hpp +++ b/src/turtle_kv/tree/subtree_viability.hpp @@ -1,8 +1,5 @@ #pragma once -#include -// - #include #include From d0f1745d66d6520e9bfddfba7fc4939b632841d2 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 16 Mar 2026 15:59:20 -0400 Subject: [PATCH 41/48] Start adding HybridLevel --- src/turtle_kv/tree/in_memory_node.cpp | 62 +++++++++++++++++++++- src/turtle_kv/tree/in_memory_node.hpp | 23 +++++--- src/turtle_kv/tree/in_memory_node.test.cpp | 20 ++++--- src/turtle_kv/tree/packed_leaf_page.hpp | 3 ++ 4 files changed, 90 insertions(+), 18 deletions(-) diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index d509bf3..5ab56c0 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -25,6 +25,7 @@ using Level = UpdateBuffer::Level; using EmptyLevel = UpdateBuffer::EmptyLevel; using MergedLevel = UpdateBuffer::MergedLevel; using SegmentedLevel = UpdateBuffer::SegmentedLevel; +using HybridLevel = UpdateBuffer::HybridLevel; using Segment = UpdateBuffer::Segment; using PackedUpdateBuffer = PackedNodePage::UpdateBuffer; @@ -667,6 +668,20 @@ Status InMemoryNode::split_child(BatchUpdateContext& update_context, i32 pivot_i update_context.page_loader, update_context.overcommit) // .split_pivot(pivot_i, pivot_key_range, sibling_min_key); + }, + [&](HybridLevel& hybrid_level) -> Status { + for (auto& sub_level : hybrid_levels.levels) { + if (batt::is_case(sub_level)) { + SegmentedLevel& segmented_sub_level = std::get(sub_level); + BATT_REQUIRE_OK(in_segmented_level(*this, + segmented_sub_level, + update_context.page_loader, + update_context.overcommit) // + .split_pivot(pivot_i, pivot_key_range, sibling_min_key)); + } + + return OkStatus(); + } })); } @@ -853,7 +868,7 @@ Status InMemoryNode::try_merge(BatchUpdateContext& context, }, [&](MergedLevel& right_merged_level) -> Status { this->update_buffer.levels[i] = - std::move(left_merged_level.concat(right_merged_level)); + std::move(left_merged_level.concat(std::move(right_merged_level))); return OkStatus(); }, [&](SegmentedLevel& right_segmented_level) -> Status { @@ -1959,6 +1974,51 @@ void InMemoryNode::UpdateBuffer::SegmentedLevel::check_items_sorted( } } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +BoxedSeq InMemoryNode::UpdateBuffer::HybridLevel::edit_slices( + InMemoryNode& node, + BatchUpdateContext& update_context, + Status& segment_load_status, + i32 min_pivot_i, + bool only_pivot, + Optional min_key) const +{ + std::vector> sequences; + sequences.reserve(this->levels.size()); + + for (const std::variant& level : this->levels) { + batt::case_of( + level, + [&](const MergedLevel& merged_level) { + sequences.emplace_back( + merged_level.result_set.live_edit_slices(node.get_pivot_key(min_pivot_i)) | + seq::boxed()); + }, + [&](const SegmentedLevel& segmented_level) { + if (!only_pivot || segmented_level.is_pivot_active(min_pivot_i)) { + sequences.emplace_back( + SegmentedLevelScanner{ + node, + segmented_level, + update_context.page_loader, + llfs::PinPageToJob::kDefault, + update_context.overcommit, + segment_load_status, + min_pivot_i, + min_key} | + seq::boxed()); + } + }); + } + + if (sequences.empty()) { + return seq::Empty{} | seq::boxed(); + } + + return as_seq(std::move(sequences)) | seq::flatten() | seq::boxed(); +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // /*static*/ StatusOr InMemoryNode::UpdateBuffer::concat_segmented_and_merged_level( diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index a5d7750..13a1286 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -386,7 +386,7 @@ struct InMemoryNode { return estimated; } - MergedLevel concat(MergedLevel& that) + MergedLevel concat(MergedLevel&& that) { return MergedLevel{ .result_set = MergeCompactor::ResultSet::concat(std::move(this->result_set), @@ -404,7 +404,18 @@ struct InMemoryNode { SmallFn dump() const; }; - using Level = std::variant; + struct HybridLevel { + std::vector> levels; + + BoxedSeq edit_slices(InMemoryNode& node, + BatchUpdateContext& update_context, + Status& segment_load_status, + i32 min_pivot_i, + bool only_pivot, + Optional min_key) const; + }; + + using Level = std::variant; //+++++++++++-+-+--+----- --- -- - - - - @@ -619,7 +630,7 @@ struct InMemoryNode { Subtree try_shrink(); /** \brief Merge the node in place with its right sibling. - * + * * Returns nullptr if `sibling` is completely consumed; otherwise, returns the modified sibling * since a borrow occurred. */ @@ -646,9 +657,9 @@ struct InMemoryNode { i32 pivot_i, const Interval& pivot_key_range); - /** \brief Merges and compacts all live edits in all levels/segments, producing a single level (if - * not size-tiered), or a series of non-key-overlapping levels with a single segment in each (if - * size-tiered). + /** \brief Merges and compacts all live edits in all levels/segments, producing a single level + * (if not size-tiered), or a series of non-key-overlapping levels with a single segment in each + * (if size-tiered). * * This can be done if node splitting fails, to reduce the serialized space required by getting * rid of all the non-zero flushed key upper bounds. This should NOT be done under normal diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index 4f8fbc1..ef88481 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -194,6 +194,7 @@ struct BatchUpdateGenerator { StableStringStore strings; RandomResultSetGenerator result_set_generator; std::vector pending_deletes; + std::vector already_deleted; usize delete_frequency; explicit BatchUpdateGenerator(usize delete_frequency_param, @@ -209,6 +210,8 @@ struct BatchUpdateGenerator { ResultSet result_set = result_set_generator(DecayToItem{}, rng, this->strings, this->pending_deletes); + this->already_deleted = this->pending_deletes; + if (update_pending_deletes) { if (!this->pending_deletes.empty()) { this->pending_deletes.clear(); @@ -227,7 +230,7 @@ struct BatchUpdateGenerator { void verify_deleted_point_queries(Table& expected_table, Table& actual_table) { - for (const KeyView& key : this->pending_deletes) { + for (const KeyView& key : this->already_deleted) { EXPECT_EQ(expected_table.get(key).status(), batt::StatusCode::kNotFound); EXPECT_EQ(actual_table.get(key).status(), batt::StatusCode::kNotFound); } @@ -344,8 +347,6 @@ void SubtreeBatchUpdateScenario::run() .metrics = metrics, .overcommit = llfs::PageCacheOvercommit::not_allowed(), }, - // TODO [vsilai 2026-01-09] Enable delete support for batch generation. - // .result_set = update_generator.next_batch(i, rng, /*update_pending_deletes=*/true), .edit_size_totals = None, }; @@ -497,7 +498,7 @@ TEST(InMemoryNodeTest, SubtreeDeletions) .set_value_size_hint(value_size); usize items_per_leaf = tree_options.flush_size() / tree_options.expected_item_size(); - usize total_batches = 81; + usize total_batches = 100; std::vector keys; keys.reserve(total_batches * items_per_leaf); @@ -551,15 +552,12 @@ TEST(InMemoryNodeTest, SubtreeDeletions) usize per_batch = items_per_leaf / total_batches; usize batch_remainder = items_per_leaf % total_batches; - usize total_amount_per_batch = per_batch + (batch_number < batch_remainder ? 1 : 0); - for (usize i = 0; i < total_batches; ++i) { - usize base_i = i * items_per_leaf; - usize offset = batch_number * per_batch + std::min(batch_number, batch_remainder); + usize offset = batch_number * per_batch + std::min(batch_number, batch_remainder); + usize total_amount_per_batch = per_batch + (batch_number < batch_remainder ? 1 : 0); - for (usize j = 0; j < total_amount_per_batch; ++j) { - current_batch.emplace_back(keys[base_i + offset + j], ValueView::deleted()); - } + for (usize j = 0; j < total_amount_per_batch; ++j) { + current_batch.emplace_back(keys[offset + j], ValueView::deleted()); } BATT_CHECK_LE(current_batch.size(), items_per_leaf) << BATT_INSPECT(batch_number); diff --git a/src/turtle_kv/tree/packed_leaf_page.hpp b/src/turtle_kv/tree/packed_leaf_page.hpp index 2784737..c0d1244 100644 --- a/src/turtle_kv/tree/packed_leaf_page.hpp +++ b/src/turtle_kv/tree/packed_leaf_page.hpp @@ -441,6 +441,9 @@ inline usize PackedLeafLayoutPlan::compute_trie_step_size() const step_size = 1; } else if (this->key_count < 256) { step_size = this->key_count / 16; + if (step_size == 0) { + step_size = 1; + } } return step_size; From fa6a609bbdd3068aa9adb3f6603d2d5ae2f844fe Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 16 Mar 2026 16:10:06 -0400 Subject: [PATCH 42/48] Test changes --- src/turtle_kv/util/piecewise_filter.test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/turtle_kv/util/piecewise_filter.test.cpp b/src/turtle_kv/util/piecewise_filter.test.cpp index 72c8f7a..83b1cd1 100644 --- a/src/turtle_kv/util/piecewise_filter.test.cpp +++ b/src/turtle_kv/util/piecewise_filter.test.cpp @@ -64,10 +64,10 @@ TEST(PiecewiseFilterTest, InvalidFilterTest) // TEST(PiecewiseFilterTest, QueryTest) { - const usize num_items = 1000000; + const usize num_items = 1000; - for (usize i = 0; i < 100; ++i) { - std::default_random_engine rng{i}; + for (usize seed = 0; seed < 100; ++seed) { + std::default_random_engine rng{seed}; PiecewiseFilter filter; EXPECT_TRUE(filter.check_invariants()); From eb53e2f239dfd004f9be21b69a1d0fbf190c076f Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 23 Mar 2026 11:23:56 -0400 Subject: [PATCH 43/48] Bug fixes --- src/turtle_kv/kv_store_scanner.cpp | 3 + src/turtle_kv/tree/active_pivots_set.hpp | 3 - src/turtle_kv/tree/algo/segmented_levels.hpp | 18 +- src/turtle_kv/tree/in_memory_node.cpp | 237 ++++++++++--------- src/turtle_kv/tree/in_memory_node.hpp | 42 +++- src/turtle_kv/tree/in_memory_node.test.cpp | 178 +++++++------- src/turtle_kv/tree/subtree.cpp | 2 - src/turtle_kv/tree/subtree_viability.hpp | 2 + 8 files changed, 265 insertions(+), 220 deletions(-) diff --git a/src/turtle_kv/kv_store_scanner.cpp b/src/turtle_kv/kv_store_scanner.cpp index 8d86790..bad4864 100644 --- a/src/turtle_kv/kv_store_scanner.cpp +++ b/src/turtle_kv/kv_store_scanner.cpp @@ -469,6 +469,9 @@ Status KVStoreScanner::set_next_item() for (;;) { if (this->heap_.empty()) { + if (this->next_item_ && this->next_item_->value.is_delete()) { + this->next_item_ = None; + } return OkStatus(); } diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index 6959fc6..1c06b42 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -178,7 +178,6 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> this->bit_set_[1] = 0; } } -<<<<<<< HEAD BATT_ALWAYS_INLINE void pop_back_pivots(i32 count) { @@ -194,8 +193,6 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> this->bit_set_[0] = 0; } } -======= ->>>>>>> main }; static_assert(ActivePivotsSet); diff --git a/src/turtle_kv/tree/algo/segmented_levels.hpp b/src/turtle_kv/tree/algo/segmented_levels.hpp index 1397357..84dd73f 100644 --- a/src/turtle_kv/tree/algo/segmented_levels.hpp +++ b/src/turtle_kv/tree/algo/segmented_levels.hpp @@ -187,15 +187,21 @@ struct SegmentedLevelAlgorithms { const PackedLeafPage& leaf = PackedLeafPage::view_of(pinned_page.get_page_buffer()); - auto flushed_last = leaf.lower_bound(max_key); - if (flushed_last != leaf.items_end() && get_key(*flushed_last) == max_key) { - ++flushed_last; - } - CInterval flush_key_crange{pivot_lower_bound_key, max_key}; segment.drop_key_range(flush_key_crange, leaf.items_slice()); - if (flushed_last == leaf.items_end() || get_key(*flushed_last) >= pivot_upper_bound_key) { + auto pivot_first = leaf.lower_bound(pivot_lower_bound_key); + usize pivot_first_i = std::distance(leaf.items_begin(), pivot_first); + + auto pivot_last = leaf.lower_bound(pivot_upper_bound_key); + usize pivot_last_i = std::distance(leaf.items_begin(), pivot_last); + + Interval pivot_live_range = + segment.get_live_item_range(this->level_, + Interval{BATT_CHECKED_CAST(u32, pivot_first_i), + BATT_CHECKED_CAST(u32, pivot_last_i)}); + + if (pivot_live_range.empty()) { segment.set_pivot_active(pivot_i, false); } diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 81d5e01..32d3e50 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -753,6 +753,10 @@ Subtree InMemoryNode::try_shrink() // Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i) noexcept { + static Metrics& r_metrics = Self::metrics(); + + LatencyTimer timer{Every2ToTheConst<0>{}, r_metrics.merge_latency}; + // If there are no siblings to merge with, we must be in the middle of collapsing the tree // (flush and shrink). // @@ -783,7 +787,6 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // const i32 right_pivot_i = std::max(pivot_i, sibling_i); const i32 left_pivot_i = std::min(pivot_i, sibling_i); - const usize old_pivot_count = this->pivot_count(); // Call Subtree::try_merge. // @@ -826,17 +829,11 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // this->pivot_keys_.erase(this->pivot_keys_.begin() + right_pivot_i); - if ((usize)right_pivot_i == old_pivot_count - 1) { - BATT_ASSIGN_OK_RESULT(this->max_key_, - this->children.back().get_max_key(update_context.page_loader, - update_context.overcommit, - this->child_pages.back())); - } - // Finally, split the newly merged child if needed. // SubtreeViability merged_viability = this->children[left_pivot_i].get_viability(); if (batt::is_case(merged_viability)) { + r_metrics.merge_then_split_count.add(1); BATT_REQUIRE_OK(this->make_child_viable(update_context, left_pivot_i)); } else { BATT_CHECK(batt::is_case(merged_viability)); @@ -858,129 +855,35 @@ Status InMemoryNode::try_merge(BatchUpdateContext& context, usize i = 0; for (; i < this->update_buffer.levels.size(); ++i) { Level& left_level = this->update_buffer.levels[i]; - BATT_REQUIRE_OK(batt::case_of( // - left_level, // - [&](EmptyLevel&) -> Status { + batt::case_of( // + left_level, // + [&](EmptyLevel&) { if (i < sibling->update_buffer.levels.size()) { Level& right_level = sibling->update_buffer.levels[i]; if (!batt::is_case(right_level)) { this->update_buffer.levels[i] = std::move(right_level); } } - - return OkStatus(); }, - [&](MergedLevel& left_merged_level) -> Status { + [&](MergedLevel& left_merged_level) { if (i < sibling->update_buffer.levels.size()) { - BATT_REQUIRE_OK(batt::case_of( - sibling->update_buffer.levels[i], - [](EmptyLevel&) -> Status { - return OkStatus(); - }, - [&](MergedLevel& right_merged_level) -> Status { - this->update_buffer.levels[i] = - std::move(left_merged_level.concat(std::move(right_merged_level))); - return OkStatus(); - }, - [&](SegmentedLevel& right_segmented_level) -> Status { - HybridLevel new_hybrid_level; - new_hybrid_level.add_new_sub_level(std::move(left_merged_level)); - new_hybrid_level.add_new_sub_level(std::move(right_segmented_level)); - - this->update_buffer.levels[i] = std::move(new_hybrid_level); - - return OkStatus(); - }, - [&](HybridLevel& right_hybrid_level) -> Status { - HybridLevel new_hybrid_level; - new_hybrid_level.add_new_sub_level(std::move(left_merged_level)); - new_hybrid_level.add_new_sub_level(std::move(right_hybrid_level)); - - this->update_buffer.levels[i] = std::move(new_hybrid_level); - - return OkStatus(); - })); + this->update_buffer.levels[i] = + std::move(left_merged_level.merge(std::move(sibling->update_buffer.levels[i]))); } - - return OkStatus(); }, - [&](SegmentedLevel& left_segmented_level) -> Status { + [&](SegmentedLevel& left_segmented_level) { if (i < sibling->update_buffer.levels.size()) { - BATT_REQUIRE_OK(batt::case_of( - sibling->update_buffer.levels[i], - [](EmptyLevel&) -> Status { - return OkStatus(); - }, - [&](MergedLevel& right_merged_level) -> Status { - HybridLevel new_hybrid_level; - new_hybrid_level.add_new_sub_level(std::move(left_segmented_level)); - new_hybrid_level.add_new_sub_level(std::move(right_merged_level)); - - this->update_buffer.levels[i] = std::move(new_hybrid_level); - - return OkStatus(); - }, - [&](SegmentedLevel& right_segmented_level) -> Status { - // First shift the right level's bitsets to the left by the number of pivots - // in the left node. - // - usize left_node_pivot_count = this->pivot_count(); - for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); - ++segment_i) { - Segment& segment = right_segmented_level.get_segment(segment_i); - segment.active_pivots.pop_back_pivots(left_node_pivot_count); - }; - - left_segmented_level.segments.insert( - left_segmented_level.segments.end(), - std::make_move_iterator(right_segmented_level.segments.begin()), - std::make_move_iterator(right_segmented_level.segments.end())); - - // Erase potential duplicate segments that have resulted from a previous split. - // - left_segmented_level.segments.erase( - std::unique(left_segmented_level.segments.begin(), - left_segmented_level.segments.end(), - [](const Segment& l, const Segment& r) { - return l.page_id_slot.page_id == r.page_id_slot.page_id; - }), - left_segmented_level.segments.end()); - - return OkStatus(); - }, - [&](HybridLevel& right_hybrid_level) -> Status { - HybridLevel new_hybrid_level; - new_hybrid_level.add_new_sub_level(std::move(left_segmented_level)); - new_hybrid_level.add_new_sub_level(std::move(right_hybrid_level)); - - this->update_buffer.levels[i] = std::move(new_hybrid_level); - - return OkStatus(); - })); + this->update_buffer.levels[i] = + std::move(left_segmented_level.merge(std::move(sibling->update_buffer.levels[i]), + this->pivot_count())); } - - return OkStatus(); }, - [&](HybridLevel& left_hybrid_level) -> Status { + [&](HybridLevel& left_hybrid_level) { if (i < sibling->update_buffer.levels.size()) { - batt::case_of( - sibling->update_buffer.levels[i], - [](EmptyLevel&) { - - }, - [&](MergedLevel& right_merged_level) { - left_hybrid_level.add_new_sub_level(std::move(right_merged_level)); - }, - [&](SegmentedLevel& right_segmented_level) { - left_hybrid_level.add_new_sub_level(std::move(right_segmented_level)); - }, - [&](HybridLevel& right_hybrid_level) { - left_hybrid_level.add_new_sub_level(std::move(right_hybrid_level)); - }); + this->update_buffer.levels[i] = + std::move(left_hybrid_level.merge(std::move(sibling->update_buffer.levels[i]))); } - - return OkStatus(); - })); + }); } // Carry over any remaining levels from the right node's update buffer. @@ -1848,6 +1751,34 @@ StatusOr Segment::load_leaf_page(llfs::PageLoader& page_loader }); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Level MergedLevel::merge(Level&& sibling_level) +{ + return batt::case_of( + sibling_level, + [&](EmptyLevel&) -> Level { + return std::move(sibling_level); + }, + [&](MergedLevel& right_merged_level) -> Level { + return this->concat(std::move(right_merged_level)); + }, + [&](SegmentedLevel& right_segmented_level) -> Level { + HybridLevel new_hybrid_level; + new_hybrid_level.add_new_sub_level(std::move(*this)); + new_hybrid_level.add_new_sub_level(std::move(right_segmented_level)); + + return new_hybrid_level; + }, + [&](HybridLevel& right_hybrid_level) -> Level { + HybridLevel new_hybrid_level; + new_hybrid_level.add_new_sub_level(std::move(*this)); + new_hybrid_level.add_new_sub_level(std::move(right_hybrid_level)); + + return new_hybrid_level; + }); +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // StatusOr MergedLevel::start_serialize(const InMemoryNode& node, @@ -2109,8 +2040,6 @@ bool InMemoryNode::UpdateBuffer::SegmentedLevel::set_pivot_items_flushed( // bool InMemoryNode::UpdateBuffer::SegmentedLevel::set_pivot_completely_flushed(usize pivot_i) { - // TODO [vsilai 2026-3-17]: call drop_key_range on segment filter with pivot key range? - // for (usize segment_i = 0; segment_i < this->segment_count();) { Segment& segment = this->get_segment(segment_i); @@ -2137,6 +2066,56 @@ usize InMemoryNode::UpdateBuffer::SegmentedLevel::segment_filter_cut_points() co return n; } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Level InMemoryNode::UpdateBuffer::SegmentedLevel::merge(Level&& sibling_level, + usize node_pivot_count) +{ + return batt::case_of( + sibling_level, + [&](EmptyLevel&) -> Level { + return std::move(sibling_level); + }, + [&](MergedLevel& right_merged_level) -> Level { + HybridLevel new_hybrid_level; + new_hybrid_level.add_new_sub_level(std::move(*this)); + new_hybrid_level.add_new_sub_level(std::move(right_merged_level)); + + return new_hybrid_level; + }, + [&](SegmentedLevel& right_segmented_level) -> Level { + // First shift the right level's bitsets to the left by the number of pivots + // in the left node. + // + for (usize segment_i = 0; segment_i < right_segmented_level.segment_count(); ++segment_i) { + Segment& segment = right_segmented_level.get_segment(segment_i); + segment.active_pivots.pop_back_pivots(node_pivot_count); + }; + + this->segments.insert(this->segments.end(), + std::make_move_iterator(right_segmented_level.segments.begin()), + std::make_move_iterator(right_segmented_level.segments.end())); + + // Erase potential duplicate segments that have resulted from a previous split. + // + this->segments.erase(std::unique(this->segments.begin(), + this->segments.end(), + [](const Segment& l, const Segment& r) { + return l.page_id_slot.page_id == r.page_id_slot.page_id; + }), + this->segments.end()); + + return *this; + }, + [&](HybridLevel& right_hybrid_level) -> Level { + HybridLevel new_hybrid_level; + new_hybrid_level.add_new_sub_level(std::move(*this)); + new_hybrid_level.add_new_sub_level(std::move(right_hybrid_level)); + + return new_hybrid_level; + }); +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // BoxedSeq InMemoryNode::UpdateBuffer::HybridLevel::to_boxed_seq( @@ -2277,6 +2256,28 @@ void InMemoryNode::UpdateBuffer::HybridLevel::drop_after_pivot(i32 pivot_i, } } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Level InMemoryNode::UpdateBuffer::HybridLevel::merge(Level&& sibling_level) +{ + batt::case_of( + sibling_level, + [&](EmptyLevel&) { + + }, + [&](MergedLevel& right_merged_level) { + this->add_new_sub_level(std::move(right_merged_level)); + }, + [&](SegmentedLevel& right_segmented_level) { + this->add_new_sub_level(std::move(right_segmented_level)); + }, + [&](HybridLevel& right_hybrid_level) { + this->add_new_sub_level(std::move(right_hybrid_level)); + }); + + return *this; +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // StatusOr InMemoryNode::UpdateBuffer::HybridLevel::start_serialize( diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index b211171..76e7129 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -55,6 +55,14 @@ struct InMemoryNode { /** \brief Captures statistics about the number of levels per node. */ StatsMetric level_depth_stats; + + /** \brief The total time spent on merging two Subtrees and updating parent metadata. + */ + LatencyMetric merge_latency; + + /** \brief The number of times a merge operation was followed by a split operation. + */ + CountMetric merge_then_split_count; }; static Metrics& metrics() @@ -74,7 +82,12 @@ struct InMemoryNode { struct UpdateBuffer { using Self = UpdateBuffer; + struct EmptyLevel; + struct MergedLevel; struct SegmentedLevel; + struct HybridLevel; + + using Level = std::variant; //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // @@ -325,6 +338,8 @@ struct InMemoryNode { */ void check_items_sorted(const InMemoryNode& node, llfs::PageLoader& page_loader) const; + /** \brief Converts the unflushed items in this level to a boxed sequence. + */ BoxedSeq to_boxed_seq(const InMemoryNode& node, BatchUpdateContext& update_context, Status& segment_load_status, @@ -332,16 +347,35 @@ struct InMemoryNode { bool only_pivot, Optional min_key) const; + /** \brief Marks the items contained in `flush_key_crange` that are addressed to `pivot_i` + * as flushed within this level. + */ bool set_pivot_items_flushed(const InMemoryNode& node, BatchUpdateContext& update_context, usize pivot_i, const CInterval& flush_key_crange, Status segment_load_status); + /** \brief Marks the pivot `pivot_i` as completely flushed within this level. + */ bool set_pivot_completely_flushed(usize pivot_i); + /** \brief Calculates the number of filter cut points needed for this level when it will + * be serialized. + */ usize segment_filter_cut_points() const; + /** \brief Merges this level with a "sibling" level from another node. + * + * This function is called when two nodes are being merged and their update buffers are + * being merged as well. In this function, this level is the "left" level (i.e., the level + * comes from the left node in the merge) and `sibling_level` is the "right" level. + * + * `node_pivot_count` is the number of pivots in the left node (i.e., the node that this + * level exists in). + */ + Level merge(Level&& sibling_level, usize node_pivot_count); + /** \brief Prints a human-readable representation of the level. */ SmallFn dump() const; @@ -428,6 +462,8 @@ struct InMemoryNode { return this->result_set.empty(); } + Level merge(Level&& sibling_level); + /** \brief Returns the number of segment leaf page build jobs added to the context. */ StatusOr start_serialize(const InMemoryNode& node, TreeSerializeContext& context); @@ -450,7 +486,7 @@ struct InMemoryNode { { return as_const_slice(this->levels); } - + void add_new_sub_level(std::variant&& level) { this->levels.emplace_back(std::move(level)); @@ -490,6 +526,8 @@ struct InMemoryNode { llfs::PageLoader& page_loader, const TreeOptions& tree_options); + Level merge(Level&& sibling_level); + StatusOr start_serialize(const InMemoryNode& node, TreeSerializeContext& context); StatusOr finish_serialize(const InMemoryNode& node, @@ -498,8 +536,6 @@ struct InMemoryNode { SmallFn dump() const; }; - using Level = std::variant; - //+++++++++++-+-+--+----- --- -- - - - - SmallVec levels; diff --git a/src/turtle_kv/tree/in_memory_node.test.cpp b/src/turtle_kv/tree/in_memory_node.test.cpp index ef88481..ce0624e 100644 --- a/src/turtle_kv/tree/in_memory_node.test.cpp +++ b/src/turtle_kv/tree/in_memory_node.test.cpp @@ -170,6 +170,56 @@ void verify_range_scan(LatencyMetric* scan_latency, } } +void perform_range_scan(Table& expected_table, + Subtree& tree, + const TreeOptions& tree_options, + PinningPageLoader& page_loader, + const KeyView& min_key, + usize scan_len, + LatencyMetric* scan_latency, + usize iteration) +{ + auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); + + std::array, kMaxScanSize> scan_items_buffer; + + PageSliceStorage page_slice_storage; + + KVStoreScanner kv_scanner{ + page_loader, + root_ptr->page_id_slot_or_panic(), + BATT_OK_RESULT_OR_PANIC(root_ptr->get_height(page_loader, // + llfs::PageCacheOvercommit::not_allowed())), + min_key, + tree_options.trie_index_sharded_view_size(), + &page_slice_storage}; + + usize n_read = 0; + { + LatencyTimer timer{*scan_latency}; + BATT_CHECK_OK(kv_scanner.start()); + for (auto& kv_pair : scan_items_buffer) { + Optional item = kv_scanner.next(); + if (!item) { + break; + } + kv_pair.first = item->key; + kv_pair.second = item->value; + ++n_read; + if (n_read == scan_len) { + break; + } + } + } + + ASSERT_NO_FATAL_FAILURE(verify_range_scan(nullptr, + expected_table, + as_slice(scan_items_buffer.data(), n_read), + min_key, + scan_len)) + << BATT_INSPECT(iteration) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); +} + struct SubtreeBatchUpdateScenario { static std::atomic& size_tiered_count() { @@ -428,46 +478,18 @@ void SubtreeBatchUpdateScenario::run() { auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); - std::unique_ptr scanner_page_job = page_cache->new_job(); const usize scan_len = pick_scan_len(rng); - std::array, kMaxScanSize> scan_items_buffer; KeyView min_key = update.result_set.get_min_key(); - PageSliceStorage page_slice_storage; - - KVStoreScanner kv_scanner{ - *page_loader, - root_ptr->page_id_slot_or_panic(), - BATT_OK_RESULT_OR_PANIC(root_ptr->get_height(*page_loader, // - llfs::PageCacheOvercommit::not_allowed())), - min_key, - tree_options.trie_index_sharded_view_size(), - &page_slice_storage}; - - usize n_read = 0; - { - LatencyTimer timer{scan_latency}; - BATT_CHECK_OK(kv_scanner.start()); - for (auto& kv_pair : scan_items_buffer) { - Optional item = kv_scanner.next(); - if (!item) { - break; - } - kv_pair.first = item->key; - kv_pair.second = item->value; - ++n_read; - if (n_read == scan_len) { - break; - } - } - } - ASSERT_NO_FATAL_FAILURE(verify_range_scan(nullptr, - expected_table, - as_slice(scan_items_buffer.data(), n_read), - min_key, - scan_len)) - << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); + perform_range_scan(expected_table, + *root_ptr, + tree_options, + *page_loader, + min_key, + scan_len, + &scan_latency, + i); } if (my_id == 0) { @@ -487,6 +509,8 @@ void SubtreeBatchUpdateScenario::run() TEST(InMemoryNodeTest, SubtreeDeletions) { + LatencyMetric scan_latency; + const usize key_size = 24; const usize value_size = 100; const usize chi = 4; @@ -550,21 +574,19 @@ TEST(InMemoryNodeTest, SubtreeDeletions) std::vector current_batch; current_batch.reserve(items_per_leaf); - usize per_batch = items_per_leaf / total_batches; - usize batch_remainder = items_per_leaf % total_batches; - - usize offset = batch_number * per_batch + std::min(batch_number, batch_remainder); - usize total_amount_per_batch = per_batch + (batch_number < batch_remainder ? 1 : 0); - - for (usize j = 0; j < total_amount_per_batch; ++j) { - current_batch.emplace_back(keys[offset + j], ValueView::deleted()); + for (usize i = 0; i < items_per_leaf; ++i) { + usize key_i = batch_number + i * total_batches; + if (key_i < keys.size()) { + current_batch.emplace_back(keys[key_i], ValueView::deleted()); + } } + BATT_CHECK_LE(current_batch.size(), items_per_leaf) << BATT_INSPECT(batch_number); return current_batch; }; - const auto apply_tree_updates = [&](auto batch_creation_func, bool perform_scan) { + const auto apply_tree_updates = [&](auto batch_creation_func) { for (usize i = 0; i < total_batches; ++i) { std::vector current_batch = batch_creation_func(i); @@ -643,47 +665,20 @@ TEST(InMemoryNodeTest, SubtreeDeletions) verify_table_point_queries(expected_table, actual_table, rng, batt::log2_ceil(i))) << BATT_INSPECT(i); - if (perform_scan) { + { auto root_ptr = std::make_shared(tree.clone_serialized_or_panic()); - std::unique_ptr scanner_page_job = page_cache->new_job(); const usize scan_len = 20; - std::array, kMaxScanSize> scan_items_buffer; KeyView min_key = update.result_set.get_min_key(); - PageSliceStorage page_slice_storage; - - KVStoreScanner kv_scanner{*page_loader, - root_ptr->page_id_slot_or_panic(), - BATT_OK_RESULT_OR_PANIC(root_ptr->get_height( - *page_loader, // - llfs::PageCacheOvercommit::not_allowed())), - min_key, - tree_options.trie_index_sharded_view_size(), - &page_slice_storage}; - - usize n_read = 0; - { - BATT_CHECK_OK(kv_scanner.start()); - for (auto& kv_pair : scan_items_buffer) { - Optional item = kv_scanner.next(); - if (!item) { - break; - } - kv_pair.first = item->key; - kv_pair.second = item->value; - ++n_read; - if (n_read == scan_len) { - break; - } - } - } - ASSERT_NO_FATAL_FAILURE(verify_range_scan(nullptr, - expected_table, - as_slice(scan_items_buffer.data(), n_read), - min_key, - scan_len)) - << BATT_INSPECT(i) << BATT_INSPECT_STR(min_key) << BATT_INSPECT(scan_len); + perform_range_scan(expected_table, + *root_ptr, + tree_options, + *page_loader, + min_key, + scan_len, + &scan_latency, + i); } page_loader.emplace(*page_cache); @@ -692,20 +687,27 @@ TEST(InMemoryNodeTest, SubtreeDeletions) }; LOG(INFO) << "Inserting key/value pairs into tree..."; - apply_tree_updates(create_insertion_batch, false); + apply_tree_updates(create_insertion_batch); LOG(INFO) << "Deleting key/value pairs from tree..."; - for (usize i = 0; i < total_batches; ++i) { - bool perform_scan = i == 0 ? true : false; - StatusOr tree_height = - tree.get_height(*page_loader, llfs::PageCacheOvercommit::not_allowed()); - ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); + StatusOr tree_height = + tree.get_height(*page_loader, llfs::PageCacheOvercommit::not_allowed()); + ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); + for (;;) { if (*tree_height > 0) { - apply_tree_updates(create_deletion_batch, perform_scan); + apply_tree_updates(create_deletion_batch); } else { break; } + tree_height = tree.get_height(*page_loader, llfs::PageCacheOvercommit::not_allowed()); + ASSERT_TRUE(tree_height.ok()) << BATT_INSPECT(tree_height); } + + LOG(INFO) << BATT_INSPECT(InMemoryNode::metrics().merge_latency); + + LOG(INFO) << BATT_INSPECT(InMemoryNode::metrics().merge_then_split_count); + + LOG(INFO) << BATT_INSPECT(scan_latency); } } // namespace diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 3811465..33839e1 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -245,8 +245,6 @@ Status Subtree::apply_batch_update(const TreeOptions& tree_options, return OkStatus(); }, [&](NeedsSplit needs_split) { - // TODO [vsilai 2025-12-09]: revist when VLDB changes are merged in. - // if (normal_flush_might_fix_root(needs_split)) { Status flush_status = new_subtree.try_flush(update_context); if (flush_status.ok() && batt::is_case(new_subtree.get_viability())) { diff --git a/src/turtle_kv/tree/subtree_viability.hpp b/src/turtle_kv/tree/subtree_viability.hpp index ca5ca37..93bb828 100644 --- a/src/turtle_kv/tree/subtree_viability.hpp +++ b/src/turtle_kv/tree/subtree_viability.hpp @@ -160,6 +160,8 @@ inline bool normal_flush_might_fix_root(const SubtreeViability& viability) }); } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// inline bool is_root_viable(const SubtreeViability& viability) { return batt::case_of( From 43e38f2045ef5d8437683dcc81e995edce60b8a1 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Mon, 23 Mar 2026 16:28:03 -0400 Subject: [PATCH 44/48] Fix some comments --- src/turtle_kv/tree/in_memory_node.hpp | 3 +-- src/turtle_kv/tree/subtree.hpp | 6 +----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index 76e7129..fca1665 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -202,8 +202,7 @@ struct InMemoryNode { */ void insert_pivot(i32 pivot_i, bool is_active); - /** \brief Removes a pivot bit in this->active_pivots and this->flushed_pivots at position - * `pivot_i`. + /** \brief Removes a bit in the active pivots bit set at position `pivot_i`. */ void remove_pivot(i32 pivot_i); diff --git a/src/turtle_kv/tree/subtree.hpp b/src/turtle_kv/tree/subtree.hpp index 4c16dbd..db4ee8d 100644 --- a/src/turtle_kv/tree/subtree.hpp +++ b/src/turtle_kv/tree/subtree.hpp @@ -159,11 +159,7 @@ class Subtree */ StatusOr> try_split(BatchUpdateContext& context); - /** \brief Attempts to merge the given Subtree in place with its right sibling. - * - * If the in place merge is successful, `sibling` is completely consumed and `None` is returned. - * - * If a borrow needs to occur, `this` is modified in place and the modified sibling is returned. + /** \brief Merges the Subtree in place with `sibling`. */ Status try_merge(BatchUpdateContext& context, Subtree&& sibling) noexcept; From dd18c3d06d7d9a223954b8efaa1524863f41d126 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 31 Mar 2026 10:27:24 -0400 Subject: [PATCH 45/48] Fix Subtree::flush_and_shrink loop logic --- src/turtle_kv/tree/subtree.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 33839e1..6b3eedf 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -323,11 +323,6 @@ Status Subtree::flush_and_shrink(BatchUpdateContext& context) noexcept return flush_status; } - SubtreeViability current_viability = this->get_viability(); - if (is_root_viable(current_viability)) { - break; - } - // Nothing was available to flush since the node's update buffer is empty. Try collapsing one // level of the tree. // @@ -622,7 +617,7 @@ Status Subtree::try_flush(BatchUpdateContext& context) }, [&](const std::unique_ptr& leaf [[maybe_unused]]) -> Status { - return {batt::StatusCode::kUnavailable}; + return OkStatus(); }, [&](const std::unique_ptr& node) -> Status { From f92f2498cead01ab379a2df4f7c7542513183109 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 31 Mar 2026 10:30:00 -0400 Subject: [PATCH 46/48] Small style fix --- src/turtle_kv/tree/subtree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 6b3eedf..93fd1a5 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -319,7 +319,7 @@ Status Subtree::flush_and_shrink(BatchUpdateContext& context) noexcept // First, try flushing. If flushing makes the root viable, return immediately. // Status flush_status = this->try_flush(context); - if (flush_status != OkStatus() && flush_status != batt::StatusCode::kUnavailable) { + if (!flush_status.ok() && flush_status != batt::StatusCode::kUnavailable) { return flush_status; } From d2621d79d9e6c300702ebc292a05d6217e70ae8b Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Tue, 31 Mar 2026 15:30:18 -0400 Subject: [PATCH 47/48] Some feedback changes --- src/turtle_kv/tree/active_pivots_set.hpp | 16 ++- src/turtle_kv/tree/in_memory_leaf.cpp | 6 +- src/turtle_kv/tree/in_memory_node.cpp | 125 +++++++++++++++-------- src/turtle_kv/tree/in_memory_node.hpp | 25 +++-- src/turtle_kv/tree/subtree.cpp | 12 +-- 5 files changed, 125 insertions(+), 59 deletions(-) diff --git a/src/turtle_kv/tree/active_pivots_set.hpp b/src/turtle_kv/tree/active_pivots_set.hpp index 1c06b42..4959410 100644 --- a/src/turtle_kv/tree/active_pivots_set.hpp +++ b/src/turtle_kv/tree/active_pivots_set.hpp @@ -158,11 +158,11 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> BATT_ALWAYS_INLINE auto printable() const { return [this](std::ostream& out) { - out << std::bitset<64>{this->bit_set_[1]} << "," << std::bitset<64>{this->bit_set_[1]}; + out << std::bitset<64>{this->bit_set_[1]} << "," << std::bitset<64>{this->bit_set_[0]}; }; } - /** \brief Removes the specified number (`count`) pivots from the bit set. + /** \brief Removes the specified number (`count`) pivots from the beginning of the bit set. */ BATT_ALWAYS_INLINE void pop_front_pivots(i32 count) { @@ -179,6 +179,8 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> } } + /** \brief Removes the specified number (`count`) pivots from the end of the bit set. + */ BATT_ALWAYS_INLINE void pop_back_pivots(i32 count) { if (count < 1) { @@ -193,6 +195,16 @@ class ActivePivotsSet128 : public ActivePivotsSetBase> this->bit_set_[0] = 0; } } + + /** \brief Performs a bit wise OR operation with another bit set. + */ + BATT_ALWAYS_INLINE Self& operator|=(const Self& other) + { + this->bit_set_[0] |= other.bit_set_[0]; + this->bit_set_[1] |= other.bit_set_[1]; + + return *this; + } }; static_assert(ActivePivotsSet); diff --git a/src/turtle_kv/tree/in_memory_leaf.cpp b/src/turtle_kv/tree/in_memory_leaf.cpp index 84ea245..3cf708a 100644 --- a/src/turtle_kv/tree/in_memory_leaf.cpp +++ b/src/turtle_kv/tree/in_memory_leaf.cpp @@ -204,12 +204,14 @@ Status InMemoryLeaf::try_merge(BatchUpdateContext& context, BATT_CHECK(sibling->result_set); if (sibling->result_set->empty()) { - BATT_CHECK(batt::is_case(this->get_viability())); + BATT_CHECK(batt::is_case(this->get_viability())) + << "Sibling leaf is not viable, so this leaf must be viable!"; return OkStatus(); } if (this->result_set->empty()) { - BATT_CHECK(batt::is_case(sibling->get_viability())); + BATT_CHECK(batt::is_case(sibling->get_viability())) + << "This leaf is not viable, so sibling leaf must be viable!"; this->pinned_leaf_page_ = std::move(sibling->pinned_leaf_page_); this->result_set = std::move(sibling->result_set); this->shared_edit_size_totals_ = sibling->shared_edit_size_totals_; diff --git a/src/turtle_kv/tree/in_memory_node.cpp b/src/turtle_kv/tree/in_memory_node.cpp index 32d3e50..a94bd6e 100644 --- a/src/turtle_kv/tree/in_memory_node.cpp +++ b/src/turtle_kv/tree/in_memory_node.cpp @@ -56,7 +56,7 @@ using PackedSegment = PackedUpdateBuffer::Segment; node->children.resize(pivot_count); node->child_pages.resize(pivot_count); node->pending_bytes.resize(pivot_count); - node->pending_bytes_is_exact = 0; + node->pending_bytes_is_exact = {}; node->pivot_keys_.resize(pivot_count + 1); node->max_key_ = packed_node.max_key(); node->common_key_prefix = packed_node.common_key_prefix(); @@ -526,8 +526,8 @@ Status InMemoryNode::flush_to_pivot(BatchUpdateContext& update_context, i32 pivo BATT_REQUIRE_OK(this->set_pivot_completely_flushed(pivot_i, pivot_key_range)); BATT_CHECK_EQ(trim_result.n_bytes_trimmed, 0); this->pending_bytes[pivot_i] = 0; - this->pending_bytes_is_exact = set_bit(this->pending_bytes_is_exact, pivot_i, true); - BATT_CHECK_EQ(get_bit(this->pending_bytes_is_exact, pivot_i), true); + this->pending_bytes_is_exact.set(pivot_i, true); + BATT_CHECK_EQ(this->pending_bytes_is_exact.get(pivot_i), true); return OkStatus(); } @@ -548,8 +548,8 @@ Status InMemoryNode::flush_to_pivot(BatchUpdateContext& update_context, i32 pivo << BATT_INSPECT(trim_result.n_bytes_trimmed) << BATT_INSPECT(child_update.get_byte_size()); this->pending_bytes[pivot_i] = trim_result.n_bytes_trimmed; - this->pending_bytes_is_exact = set_bit(this->pending_bytes_is_exact, pivot_i, true); - BATT_CHECK_EQ(get_bit(this->pending_bytes_is_exact, pivot_i), true); + this->pending_bytes_is_exact.set(pivot_i, true); + BATT_CHECK_EQ(this->pending_bytes_is_exact.get(pivot_i), true); // Recursively apply batch update. // @@ -681,18 +681,11 @@ Status InMemoryNode::split_child(BatchUpdateContext& update_context, i32 pivot_i .split_pivot(pivot_i, pivot_key_range, sibling_min_key); }, [&](HybridLevel& hybrid_level) -> Status { - for (auto& sub_level : hybrid_level.levels) { - if (batt::is_case(sub_level)) { - SegmentedLevel& segmented_sub_level = std::get(sub_level); - BATT_REQUIRE_OK(in_segmented_level(*this, - segmented_sub_level, - update_context.page_loader, - update_context.overcommit) // - .split_pivot(pivot_i, pivot_key_range, sibling_min_key)); - } - } - - return OkStatus(); + return hybrid_level.split_pivot(*this, + update_context, + pivot_i, + pivot_key_range, + sibling_min_key); })); } @@ -719,8 +712,8 @@ Status InMemoryNode::split_child(BatchUpdateContext& update_context, i32 pivot_i // The pending bytes counts for this pivot and its new sibling are not exact. // - this->pending_bytes_is_exact = set_bit(this->pending_bytes_is_exact, pivot_i, false); - this->pending_bytes_is_exact = insert_bit(this->pending_bytes_is_exact, sibling_i, false); + this->pending_bytes_is_exact.set(pivot_i, false); + this->pending_bytes_is_exact.insert(sibling_i, false); this->pending_bytes.insert(this->pending_bytes.begin() + sibling_i, sibling_pending_bytes); @@ -769,9 +762,7 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i // i32 sibling_i = pivot_i; - if (pivot_i == 0) { - sibling_i = pivot_i + 1; - } else if ((usize)pivot_i == this->pivot_count() - 1) { + if ((usize)pivot_i == this->pivot_count() - 1) { sibling_i = pivot_i - 1; } else { sibling_i = pivot_i + 1; @@ -796,17 +787,28 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i this->child_pages[left_pivot_i] = llfs::PinnedPage{}; this->child_pages.erase(this->child_pages.begin() + right_pivot_i); - // Update the update_buffer levels. + // Update the update_buffer levels. Note that we only have to do this for levels containing + // segments, since merging two pivots entails updating the active pivots bit set. // for (Level& level : this->update_buffer.levels) { - if (batt::is_case(level)) { - SegmentedLevel& segmented_level = std::get(level); - in_segmented_level(*this, - segmented_level, - update_context.page_loader, - update_context.overcommit) - .merge_pivots(left_pivot_i, right_pivot_i); - } + batt::case_of( + level, + [](EmptyLevel&) { + // nothing to do + }, + [](MergedLevel&) { + // nothing to do + }, + [&](SegmentedLevel& segmented_level) { + in_segmented_level(*this, + segmented_level, + update_context.page_loader, + update_context.overcommit) + .merge_pivots(left_pivot_i, right_pivot_i); + }, + [&](HybridLevel& hybrid_level) { + hybrid_level.merge_pivots(*this, update_context, left_pivot_i, right_pivot_i); + }); } // Update this->children. @@ -819,11 +821,10 @@ Status InMemoryNode::merge_child(BatchUpdateContext& update_context, i32 pivot_i this->pending_bytes[left_pivot_i] += this->pending_bytes[right_pivot_i]; this->pending_bytes.erase(this->pending_bytes.begin() + right_pivot_i); - bool is_pending_bytes_exact = get_bit(this->pending_bytes_is_exact, left_pivot_i) & - get_bit(this->pending_bytes_is_exact, right_pivot_i); - this->pending_bytes_is_exact = - set_bit(this->pending_bytes_is_exact, left_pivot_i, is_pending_bytes_exact); - this->pending_bytes_is_exact = remove_bit(this->pending_bytes_is_exact, right_pivot_i); + bool is_pending_bytes_exact = this->pending_bytes_is_exact.get(left_pivot_i) && + this->pending_bytes_is_exact.get(right_pivot_i); + this->pending_bytes_is_exact.set(left_pivot_i, is_pending_bytes_exact); + this->pending_bytes_is_exact.remove(right_pivot_i); // Remove the pivot key of the removed child subtree from this->pivot_keys_. // @@ -921,7 +922,7 @@ Status InMemoryNode::try_merge(BatchUpdateContext& context, sibling->pending_bytes.begin(), sibling->pending_bytes.end()); - sibling->pending_bytes_is_exact <<= this->pivot_count(); + sibling->pending_bytes_is_exact.pop_back_pivots(this->pivot_count()); this->pending_bytes_is_exact |= sibling->pending_bytes_is_exact; this->child_pages.insert(this->child_pages.end(), @@ -1343,7 +1344,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda SmallVec orig_child_pages = std::move(this->child_pages); SmallVec orig_pending_bytes = std::move(this->pending_bytes); KeyView orig_max_key = this->max_key_; - const u64 orig_pending_bytes_is_exact = this->pending_bytes_is_exact; + const ActivePivotsSet128 orig_pending_bytes_is_exact = this->pending_bytes_is_exact; auto reset_this_on_failure = batt::finally([&] { this->pivot_keys_ = std::move(orig_pivot_keys); @@ -1358,7 +1359,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda this->children.clear(); this->child_pages.clear(); this->pending_bytes.clear(); - this->pending_bytes_is_exact = u64{0}; + this->pending_bytes_is_exact = {}; BATT_CHECK_EQ(orig_pivot_count + 1, orig_pivot_keys.size()); @@ -1416,7 +1417,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda BATT_CHECK_EQ(split_pivot_i, lower_half_pivot_count); node_lower_half->pending_bytes.resize(lower_half_pivot_count); - node_lower_half->pending_bytes_is_exact = 0; + node_lower_half->pending_bytes_is_exact = {}; node_lower_half->child_pages.resize(lower_half_pivot_count); for (usize i = 0; i < lower_half_pivot_count; ++i) { @@ -1442,7 +1443,7 @@ StatusOr> InMemoryNode::try_split_direct(BatchUpda BATT_CHECK_EQ(split_pivot_i + upper_half_pivot_count, orig_children.size()); node_upper_half->pending_bytes.resize(upper_half_pivot_count); - node_upper_half->pending_bytes_is_exact = 0; + node_upper_half->pending_bytes_is_exact = {}; node_upper_half->child_pages.resize(upper_half_pivot_count); for (usize i = 0; i < upper_half_pivot_count; ++i) { @@ -2256,6 +2257,48 @@ void InMemoryNode::UpdateBuffer::HybridLevel::drop_after_pivot(i32 pivot_i, } } +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +Status InMemoryNode::UpdateBuffer::HybridLevel::split_pivot( + InMemoryNode& node, + BatchUpdateContext& update_context, + i32 pivot_i, + const Interval& pivot_key_range, + const KeyView& sibling_min_key) +{ + for (auto& sub_level : this->levels) { + if (batt::is_case(sub_level)) { + SegmentedLevel& segmented_sub_level = std::get(sub_level); + BATT_REQUIRE_OK(in_segmented_level(node, + segmented_sub_level, + update_context.page_loader, + update_context.overcommit) // + .split_pivot(pivot_i, pivot_key_range, sibling_min_key)); + } + } + + return OkStatus(); +} + +//==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - +// +void InMemoryNode::UpdateBuffer::HybridLevel::merge_pivots(InMemoryNode& node, + BatchUpdateContext& update_context, + i32 left_pivot_i, + i32 right_pivot_i) +{ + for (auto& sub_level : this->levels) { + if (batt::is_case(sub_level)) { + SegmentedLevel& segmented_sub_level = std::get(sub_level); + in_segmented_level(*this, + segmented_sub_level, + update_context.page_loader, + update_context.overcommit) + .merge_pivots(left_pivot_i, right_pivot_i); + } + } +} + //==#==========+==+=+=++=+++++++++++-+-+--+----- --- -- - - - - // Level InMemoryNode::UpdateBuffer::HybridLevel::merge(Level&& sibling_level) diff --git a/src/turtle_kv/tree/in_memory_node.hpp b/src/turtle_kv/tree/in_memory_node.hpp index fca1665..a9be968 100644 --- a/src/turtle_kv/tree/in_memory_node.hpp +++ b/src/turtle_kv/tree/in_memory_node.hpp @@ -364,12 +364,12 @@ struct InMemoryNode { */ usize segment_filter_cut_points() const; - /** \brief Merges this level with a "sibling" level from another node. - * + /** \brief Merges this level with a "sibling" level from another node. + * * This function is called when two nodes are being merged and their update buffers are * being merged as well. In this function, this level is the "left" level (i.e., the level - * comes from the left node in the merge) and `sibling_level` is the "right" level. - * + * comes from the left node in the merge) and `sibling_level` is the "right" level. + * * `node_pivot_count` is the number of pivots in the left node (i.e., the node that this * level exists in). */ @@ -525,6 +525,17 @@ struct InMemoryNode { llfs::PageLoader& page_loader, const TreeOptions& tree_options); + Status split_pivot(InMemoryNode& node, + BatchUpdateContext& update_context, + i32 pivot_i, + const Interval& pivot_key_range, + const KeyView& sibling_min_key); + + void merge_pivots(InMemoryNode& node, + BatchUpdateContext& update_context, + i32 left_pivot_i, + i32 right_pivot_i); + Level merge(Level&& sibling_level); StatusOr start_serialize(const InMemoryNode& node, TreeSerializeContext& context); @@ -564,7 +575,7 @@ struct InMemoryNode { SmallVec children; SmallVec child_pages; SmallVec pending_bytes; - u64 pending_bytes_is_exact = 0; + ActivePivotsSet128 pending_bytes_is_exact = {}; Optional latest_flush_pivot_i_; SmallVec pivot_keys_; KeyView max_key_; @@ -669,8 +680,8 @@ struct InMemoryNode { void add_pending_bytes(usize pivot_i, usize byte_count) { - this->pending_bytes_is_exact = set_bit(this->pending_bytes_is_exact, pivot_i, false); - BATT_CHECK_EQ(get_bit(this->pending_bytes_is_exact, pivot_i), false); + this->pending_bytes_is_exact.set(pivot_i, false); + BATT_CHECK_EQ(this->pending_bytes_is_exact.get(pivot_i), false); this->pending_bytes[pivot_i] += byte_count; } diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 93fd1a5..1f0365b 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -323,13 +323,10 @@ Status Subtree::flush_and_shrink(BatchUpdateContext& context) noexcept return flush_status; } - // Nothing was available to flush since the node's update buffer is empty. Try collapsing one - // level of the tree. + // Nothing was available to flush since either the node's update buffer is empty or we have + // a leaf root. If possible and necessary, try collapsing one level of the tree. // if (flush_status == batt::StatusCode::kUnavailable) { - // Note: At this point, we must have a node and not a leaf, since the `is_root_viable` check - // above will return Viable` for a leaf and we break out of the loop in that case. - // BATT_REQUIRE_OK(this->try_shrink()); } } @@ -593,9 +590,10 @@ Status Subtree::try_merge(BatchUpdateContext& context, Subtree&& sibling) noexce [&](auto& in_memory) -> Status { using PtrT = std::decay_t; - BATT_CHECK(batt::is_case(sibling.impl_)); + BATT_CHECK(batt::is_case(sibling.impl_)) + << "Sibling Subtree must be the same in-memory type as this Subtree!"; auto& sibling_ptr = std::get(sibling.impl_); - BATT_CHECK(sibling_ptr); + BATT_CHECK_NOT_NULLPTR(sibling_ptr); BATT_REQUIRE_OK(in_memory->try_merge(context, std::move(sibling_ptr))); From 0def57060bd9e884a9e6790c10b48a938e9f0e02 Mon Sep 17 00:00:00 2001 From: Vidya Silai Date: Wed, 1 Apr 2026 15:11:43 -0400 Subject: [PATCH 48/48] Some more Subtree updates --- src/turtle_kv/tree/subtree.cpp | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/turtle_kv/tree/subtree.cpp b/src/turtle_kv/tree/subtree.cpp index 1f0365b..67b9c9d 100644 --- a/src/turtle_kv/tree/subtree.cpp +++ b/src/turtle_kv/tree/subtree.cpp @@ -315,7 +315,9 @@ Status Subtree::flush_and_shrink(BatchUpdateContext& context) noexcept BATT_CHECK(!this->is_serialized()); - while (!is_root_viable(this->get_viability())) { + usize retries = 0; + + while (!is_root_viable(this->get_viability()) && retries < kMaxPivots) { // First, try flushing. If flushing makes the root viable, return immediately. // Status flush_status = this->try_flush(context); @@ -329,17 +331,14 @@ Status Subtree::flush_and_shrink(BatchUpdateContext& context) noexcept if (flush_status == batt::StatusCode::kUnavailable) { BATT_REQUIRE_OK(this->try_shrink()); } + + ++retries; } // If the root is a leaf and there are no items in the leaf, set the root to be an empty subtree. // if (batt::is_case>(this->impl_)) { - std::unique_ptr& root_leaf = std::get>(this->impl_); - BATT_CHECK(root_leaf); - - if (!root_leaf->get_item_count()) { - this->impl_ = llfs::PageIdSlot::from_page_id(llfs::PageId{}); - } + BATT_REQUIRE_OK(this->try_shrink()); } return OkStatus(); @@ -578,6 +577,13 @@ Status Subtree::try_merge(BatchUpdateContext& context, Subtree&& sibling) noexce { BATT_CHECK(!this->locked_.load()); + BATT_ASSIGN_OK_RESULT(i32 this_height, + this->get_height(context.page_loader, context.overcommit)); + BATT_ASSIGN_OK_RESULT(i32 sibling_height, + sibling.get_height(context.page_loader, context.overcommit)); + + BATT_CHECK_EQ(this_height, sibling_height); + return batt::case_of( this->impl_, @@ -637,7 +643,11 @@ Status Subtree::try_shrink() noexcept }, [&](const std::unique_ptr& leaf [[maybe_unused]]) -> StatusOr { - return {batt::StatusCode::kUnavailable}; + if (!leaf->get_item_count()) { + return llfs::PageIdSlot::from_page_id(llfs::PageId{}); + } + + return {std::move(*this)}; }, [&](const std::unique_ptr& node) -> StatusOr {