diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 330e6591856..5902c1024be 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -8551,6 +8551,29 @@ dependencies = [ "tracing-opentelemetry", ] +[[package]] +name = "quickwit-compaction" +version = "0.8.0" +dependencies = [ + "anyhow", + "async-trait", + "itertools 0.14.0", + "quickwit-actors", + "quickwit-common", + "quickwit-config", + "quickwit-doc-mapper", + "quickwit-indexing", + "quickwit-metastore", + "quickwit-metrics", + "quickwit-proto", + "quickwit-storage", + "serde_json", + "time", + "tokio", + "tracing", + "ulid", +] + [[package]] name = "quickwit-config" version = "0.8.0" @@ -8956,6 +8979,7 @@ dependencies = [ "mockall", "quickwit-actors", "quickwit-common", + "quickwit-compaction", "quickwit-config", "quickwit-doc-mapper", "quickwit-index-management", @@ -9330,6 +9354,7 @@ dependencies = [ "quickwit-actors", "quickwit-cluster", "quickwit-common", + "quickwit-compaction", "quickwit-config", "quickwit-control-plane", "quickwit-datafusion", diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 74b7f6069b9..e42092ed7ae 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -14,6 +14,7 @@ members = [ "quickwit-datetime", "quickwit-df-core", "quickwit-directories", + "quickwit-compaction", "quickwit-doc-mapper", "quickwit-dst", "quickwit-index-management", @@ -60,6 +61,7 @@ default-members = [ "quickwit-control-plane", "quickwit-datetime", "quickwit-directories", + "quickwit-compaction", "quickwit-doc-mapper", "quickwit-index-management", "quickwit-indexing", @@ -242,6 +244,7 @@ sea-query = { version = "0.32" } sea-query-binder = { version = "0.7", features = [ "runtime-tokio-rustls", "sqlx-postgres", + "postgres-array", ] } # ^1.0.184 due to serde-rs/serde#2538 serde = { version = "1.0.228", features = ["derive", "rc"] } @@ -372,6 +375,7 @@ quickwit-cluster = { path = "quickwit-cluster" } quickwit-codegen = { path = "quickwit-codegen" } quickwit-codegen-example = { path = "quickwit-codegen/example" } quickwit-common = { path = "quickwit-common" } +quickwit-compaction = { path = "quickwit-compaction" } quickwit-config = { path = "quickwit-config" } quickwit-control-plane = { path = "quickwit-control-plane" } quickwit-datafusion = { path = "quickwit-datafusion" } diff --git a/quickwit/quickwit-cli/src/tool.rs b/quickwit/quickwit-cli/src/tool.rs index 0fd6b1917ca..88bb53887bf 100644 --- a/quickwit/quickwit-cli/src/tool.rs +++ b/quickwit/quickwit-cli/src/tool.rs @@ -17,6 +17,7 @@ use std::io::{IsTerminal, Stdout, Write, stdout}; use std::num::NonZeroUsize; use std::path::PathBuf; use std::str::FromStr; +use std::sync::Arc; use std::time::{Duration, Instant}; use std::{env, fmt, io}; @@ -37,11 +38,11 @@ use quickwit_config::{ TransformConfig, VecSourceParams, }; use quickwit_index_management::{IndexService, clear_cache_directory}; -use quickwit_indexing::BoxedPipelineHandle; use quickwit_indexing::actors::{IndexingService, MergePipeline, MergeSchedulerService}; use quickwit_indexing::models::{ DetachIndexingPipeline, DetachMergePipeline, IndexingStatistics, SpawnPipeline, }; +use quickwit_indexing::{BoxedPipelineHandle, IndexingSplitCache}; use quickwit_ingest::IngesterPool; use quickwit_metastore::IndexMetadataResponseExt; use quickwit_proto::indexing::CpuCapacity; @@ -55,12 +56,12 @@ use quickwit_serve::{ }; use quickwit_storage::{BundleStorage, Storage}; use thousands::Separable; -use tracing::{debug, info}; +use tracing::debug; use crate::checklist::{GREEN_COLOR, RED_COLOR}; use crate::{ - THROUGHPUT_WINDOW_SIZE, config_cli_arg, get_resolvers, load_node_config, run_index_checklist, - start_actor_runtimes, + THROUGHPUT_WINDOW_SIZE, config_cli_arg, get_resolvers, info, load_node_config, + run_index_checklist, start_actor_runtimes, }; pub fn build_tool_command() -> Command { @@ -221,8 +222,8 @@ pub enum ToolCliCommand { GarbageCollect(GarbageCollectIndexArgs), LocalIngest(LocalIngestDocsArgs), LocalSearch(LocalSearchArgs), - Merge(MergeArgs), ExtractSplit(ExtractSplitArgs), + Merge(MergeArgs), } impl ToolCliCommand { @@ -448,6 +449,8 @@ pub async fn local_ingest_docs_cli(args: LocalIngestDocsArgs) -> anyhow::Result< )?; let universe = Universe::new(); let merge_scheduler_service_mailbox = universe.get_or_spawn_one(); + let split_cache = + Arc::new(IndexingSplitCache::from_config(&indexer_config, &config.data_dir_path).await?); let indexing_server = IndexingService::new( config.node_id.clone(), config.data_dir_path.clone(), @@ -456,10 +459,11 @@ pub async fn local_ingest_docs_cli(args: LocalIngestDocsArgs) -> anyhow::Result< cluster, metastore, None, - merge_scheduler_service_mailbox, + Some(merge_scheduler_service_mailbox), IngesterPool::default(), storage_resolver, EventBroker::default(), + split_cache, ) .await?; let (indexing_server_mailbox, indexing_server_handle) = @@ -493,11 +497,13 @@ pub async fn local_ingest_docs_cli(args: LocalIngestDocsArgs) -> anyhow::Result< let statistics = start_statistics_reporting_loop(indexing_pipeline_handle, args.input_path_opt.is_none()) .await?; + merge_pipeline_handle .mailbox() .ask(quickwit_indexing::FinishPendingMergesAndShutdownPipeline) .await?; merge_pipeline_handle.join().await; + // Shutdown the indexing server. universe .send_exit_with_success(&indexing_server_mailbox) @@ -593,10 +599,11 @@ pub async fn merge_cli(args: MergeArgs) -> anyhow::Result<()> { cluster, metastore, None, - merge_scheduler_service, + Some(merge_scheduler_service), IngesterPool::default(), storage_resolver, EventBroker::default(), + Arc::new(IndexingSplitCache::no_caching()), ) .await?; let (indexing_service_mailbox, indexing_service_handle) = diff --git a/quickwit/quickwit-compaction/Cargo.toml b/quickwit/quickwit-compaction/Cargo.toml new file mode 100644 index 00000000000..caf7fde7e1b --- /dev/null +++ b/quickwit/quickwit-compaction/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "quickwit-compaction" +description = "Compactor implementation and CompactionService" + +version.workspace = true +edition.workspace = true +homepage.workspace = true +documentation.workspace = true +repository.workspace = true +authors.workspace = true +license.workspace = true + +[dependencies] +anyhow = { workspace = true } +async-trait = { workspace = true } +quickwit-actors = { workspace = true } +quickwit-common = { workspace = true } +quickwit-config = { workspace = true } +quickwit-doc-mapper = { workspace = true } +quickwit-indexing = { workspace = true } +quickwit-metastore = { workspace = true } +quickwit-metrics = { workspace = true } +quickwit-proto = { workspace = true } +quickwit-storage = { workspace = true } +serde_json = { workspace = true } +time = { workspace = true } +tracing = { workspace = true } +tokio = { workspace = true } +ulid = { workspace = true } +itertools = "0.14.0" + +[dev-dependencies] +quickwit-actors = { workspace = true, features = ["testsuite"] } +quickwit-doc-mapper = { workspace = true, features = ["testsuite"] } +quickwit-metastore = { workspace = true, features = ["testsuite"] } +quickwit-proto = { workspace = true, features = ["testsuite"] } +quickwit-storage = { workspace = true, features = ["testsuite"] } diff --git a/quickwit/quickwit-compaction/src/compaction_pipeline.rs b/quickwit/quickwit-compaction/src/compaction_pipeline.rs new file mode 100644 index 00000000000..9fc2b261967 --- /dev/null +++ b/quickwit/quickwit-compaction/src/compaction_pipeline.rs @@ -0,0 +1,469 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; +use std::time::Instant; + +use quickwit_actors::{ActorHandle, HEARTBEAT, Health, QueueCapacity, SpawnContext, Supervisable}; +use quickwit_common::KillSwitch; +use quickwit_common::io::{IoControls, Limiter}; +use quickwit_common::pubsub::EventBroker; +use quickwit_common::temp_dir::TempDirectory; +use quickwit_config::RetentionPolicy; +use quickwit_doc_mapper::DocMapper; +use quickwit_indexing::actors::{ + MergeExecutor, MergeSplitDownloader, Packager, Publisher, Uploader, UploaderType, +}; +use quickwit_indexing::merge_policy::MergeOperation; +use quickwit_indexing::{IndexingSplitStore, SplitsUpdateMailbox}; +use quickwit_metrics::{counter, gauge, histogram, label_values}; +use quickwit_proto::indexing::MergePipelineId; +use quickwit_proto::metastore::MetastoreServiceClient; +use quickwit_proto::types::{IndexUid, SourceId, SplitId}; +use tokio::sync::Semaphore; +use tracing::{error, info}; + +use crate::metrics::{ + COMPACTION_DURATION, COMPACTIONS_FAILED, COMPACTIONS_IN_PROGRESS, COMPACTIONS_SUCCEEDED, + SOURCE_UID_MERGE_LEVEL, +}; +use crate::{TaskId, source_uid_metrics_label}; + +#[derive(Clone, Debug, PartialEq)] +pub enum PipelineStatus { + InProgress, + Completed, + Failed { error: String }, +} + +impl PipelineStatus { + pub fn is_terminal(&self) -> bool { + matches!( + self, + PipelineStatus::Completed | PipelineStatus::Failed { .. } + ) + } +} + +pub struct PipelineStatusUpdate { + pub task_id: TaskId, + pub index_uid: IndexUid, + pub source_id: SourceId, + pub split_ids: Vec, + pub status: PipelineStatus, +} + +struct CompactionPipelineHandles { + merge_split_downloader: ActorHandle, + merge_executor: ActorHandle, + merge_packager: ActorHandle, + merge_uploader: ActorHandle, + merge_publisher: ActorHandle, + next_check_for_progress: Instant, +} + +impl CompactionPipelineHandles { + /// Returns true once per HEARTBEAT interval. Between heartbeats we only + /// check whether actors are alive, not whether they are making progress. + /// This gives CPU-intensive actors (like the packager) up to 30s to + /// complete before being declared unhealthy. + fn should_check_for_progress(&mut self) -> bool { + let now = Instant::now(); + let check_for_progress = now > self.next_check_for_progress; + if check_for_progress { + self.next_check_for_progress = now + *HEARTBEAT; + } + check_for_progress + } +} + +/// A single-use compaction pipeline. Processes one merge task and terminates. +/// +/// Owned by the `CompactorSupervisor`, which periodically calls +/// `check_actor_health()` to collect status updates. The pipeline manages +/// its own retry logic internally. +pub struct CompactionPipeline { + task_id: TaskId, + merge_operation: MergeOperation, + pipeline_id: MergePipelineId, + status: PipelineStatus, + kill_switch: KillSwitch, + scratch_directory: TempDirectory, + handles: Option, + doc_mapper: Arc, + merge_policy: Arc, + retention_policy: Option, + metastore: MetastoreServiceClient, + split_store: IndexingSplitStore, + io_throughput_limiter: Option, + max_concurrent_split_uploads: usize, + event_broker: EventBroker, + merge_execution_semaphore: Arc, + pipeline_start: Option, +} + +impl CompactionPipeline { + #[allow(clippy::too_many_arguments)] + pub fn new( + task_id: TaskId, + scratch_directory: TempDirectory, + merge_operation: MergeOperation, + pipeline_id: MergePipelineId, + doc_mapper: Arc, + merge_policy: Arc, + retention_policy: Option, + metastore: MetastoreServiceClient, + split_store: IndexingSplitStore, + io_throughput_limiter: Option, + max_concurrent_split_uploads: usize, + event_broker: EventBroker, + merge_execution_semaphore: Arc, + ) -> Self { + CompactionPipeline { + task_id, + status: PipelineStatus::InProgress, + kill_switch: KillSwitch::default(), + scratch_directory, + handles: None, + merge_operation, + pipeline_id, + doc_mapper, + merge_policy, + retention_policy, + metastore, + split_store, + io_throughput_limiter, + max_concurrent_split_uploads, + event_broker, + merge_execution_semaphore, + pipeline_start: None, + } + } + + pub fn status(&self) -> &PipelineStatus { + &self.status + } + + fn supervisables(&self) -> Vec<&dyn Supervisable> { + let Some(handles) = &self.handles else { + return Vec::new(); + }; + vec![ + &handles.merge_split_downloader, + &handles.merge_executor, + &handles.merge_packager, + &handles.merge_uploader, + &handles.merge_publisher, + ] + } + + /// Returns pipeline status update by checking the health of individual pipeline actors. + /// + /// If the pipeline is already completed or failed (terminal status), returns the status + /// without re-checking actors. The pipeline sits in this "completed" state (finished or failed) + /// until the supervisor cleans up the spot in favor of a new pipeline. This is done to + /// ensure that the planner acks the success/failure. + pub fn pipeline_status_update(&mut self) -> PipelineStatusUpdate { + self.update_status(); + self.build_status_update() + } + + fn update_status(&mut self) { + // Pipeline is finished but yet to be cleaned up. + if matches!( + self.status, + PipelineStatus::Completed | PipelineStatus::Failed { .. } + ) { + return; + } + let Some(handles) = self.handles.as_mut() else { + return; + }; + + // We check whether actors are alive on every tick (1s), but only + // check whether they are making progress once per HEARTBEAT (30s). + // This gives CPU-intensive actors like the packager time to finish + // without being falsely declared unhealthy. + let check_for_progress = handles.should_check_for_progress(); + + let mut has_healthy = false; + let mut failure_actor_names: Vec = Vec::new(); + + for supervisable in self.supervisables() { + match supervisable.check_health(check_for_progress) { + Health::Healthy => { + has_healthy = true; + } + Health::FailureOrUnhealthy => { + failure_actor_names.push(supervisable.name().to_string()); + } + Health::Success => {} + } + } + + if !failure_actor_names.is_empty() { + self.record_pipeline_duration(); + let error_msg = format!("failed actors: {:?}", failure_actor_names); + error!(task_id=%self.task_id, "{error_msg}"); + self.status = PipelineStatus::Failed { error: error_msg }; + self.record_terminal_metrics(false); + return; + } + if !has_healthy { + self.record_pipeline_duration(); + info!(task_id=%self.task_id, "all compaction pipeline actors completed"); + self.status = PipelineStatus::Completed; + self.record_terminal_metrics(true); + } + } + + /// Fires once when the pipeline transitions from InProgress to a terminal state. + /// Pairs with the `COMPACTIONS_IN_PROGRESS.inc()` in `CompactorSupervisor::spawn_task`. + fn record_terminal_metrics(&self, succeeded: bool) { + let merge_level = self.merge_operation.merge_level().to_string(); + let index_label = + source_uid_metrics_label(&self.pipeline_id.index_uid, &self.pipeline_id.source_id); + let labels = label_values!(SOURCE_UID_MERGE_LEVEL => index_label, merge_level); + gauge!(parent: COMPACTIONS_IN_PROGRESS, labels: [labels.clone()]).dec(); + if succeeded { + counter!(parent: COMPACTIONS_SUCCEEDED, labels: [labels]).inc(); + } else { + counter!(parent: COMPACTIONS_FAILED, labels: [labels]).inc(); + } + } + + fn record_pipeline_duration(&self) { + if let Some(pipeline_start_time) = self.pipeline_start { + let elapsed = pipeline_start_time.elapsed().as_secs_f64(); + let merge_level = self.merge_operation.merge_level().to_string(); + let index_label = + source_uid_metrics_label(&self.pipeline_id.index_uid, &self.pipeline_id.source_id); + let labels = label_values!(SOURCE_UID_MERGE_LEVEL => index_label, merge_level); + histogram!(parent: COMPACTION_DURATION, labels: [labels]).observe(elapsed); + } + } + + fn build_status_update(&self) -> PipelineStatusUpdate { + PipelineStatusUpdate { + task_id: self.task_id.clone(), + index_uid: self.pipeline_id.index_uid.clone(), + source_id: self.pipeline_id.source_id.clone(), + split_ids: self + .merge_operation + .splits_as_slice() + .iter() + .map(|split| split.split_id().to_string()) + .collect(), + status: self.status.clone(), + } + } + + /// Spawns the 5-actor merge execution chain and sends the `MergeOperation` + /// to the downloader to kick off execution. + pub(crate) fn spawn_pipeline(&mut self, spawn_ctx: &SpawnContext) -> anyhow::Result<()> { + info!( + task_id=%self.task_id, + pipeline_id=%self.pipeline_id, + "spawning compaction pipeline" + ); + + // Publisher (no merge planner feedback, no source) + let merge_publisher = Publisher::new( + quickwit_indexing::MERGE_PUBLISHER_NAME, + QueueCapacity::Unbounded, + self.metastore.clone(), + None, + None, + ); + let (merge_publisher_mailbox, merge_publisher_handle) = spawn_ctx + .spawn_builder() + .set_kill_switch(self.kill_switch.child()) + .spawn(merge_publisher); + + // Uploader + let merge_uploader = Uploader::new( + UploaderType::MergeUploader, + self.metastore.clone(), + self.merge_policy.clone(), + self.retention_policy.clone(), + self.split_store.clone(), + SplitsUpdateMailbox::from(merge_publisher_mailbox), + self.max_concurrent_split_uploads, + self.event_broker.clone(), + ); + let (merge_uploader_mailbox, merge_uploader_handle) = spawn_ctx + .spawn_builder() + .set_kill_switch(self.kill_switch.child()) + .spawn(merge_uploader); + + // Packager + let tag_fields = self.doc_mapper.tag_named_fields()?; + let merge_packager = Packager::new("MergePackager", tag_fields, merge_uploader_mailbox); + let (merge_packager_mailbox, merge_packager_handle) = spawn_ctx + .spawn_builder() + .set_kill_switch(self.kill_switch.child()) + .spawn(merge_packager); + + // MergeExecutor + let split_downloader_io_controls = IoControls::default() + .set_throughput_limiter_opt(self.io_throughput_limiter.clone()) + .set_component("split_downloader_merge"); + let merge_executor_io_controls = + split_downloader_io_controls.clone().set_component("merger"); + + let merge_executor = MergeExecutor::new( + self.pipeline_id.clone(), + self.metastore.clone(), + self.doc_mapper.clone(), + merge_executor_io_controls, + merge_packager_mailbox, + Some(self.merge_execution_semaphore.clone()), + ); + let (merge_executor_mailbox, merge_executor_handle) = spawn_ctx + .spawn_builder() + .set_kill_switch(self.kill_switch.child()) + .spawn(merge_executor); + + // MergeSplitDownloader + let merge_split_downloader = MergeSplitDownloader { + scratch_directory: self.scratch_directory.clone(), + split_store: self.split_store.clone(), + executor_mailbox: merge_executor_mailbox, + io_controls: split_downloader_io_controls, + }; + let (merge_split_downloader_mailbox, merge_split_downloader_handle) = spawn_ctx + .spawn_builder() + .set_kill_switch(self.kill_switch.child()) + .spawn(merge_split_downloader); + + let now = Instant::now(); + self.pipeline_start = Some(now); + // Kick off the pipeline. + merge_split_downloader_mailbox + .try_send_message(self.merge_operation.clone()) + .map_err(|err| { + anyhow::anyhow!("failed to send merge operation to downloader: {err:?}") + })?; + + self.handles = Some(CompactionPipelineHandles { + merge_split_downloader: merge_split_downloader_handle, + merge_executor: merge_executor_handle, + merge_packager: merge_packager_handle, + merge_uploader: merge_uploader_handle, + merge_publisher: merge_publisher_handle, + next_check_for_progress: Instant::now() + *HEARTBEAT, + }); + + Ok(()) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::sync::Arc; + + use quickwit_actors::Universe; + use quickwit_common::pubsub::EventBroker; + use quickwit_common::temp_dir::TempDirectory; + use quickwit_doc_mapper::default_doc_mapper_for_test; + use quickwit_indexing::IndexingSplitStore; + use quickwit_indexing::merge_policy::{MergeOperation, default_merge_policy}; + use quickwit_metastore::SplitMetadata; + use quickwit_proto::indexing::MergePipelineId; + use quickwit_proto::metastore::{MetastoreServiceClient, MockMetastoreService}; + use quickwit_proto::types::{IndexUid, NodeId}; + use quickwit_storage::RamStorage; + use tokio::sync::Semaphore; + + use super::{CompactionPipeline, PipelineStatus}; + + pub fn test_pipeline(task_id: &str, split_ids: &[&str]) -> CompactionPipeline { + let storage = Arc::new(RamStorage::default()); + let split_store = IndexingSplitStore::create_without_local_store_for_test(storage); + let metastore = MetastoreServiceClient::from_mock(MockMetastoreService::new()); + let splits: Vec = split_ids + .iter() + .map(|id| SplitMetadata::for_test(id.to_string())) + .collect(); + let merge_operation = MergeOperation::new_merge_operation(splits); + let pipeline_id = MergePipelineId { + node_id: NodeId::from("test-node"), + index_uid: IndexUid::for_test("test-index", 0), + source_id: "test-source".to_string(), + }; + CompactionPipeline::new( + task_id.to_string(), + TempDirectory::for_test(), + merge_operation, + pipeline_id, + Arc::new(default_doc_mapper_for_test()), + default_merge_policy(), + None, + metastore, + split_store, + None, + 2, + EventBroker::default(), + Arc::new(Semaphore::new(2)), + ) + } + + #[tokio::test] + async fn test_spawn_pipeline_creates_handles() { + let universe = Universe::new(); + let mut pipeline = test_pipeline("task-1", &["split-1", "split-2"]); + assert!(pipeline.handles.is_none()); + pipeline.spawn_pipeline(universe.spawn_ctx()).unwrap(); + assert!(pipeline.handles.is_some()); + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_status_update_unspawned_pipeline() { + let mut pipeline = test_pipeline("task-1", &["split-1"]); + let update = pipeline.pipeline_status_update(); + assert_eq!(update.status, PipelineStatus::InProgress); + assert_eq!(update.task_id, "task-1"); + assert_eq!(update.split_ids, vec!["split-1"]); + assert_eq!(update.source_id, "test-source"); + assert_eq!(update.index_uid, IndexUid::for_test("test-index", 0)); + } + + #[tokio::test] + async fn test_status_update_healthy_pipeline() { + let universe = Universe::new(); + let mut pipeline = test_pipeline("task-1", &["split-1"]); + pipeline.spawn_pipeline(universe.spawn_ctx()).unwrap(); + let update = pipeline.pipeline_status_update(); + assert_eq!(update.status, PipelineStatus::InProgress); + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_killed_pipeline_fails() { + let universe = Universe::new(); + let mut pipeline = test_pipeline("task-1", &["split-1"]); + pipeline.spawn_pipeline(universe.spawn_ctx()).unwrap(); + + pipeline.kill_switch.kill(); + tokio::task::yield_now().await; + let update = pipeline.pipeline_status_update(); + assert!(matches!(update.status, PipelineStatus::Failed { .. })); + + // Calling again still returns Failed (sticky). + let update = pipeline.pipeline_status_update(); + assert!(matches!(update.status, PipelineStatus::Failed { .. })); + universe.assert_quit().await; + } +} diff --git a/quickwit/quickwit-compaction/src/compactor_supervisor.rs b/quickwit/quickwit-compaction/src/compactor_supervisor.rs new file mode 100644 index 00000000000..128770b02f5 --- /dev/null +++ b/quickwit/quickwit-compaction/src/compactor_supervisor.rs @@ -0,0 +1,653 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + +use async_trait::async_trait; +use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, SpawnContext}; +use quickwit_common::io::{self, Limiter}; +use quickwit_common::pubsub::EventBroker; +use quickwit_common::temp_dir::TempDirectory; +use quickwit_common::uri::Uri; +use quickwit_config::{ + CompactorConfig, IndexingSettings, RetentionPolicy, SearchSettings, build_doc_mapper, +}; +use quickwit_doc_mapper::DocMapping; +use quickwit_indexing::merge_policy::{MergeOperation, merge_policy_from_settings}; +use quickwit_indexing::{IndexingSplitCache, IndexingSplitStore}; +use quickwit_metastore::SplitMetadata; +use quickwit_metrics::{gauge, label_values}; +use quickwit_proto::compaction::{ + CompactionFailure, CompactionInProgress, CompactionPlannerService, + CompactionPlannerServiceClient, CompactionSuccess, MergeTaskAssignment, ReportStatusRequest, +}; +use quickwit_proto::indexing::MergePipelineId; +use quickwit_proto::metastore::MetastoreServiceClient; +use quickwit_proto::types::NodeId; +use quickwit_storage::StorageResolver; +use tokio::sync::Semaphore; +use tracing::{error, info}; + +use crate::compaction_pipeline::{CompactionPipeline, PipelineStatus, PipelineStatusUpdate}; +use crate::metrics::{AVAILABLE_SLOTS, COMPACTIONS_IN_PROGRESS, SOURCE_UID_MERGE_LEVEL}; +use crate::source_uid_metrics_label; + +const CHECK_PIPELINE_STATUSES_INTERVAL: Duration = Duration::from_secs(1); + +#[derive(Debug)] +struct CheckPipelineStatuses; + +/// Manages a pool of `CompactionPipeline`s, each executing a single merge task. +/// +/// Periodically collects pipeline status updates and forwards them to the +/// compaction planner. Pipelines manage their own retry logic internally. +pub struct CompactorSupervisor { + node_id: NodeId, + planner_client: CompactionPlannerServiceClient, + pipelines: Vec>, + // Bounds the number of pipelines running Tantivy merge step concurrently. There are 2x as many + // pipeline slots as permits, so half the pipelines can be doing IO while the other half hold a + // permit. + merge_execution_semaphore: Arc, + // Shared resources distributed to pipelines when spawning actor chains. + io_throughput_limiter: Option, + metastore: MetastoreServiceClient, + storage_resolver: StorageResolver, + split_cache: Arc, + max_concurrent_split_uploads: usize, + event_broker: EventBroker, + // Scratch directory root (/compaction/). + compaction_root_directory: TempDirectory, +} + +impl CompactorSupervisor { + #[allow(clippy::too_many_arguments)] + pub fn new( + node_id: NodeId, + planner_client: CompactionPlannerServiceClient, + compactor_config: &CompactorConfig, + metastore: MetastoreServiceClient, + storage_resolver: StorageResolver, + split_cache: Arc, + event_broker: EventBroker, + compaction_root_directory: TempDirectory, + ) -> Self { + let &CompactorConfig { + max_concurrent_merge_executions, + pipeline_slots_per_merge_execution, + max_concurrent_split_uploads, + max_merge_write_throughput, + } = compactor_config; + let num_pipeline_slots = + max_concurrent_merge_executions.get() * pipeline_slots_per_merge_execution.get(); + let pipelines = (0..num_pipeline_slots).map(|_| None).collect(); + let merge_execution_semaphore = + Arc::new(Semaphore::new(max_concurrent_merge_executions.get())); + let io_throughput_limiter = max_merge_write_throughput.map(io::limiter); + CompactorSupervisor { + node_id, + planner_client, + pipelines, + merge_execution_semaphore, + io_throughput_limiter, + metastore, + storage_resolver, + split_cache, + max_concurrent_split_uploads, + event_broker, + compaction_root_directory, + } + } + + fn check_pipeline_statuses(&mut self) -> Vec { + let mut statuses = Vec::new(); + for slot in &mut self.pipelines { + let Some(pipeline) = slot else { + continue; + }; + statuses.push(pipeline.pipeline_status_update()); + } + statuses + } + + async fn process_new_tasks( + &mut self, + assignments: Vec, + spawn_ctx: &SpawnContext, + ) { + // The planner has acknowledged the statuses we just sent — drop any + // Completed/Failed pipelines so their scratch directories and actor + // handles are released. + for slot in &mut self.pipelines { + slot.take_if(|p| p.status().is_terminal()); + } + for assignment in assignments { + let task_id = assignment.task_id.clone(); + if let Err(error) = self.spawn_task(assignment, spawn_ctx).await { + error!(%task_id, %error, "failed to spawn compaction task"); + } + } + let available_slots = self + .pipelines + .iter() + .filter(|pipeline_opt| pipeline_opt.is_none()) + .count(); + AVAILABLE_SLOTS.set(available_slots as f64); + } + + async fn spawn_task( + &mut self, + assignment: MergeTaskAssignment, + spawn_ctx: &SpawnContext, + ) -> anyhow::Result<()> { + let source_uid_label = source_uid_metrics_label( + assignment.index_uid.as_ref().unwrap(), + &assignment.source_id, + ); + let merge_level = assignment.merge_level.to_string(); + let labels = label_values!(SOURCE_UID_MERGE_LEVEL => source_uid_label, merge_level); + + let slot_idx = self + .pipelines + .iter() + .position(Option::is_none) + .ok_or_else(|| anyhow::anyhow!("no free pipeline slot"))?; + let scratch_directory = self + .compaction_root_directory + .named_temp_child(&assignment.task_id)?; + let mut pipeline = self + .build_compaction_pipeline(assignment, scratch_directory) + .await?; + pipeline.spawn_pipeline(spawn_ctx)?; + self.pipelines[slot_idx] = Some(pipeline); + gauge!(parent: COMPACTIONS_IN_PROGRESS, labels: [labels]).inc(); + Ok(()) + } + + async fn build_compaction_pipeline( + &self, + assignment: MergeTaskAssignment, + scratch_directory: TempDirectory, + ) -> anyhow::Result { + let splits: Vec = assignment + .splits_metadata_json + .iter() + .map(|json| serde_json::from_str(json)) + .collect::, serde_json::Error>>()?; + let doc_mapping: DocMapping = serde_json::from_str(&assignment.doc_mapping_json)?; + let search_settings: SearchSettings = + serde_json::from_str(&assignment.search_settings_json)?; + let indexing_settings: IndexingSettings = + serde_json::from_str(&assignment.indexing_settings_json)?; + let retention_policy: Option = + if assignment.retention_policy_json.is_empty() { + None + } else { + Some(serde_json::from_str(&assignment.retention_policy_json)?) + }; + let index_uid = assignment + .index_uid + .ok_or_else(|| anyhow::anyhow!("missing index_uid in MergeTaskAssignment"))?; + + let index_storage_uri = Uri::from_str(&assignment.index_storage_uri)?; + let index_storage = self.storage_resolver.resolve(&index_storage_uri).await?; + let split_store = IndexingSplitStore::new(index_storage, self.split_cache.clone()); + + let doc_mapper = build_doc_mapper(&doc_mapping, &search_settings)?; + let merge_policy = merge_policy_from_settings(&indexing_settings); + let merge_operation = MergeOperation::new_merge_operation(splits); + let pipeline_id = MergePipelineId { + node_id: self.node_id.clone(), + index_uid, + source_id: assignment.source_id, + }; + + Ok(CompactionPipeline::new( + assignment.task_id, + scratch_directory, + merge_operation, + pipeline_id, + doc_mapper, + merge_policy, + retention_policy, + self.metastore.clone(), + split_store, + self.io_throughput_limiter.clone(), + self.max_concurrent_split_uploads, + self.event_broker.clone(), + self.merge_execution_semaphore.clone(), + )) + } + + fn build_report_status_request( + &self, + statuses: &[PipelineStatusUpdate], + ) -> ReportStatusRequest { + let in_progress_count = statuses + .iter() + .filter(|s| matches!(s.status, PipelineStatus::InProgress)) + .count(); + let available_slots = (self.pipelines.len() - in_progress_count) as i64; + + let mut in_progress = Vec::new(); + let mut successes = Vec::new(); + let mut failures = Vec::new(); + + for update in statuses { + match &update.status { + PipelineStatus::InProgress => { + in_progress.push(CompactionInProgress { + task_id: update.task_id.clone(), + index_uid: Some(update.index_uid.clone()), + source_id: update.source_id.clone(), + split_ids: update.split_ids.clone(), + }); + } + PipelineStatus::Completed => { + successes.push(CompactionSuccess { + task_id: update.task_id.clone(), + }); + } + PipelineStatus::Failed { error } => { + failures.push(CompactionFailure { + task_id: update.task_id.clone(), + error_message: error.clone(), + }); + } + } + } + + ReportStatusRequest { + node_id: self.node_id.to_string(), + available_slots: available_slots as u32, + in_progress, + successes, + failures, + } + } +} + +#[async_trait] +impl Actor for CompactorSupervisor { + type ObservableState = (); + + fn name(&self) -> String { + "CompactorSupervisor".to_string() + } + + fn observable_state(&self) -> Self::ObservableState {} + + async fn initialize(&mut self, ctx: &ActorContext) -> Result<(), ActorExitStatus> { + info!( + num_pipeline_slots=%self.pipelines.len(), + "compactor supervisor started" + ); + ctx.schedule_self_msg(CHECK_PIPELINE_STATUSES_INTERVAL, CheckPipelineStatuses); + Ok(()) + } +} + +#[async_trait] +impl Handler for CompactorSupervisor { + type Reply = (); + + async fn handle( + &mut self, + _msg: CheckPipelineStatuses, + ctx: &ActorContext, + ) -> Result<(), ActorExitStatus> { + let statuses = self.check_pipeline_statuses(); + let request = self.build_report_status_request(&statuses); + match ctx + .protect_future(self.planner_client.report_status(request)) + .await + { + Ok(response) => { + ctx.protect_future(self.process_new_tasks(response.new_tasks, ctx.spawn_ctx())) + .await; + } + Err(error) => { + error!(%error, "failed to report status to compaction planner"); + } + } + ctx.schedule_self_msg(CHECK_PIPELINE_STATUSES_INTERVAL, CheckPipelineStatuses); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::num::NonZeroUsize; + + use quickwit_actors::Universe; + use quickwit_common::temp_dir::TempDirectory; + use quickwit_proto::compaction::{ + CompactionPlannerServiceClient, MockCompactionPlannerService, + }; + use quickwit_proto::metastore::{MetastoreServiceClient, MockMetastoreService}; + use quickwit_proto::types::NodeId; + use quickwit_storage::StorageResolver; + + use super::*; + use crate::compaction_pipeline::tests::test_pipeline; + + /// Builds a test supervisor with `max_concurrent_merge_executions` permits + /// and `2 * max_concurrent_merge_executions` pipeline slots. + fn test_supervisor(max_concurrent_merge_executions: usize) -> CompactorSupervisor { + let metastore = MetastoreServiceClient::from_mock(MockMetastoreService::new()); + let compaction_client = + CompactionPlannerServiceClient::from_mock(MockCompactionPlannerService::new()); + let compactor_config = CompactorConfig { + max_concurrent_merge_executions: NonZeroUsize::new(max_concurrent_merge_executions) + .expect("max_concurrent_merge_executions must be non-zero"), + ..CompactorConfig::for_test() + }; + CompactorSupervisor::new( + NodeId::from("test-node"), + compaction_client, + &compactor_config, + metastore, + StorageResolver::for_test(), + Arc::new(IndexingSplitCache::no_caching()), + EventBroker::default(), + TempDirectory::for_test(), + ) + } + + #[test] + fn test_check_pipeline_statuses_empty_slots() { + let mut supervisor = test_supervisor(4); + let statuses = supervisor.check_pipeline_statuses(); + assert!(statuses.is_empty()); + } + + #[tokio::test] + async fn test_check_pipeline_statuses_with_pipelines() { + let universe = Universe::new(); + let mut supervisor = test_supervisor(4); + + let mut pipeline = test_pipeline("task-1", &["split-a", "split-b"]); + pipeline.spawn_pipeline(universe.spawn_ctx()).unwrap(); + supervisor.pipelines[0] = Some(pipeline); + + let mut pipeline = test_pipeline("task-2", &["split-c"]); + pipeline.spawn_pipeline(universe.spawn_ctx()).unwrap(); + supervisor.pipelines[2] = Some(pipeline); + + let statuses = supervisor.check_pipeline_statuses(); + assert_eq!(statuses.len(), 2); + assert_eq!(statuses[0].task_id, "task-1"); + assert_eq!(statuses[0].split_ids, vec!["split-a", "split-b"]); + assert_eq!(statuses[1].task_id, "task-2"); + assert_eq!(statuses[1].split_ids, vec!["split-c"]); + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_end_to_end_statuses_to_proto() { + let universe = Universe::new(); + // 3 merge-executions → 6 pipeline slots + let mut supervisor = test_supervisor(3); + + let mut pipeline = test_pipeline("task-1", &["s1", "s2"]); + pipeline.spawn_pipeline(universe.spawn_ctx()).unwrap(); + supervisor.pipelines[0] = Some(pipeline); + + let statuses = supervisor.check_pipeline_statuses(); + let request = supervisor.build_report_status_request(&statuses); + + assert_eq!(request.node_id, "test-node"); + // 6 slots, 1 in-progress = 5 available + assert_eq!(request.available_slots, 5); + assert_eq!(request.in_progress.len(), 1); + assert_eq!(request.in_progress[0].task_id, "task-1"); + assert_eq!(request.in_progress[0].split_ids, vec!["s1", "s2"]); + assert!(request.successes.is_empty()); + assert!(request.failures.is_empty()); + universe.assert_quit().await; + } + + #[test] + fn test_build_report_status_request_empty() { + // 4 merge-executions → 8 pipeline slots + let supervisor = test_supervisor(4); + let request = supervisor.build_report_status_request(&[]); + assert_eq!(request.node_id, "test-node"); + assert_eq!(request.available_slots, 8); + assert!(request.in_progress.is_empty()); + assert!(request.successes.is_empty()); + assert!(request.failures.is_empty()); + } + + fn test_assignment(task_id: &str, split_ids: &[&str]) -> MergeTaskAssignment { + let index_metadata = + quickwit_metastore::IndexMetadata::for_test("test-index", "ram:///test-index"); + let config = &index_metadata.index_config; + let splits: Vec = split_ids + .iter() + .map(|id| SplitMetadata::for_test(id.to_string())) + .collect(); + MergeTaskAssignment { + task_id: task_id.to_string(), + splits_metadata_json: splits + .iter() + .map(|s| serde_json::to_string(s).unwrap()) + .collect(), + doc_mapping_json: serde_json::to_string(&config.doc_mapping).unwrap(), + search_settings_json: serde_json::to_string(&config.search_settings).unwrap(), + indexing_settings_json: serde_json::to_string(&config.indexing_settings).unwrap(), + retention_policy_json: String::new(), + index_uid: Some(index_metadata.index_uid.clone()), + source_id: "test-source".to_string(), + index_storage_uri: config.index_uri.to_string(), + merge_level: 1, + } + } + + #[tokio::test] + async fn test_build_compaction_pipeline_deserialization_errors() { + let supervisor = test_supervisor(4); + let scratch = TempDirectory::for_test; + + // Bad splits JSON. + let mut assignment = test_assignment("t", &["s1"]); + assignment.splits_metadata_json = vec!["not json".to_string()]; + assert!( + supervisor + .build_compaction_pipeline(assignment, scratch()) + .await + .is_err() + ); + + // Bad doc mapping JSON. + let mut assignment = test_assignment("t", &["s1"]); + assignment.doc_mapping_json = "not json".to_string(); + assert!( + supervisor + .build_compaction_pipeline(assignment, scratch()) + .await + .is_err() + ); + + // Bad search settings JSON. + let mut assignment = test_assignment("t", &["s1"]); + assignment.search_settings_json = "not json".to_string(); + assert!( + supervisor + .build_compaction_pipeline(assignment, scratch()) + .await + .is_err() + ); + + // Bad indexing settings JSON. + let mut assignment = test_assignment("t", &["s1"]); + assignment.indexing_settings_json = "not json".to_string(); + assert!( + supervisor + .build_compaction_pipeline(assignment, scratch()) + .await + .is_err() + ); + + // Bad retention policy JSON (non-empty but invalid). + let mut assignment = test_assignment("t", &["s1"]); + assignment.retention_policy_json = "not json".to_string(); + assert!( + supervisor + .build_compaction_pipeline(assignment, scratch()) + .await + .is_err() + ); + + // Missing index_uid. + let mut assignment = test_assignment("t", &["s1"]); + assignment.index_uid = None; + assert!( + supervisor + .build_compaction_pipeline(assignment, scratch()) + .await + .is_err() + ); + } + + #[tokio::test] + async fn test_spawn_task_fails_when_all_slots_occupied() { + let universe = Universe::new(); + // 1 merge-execution → 2 pipeline slots + let mut supervisor = test_supervisor(1); + + supervisor + .spawn_task(test_assignment("task-1", &["s1"]), universe.spawn_ctx()) + .await + .unwrap(); + supervisor + .spawn_task(test_assignment("task-2", &["s2"]), universe.spawn_ctx()) + .await + .unwrap(); + + // Both slots InProgress — no room. + assert!( + supervisor + .spawn_task(test_assignment("task-3", &["s3"]), universe.spawn_ctx()) + .await + .is_err() + ); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_end_to_end_report_status_and_spawn() { + let universe = Universe::new(); + + // Mock planner that returns one assignment on the first call. + let assignment = test_assignment("planner-task-1", &["s1", "s2"]); + let assignments = vec![assignment]; + let assignments_clone = assignments.clone(); + let mut mock = MockCompactionPlannerService::new(); + mock.expect_report_status().times(1).returning(move |_req| { + Ok(quickwit_proto::compaction::ReportStatusResponse { + new_tasks: assignments_clone.clone(), + }) + }); + + let metastore = MetastoreServiceClient::from_mock(MockMetastoreService::new()); + let client = CompactionPlannerServiceClient::from_mock(mock); + // 3 merge-executions → 6 pipeline slots + let compactor_config = CompactorConfig { + max_concurrent_merge_executions: NonZeroUsize::new(3).unwrap(), + ..CompactorConfig::for_test() + }; + let mut supervisor = CompactorSupervisor::new( + NodeId::from("test-node"), + client, + &compactor_config, + metastore, + StorageResolver::for_test(), + Arc::new(IndexingSplitCache::no_caching()), + EventBroker::default(), + TempDirectory::for_test(), + ); + + // Simulate what the handler does: collect statuses, report, process response. + let statuses = supervisor.check_pipeline_statuses(); + let request = supervisor.build_report_status_request(&statuses); + assert_eq!(request.available_slots, 6); + + let response = supervisor + .planner_client + .report_status(request) + .await + .unwrap(); + supervisor + .process_new_tasks(response.new_tasks, universe.spawn_ctx()) + .await; + + // Verify the pipeline was spawned. + let statuses = supervisor.check_pipeline_statuses(); + let request = supervisor.build_report_status_request(&statuses); + assert_eq!(request.in_progress.len(), 1); + assert_eq!(request.in_progress[0].task_id, "planner-task-1"); + assert_eq!(request.in_progress[0].split_ids.len(), 2); + assert_eq!(request.available_slots, 5); + + universe.assert_quit().await; + } + + #[test] + fn test_build_report_status_request_mixed_statuses() { + // 4 merge-executions → 8 pipeline slots + let supervisor = test_supervisor(4); + let statuses = vec![ + PipelineStatusUpdate { + task_id: "task-1".to_string(), + index_uid: quickwit_proto::types::IndexUid::for_test("test-index", 0), + source_id: "src".to_string(), + split_ids: vec!["s1".to_string(), "s2".to_string()], + status: PipelineStatus::InProgress, + }, + PipelineStatusUpdate { + task_id: "task-2".to_string(), + index_uid: quickwit_proto::types::IndexUid::for_test("test-index", 0), + source_id: "src".to_string(), + split_ids: vec!["s3".to_string()], + status: PipelineStatus::Completed, + }, + PipelineStatusUpdate { + task_id: "task-3".to_string(), + index_uid: quickwit_proto::types::IndexUid::for_test("test-index", 0), + source_id: "src".to_string(), + split_ids: vec!["s4".to_string()], + status: PipelineStatus::Failed { + error: "boom".to_string(), + }, + }, + ]; + + let request = supervisor.build_report_status_request(&statuses); + + // 8 slots, 1 in-progress = 7 available + assert_eq!(request.available_slots, 7); + assert_eq!(request.in_progress.len(), 1); + assert_eq!(request.in_progress[0].task_id, "task-1"); + assert_eq!(request.in_progress[0].split_ids, vec!["s1", "s2"]); + assert_eq!(request.successes.len(), 1); + assert_eq!(request.successes[0].task_id, "task-2"); + assert_eq!(request.failures.len(), 1); + assert_eq!(request.failures[0].task_id, "task-3"); + assert_eq!(request.failures[0].error_message, "boom"); + } +} diff --git a/quickwit/quickwit-compaction/src/lib.rs b/quickwit/quickwit-compaction/src/lib.rs new file mode 100644 index 00000000000..ada03eb1294 --- /dev/null +++ b/quickwit/quickwit-compaction/src/lib.rs @@ -0,0 +1,67 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#![deny(clippy::disallowed_methods)] + +mod compaction_pipeline; +mod compactor_supervisor; +mod metrics; +pub mod planner; + +pub type TaskId = String; + +use std::sync::Arc; + +pub use compactor_supervisor::CompactorSupervisor; +use quickwit_actors::{Mailbox, Universe}; +use quickwit_common::pubsub::EventBroker; +use quickwit_common::temp_dir::TempDirectory; +use quickwit_config::CompactorConfig; +use quickwit_indexing::IndexingSplitCache; +use quickwit_proto::compaction::CompactionPlannerServiceClient; +use quickwit_proto::metastore::MetastoreServiceClient; +use quickwit_proto::types::{IndexUid, NodeId, SourceId}; +use quickwit_storage::StorageResolver; +use tracing::info; + +pub fn source_uid_metrics_label(index_uid: &IndexUid, source_id: &SourceId) -> String { + format!("{}-{}", index_uid, source_id) +} + +#[allow(clippy::too_many_arguments)] +pub async fn start_compactor_service( + universe: &Universe, + node_id: NodeId, + compaction_client: CompactionPlannerServiceClient, + compactor_config: &CompactorConfig, + metastore: MetastoreServiceClient, + storage_resolver: StorageResolver, + split_cache: Arc, + event_broker: EventBroker, + compaction_root_directory: TempDirectory, +) -> anyhow::Result> { + info!("starting compactor service"); + let supervisor = CompactorSupervisor::new( + node_id, + compaction_client, + compactor_config, + metastore, + storage_resolver, + split_cache, + event_broker, + compaction_root_directory, + ); + let (mailbox, _handle) = universe.spawn_builder().spawn(supervisor); + Ok(mailbox) +} diff --git a/quickwit/quickwit-compaction/src/metrics.rs b/quickwit/quickwit-compaction/src/metrics.rs new file mode 100644 index 00000000000..64243204e91 --- /dev/null +++ b/quickwit/quickwit-compaction/src/metrics.rs @@ -0,0 +1,53 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use quickwit_common::metrics::exponential_buckets; +use quickwit_metrics::{ + LabelNames, LazyCounter, LazyGauge, LazyHistogram, label_names, lazy_counter, lazy_gauge, + lazy_histogram, +}; + +/// Shared label template for the per-compaction metrics keyed by source and merge level. +pub(crate) const SOURCE_UID_MERGE_LEVEL: LabelNames<2> = label_names!("source_uid", "merge_level"); + +pub(crate) static COMPACTIONS_IN_PROGRESS: LazyGauge = lazy_gauge!( + name: "compactions_in_progress", + description: "number of compaction merge operations currently running on this compactor", + subsystem: "compactor", +); + +pub(crate) static COMPACTIONS_FAILED: LazyCounter = lazy_counter!( + name: "compactions_failed", + description: "total number of compaction merge operations that have failed", + subsystem: "compactor", +); + +pub(crate) static COMPACTIONS_SUCCEEDED: LazyCounter = lazy_counter!( + name: "compactions_succeeded", + description: "total number of compaction merge operations that have completed successfully", + subsystem: "compactor", +); + +pub(crate) static AVAILABLE_SLOTS: LazyGauge = lazy_gauge!( + name: "available_slots", + description: "number of compaction slots currently available on this compactor", + subsystem: "compactor", +); + +pub(crate) static COMPACTION_DURATION: LazyHistogram = lazy_histogram!( + name: "compaction_duration_seconds", + description: "duration of compaction merge operations in seconds", + subsystem: "compactor", + buckets: exponential_buckets(0.5, 2.0, 14).expect("compaction duration buckets should be valid"), +); diff --git a/quickwit/quickwit-compaction/src/planner/compaction_planner.rs b/quickwit/quickwit-compaction/src/planner/compaction_planner.rs new file mode 100644 index 00000000000..77546099715 --- /dev/null +++ b/quickwit/quickwit-compaction/src/planner/compaction_planner.rs @@ -0,0 +1,643 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::Debug; +use std::time::Duration; + +use anyhow::Result; +use async_trait::async_trait; +use itertools::Itertools; +use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler}; +use quickwit_metastore::{ + ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt, Split, SplitState, +}; +use quickwit_metrics::{counter, label_values}; +use quickwit_proto::compaction::{ + CompactionResult, MergeTaskAssignment, ReportStatusRequest, ReportStatusResponse, +}; +use quickwit_proto::metastore::{ListSplitsRequest, MetastoreService, MetastoreServiceClient}; +use quickwit_proto::types::NodeId; +use time::OffsetDateTime; +use tracing::{error, info}; +use ulid::Ulid; + +use super::PendingMerge; +use super::compaction_state::CompactionState; +use super::index_config_metastore::{IndexConfigMetastore, IndexEntry}; +use crate::planner::metrics::{METASTORE_ERRORS, NEW_SPLITS_SCANNED, OPERATION, SOURCE_UID}; + +/// Cap on splits fetched per tick. Every tick, the planner re-scans the immature published set, +/// sorted by `maturity_timestamp` ASC so the most-urgent splits are processed first when a backlog +/// exists. Splits beyond this cap aren't lost -- they bubble into range as the front of the queue +/// is merged off. +const SCAN_PAGE_SIZE: usize = 5_000; + +/// Cap on the size of the `excluded_split_ids` list we send to the metastore. +/// It's a sanity max rather than some invariant. +const MAX_EXCLUDED_SPLIT_IDS: usize = 50_000; + +#[derive(Debug)] +pub struct CompactionPlanner { + state: CompactionState, + index_config_metastore: IndexConfigMetastore, + metastore: MetastoreServiceClient, +} + +const SCAN_AND_PLAN_INTERVAL: Duration = Duration::from_secs(5); +/// On initialization, we want to wait for two intervals to allow any in-progress workers to report +/// their progress, preventing us from frivolously rescheduling work. +const INITIAL_SCAN_AND_PLAN_INTERVAL: Duration = SCAN_AND_PLAN_INTERVAL.saturating_mul(2); + +#[derive(Debug)] +struct ScanAndPlan; + +#[async_trait] +impl Actor for CompactionPlanner { + type ObservableState = (); + + fn name(&self) -> String { + "CompactionPlanner".to_string() + } + + fn observable_state(&self) -> Self::ObservableState {} + + async fn initialize(&mut self, ctx: &ActorContext) -> Result<(), ActorExitStatus> { + info!("compaction planner starting, scanning metastore for immature splits"); + ctx.schedule_self_msg(INITIAL_SCAN_AND_PLAN_INTERVAL, ScanAndPlan); + Ok(()) + } +} + +#[async_trait] +impl Handler for CompactionPlanner { + type Reply = (); + + async fn handle( + &mut self, + _msg: ScanAndPlan, + ctx: &ActorContext, + ) -> Result<(), ActorExitStatus> { + if let Err(error) = ctx.protect_future(self.scan_and_plan()).await { + error!(%error, "error scanning metastore and planning merges"); + } + self.state.check_heartbeat_timeouts(); + ctx.schedule_self_msg(SCAN_AND_PLAN_INTERVAL, ScanAndPlan); + Ok(()) + } +} + +#[async_trait] +impl Handler for CompactionPlanner { + type Reply = CompactionResult; + + async fn handle( + &mut self, + msg: ReportStatusRequest, + _ctx: &ActorContext, + ) -> Result, ActorExitStatus> { + let node_id = NodeId::from(msg.node_id); + self.state.process_successes(&msg.successes); + self.state.process_failures(&msg.failures); + self.state.update_heartbeats(&node_id, &msg.in_progress); + let new_tasks = self.assign_tasks(&node_id, msg.available_slots); + Ok(Ok(ReportStatusResponse { new_tasks })) + } +} + +impl CompactionPlanner { + pub fn new(metastore: MetastoreServiceClient) -> Self { + CompactionPlanner { + state: CompactionState::new(), + index_config_metastore: IndexConfigMetastore::new(metastore.clone()), + metastore, + } + } + + async fn ingest_splits(&mut self, splits: Vec) { + for split in splits { + if self.state.is_split_tracked(&split.split_metadata.split_id) { + continue; + } + let Ok(index_entry) = self + .index_config_metastore + .get_for_split(&split.split_metadata) + .await + else { + error!(split_id=%split.split_metadata.split_id, "failed to load index config, skipping split"); + continue; + }; + if index_entry.is_split_mature(&split.split_metadata) { + continue; + } + self.state.track_split(split.split_metadata); + } + } + + async fn scan_metastore(&self) -> Result> { + let excluded_split_ids = self.state.tracked_split_ids(MAX_EXCLUDED_SPLIT_IDS); + let query = ListSplitsQuery::for_all_indexes() + .with_split_state(SplitState::Published) + .retain_immature(OffsetDateTime::now_utc()) + .sort_by_maturity_timestamp() + .with_limit(SCAN_PAGE_SIZE) + .with_excluded_split_ids(excluded_split_ids); + let request = ListSplitsRequest::try_from_list_splits_query(&query)?; + let splits = self + .metastore + .list_splits(request) + .await + .inspect_err(|error| { + error!(%error, "[compaction-planner] error calling metastore list_splits"); + let labels = label_values!(OPERATION => "scan"); + counter!(parent: METASTORE_ERRORS, labels: [labels]).inc(); + })? + .collect_splits() + .await + .inspect_err(|error| { + error!(%error, "[compaction-planner] error collecting metastore splits"); + let labels = label_values!(OPERATION => "collect_splits"); + counter!(parent: METASTORE_ERRORS, labels: [labels]).inc(); + })?; + emit_metastore_scan_metrics(&splits); + Ok(splits) + } + + async fn scan_and_plan(&mut self) -> Result<()> { + let splits = self.scan_metastore().await?; + self.ingest_splits(splits).await; + self.run_merge_policies(); + self.state.emit_metrics(); + Ok(()) + } + + fn run_merge_policies(&mut self) { + for partition_key in self.state.partition_keys() { + if let Some(index_entry) = self.index_config_metastore.get(&partition_key.index_uid) { + self.state + .plan_partition(&partition_key, index_entry.merge_policy()); + } + } + } + + fn assign_tasks(&mut self, node_id: &NodeId, available_slots: u32) -> Vec { + let pending_merge_ops = self.state.pop_pending(available_slots as usize); + let mut assignments = Vec::with_capacity(pending_merge_ops.len()); + + for merge_op in pending_merge_ops { + let task_id = Ulid::new().to_string(); + let Some(index_entry) = self.index_config_metastore.get(&merge_op.index_uid) else { + error!(index_uid=%merge_op.index_uid, "index config not found for pending operation, skipping"); + continue; + }; + let assignment = build_task_assignment(&task_id, index_entry, &merge_op); + + let split_ids = merge_op + .operation + .splits_as_slice() + .iter() + .map(|s| s.split_id().to_string()) + .collect(); + self.state + .record_assignment(task_id, split_ids, node_id.clone()); + + assignments.push(assignment); + } + assignments + } +} + +fn emit_metastore_scan_metrics(new_splits: &[Split]) { + let size = new_splits.len(); + info!(%size, "[compaction planner] new splits scanned from metastore"); + let counts = new_splits + .iter() + .counts_by(|split| &split.split_metadata.index_uid); + for (&index_uid, &count) in counts.iter() { + let labels = label_values!(SOURCE_UID => index_uid.to_string()); + counter!(parent: NEW_SPLITS_SCANNED, labels: [labels]).inc_by(count as u64); + } +} + +fn build_task_assignment( + task_id: &str, + index_entry: &IndexEntry, + merge_op: &PendingMerge, +) -> MergeTaskAssignment { + MergeTaskAssignment { + task_id: task_id.to_string(), + splits_metadata_json: merge_op + .operation + .splits_as_slice() + .iter() + .map(|s| { + serde_json::to_string(s).expect("split metadata serialization should not fail") + }) + .collect(), + doc_mapping_json: index_entry.doc_mapping_json(), + search_settings_json: index_entry.search_settings_json(), + indexing_settings_json: index_entry.indexing_settings_json(), + retention_policy_json: index_entry.retention_policy_json(), + index_uid: Some(merge_op.index_uid.clone()), + source_id: merge_op.source_id.clone(), + index_storage_uri: index_entry.index_storage_uri(), + merge_level: merge_op.operation.merge_level() as u64, + } +} + +#[cfg(test)] +mod tests { + use std::ops::Bound; + use std::time::Duration; + + use quickwit_common::ServiceStream; + use quickwit_config::IndexingSettings; + use quickwit_config::merge_policy_config::{ + ConstWriteAmplificationMergePolicyConfig, MergePolicyConfig, + }; + use quickwit_metastore::{ + IndexMetadata, IndexMetadataResponseExt, ListSplitsRequestExt, ListSplitsResponseExt, + SortBy, Split, SplitMaturity, SplitMetadata, SplitState, + }; + use quickwit_proto::compaction::{CompactionInProgress, CompactionSuccess}; + use quickwit_proto::metastore::{ + IndexMetadataResponse, ListSplitsResponse, MetastoreError, MockMetastoreService, + }; + use quickwit_proto::types::IndexUid; + use time::OffsetDateTime; + + use super::*; + + fn test_split(split_id: &str, index_uid: &IndexUid, update_timestamp: i64) -> Split { + Split { + split_state: SplitState::Published, + update_timestamp, + publish_timestamp: Some(update_timestamp), + split_metadata: SplitMetadata { + split_id: split_id.to_string(), + index_uid: index_uid.clone(), + source_id: "test-source".to_string(), + node_id: "test-node".to_string(), + num_docs: 100, + create_timestamp: OffsetDateTime::now_utc().unix_timestamp(), + maturity: SplitMaturity::Immature { + maturation_period: Duration::from_secs(3600), + }, + ..Default::default() + }, + } + } + + fn test_index_metadata() -> IndexMetadata { + IndexMetadata::for_test("test-index", "ram:///test-index") + } + + /// Returns an IndexMetadata with merge_factor=2 so two splits trigger a merge. + fn test_index_metadata_with_merge_factor_2() -> IndexMetadata { + let mut metadata = test_index_metadata(); + metadata.index_config.indexing_settings = IndexingSettings { + merge_policy: MergePolicyConfig::ConstWriteAmplification( + ConstWriteAmplificationMergePolicyConfig { + merge_factor: 2, + max_merge_factor: 2, + ..Default::default() + }, + ), + ..Default::default() + }; + metadata + } + + fn test_index_metadata_response(index_metadata: &IndexMetadata) -> IndexMetadataResponse { + IndexMetadataResponse::try_from_index_metadata(index_metadata).unwrap() + } + + #[tokio::test] + async fn test_scan_metastore_query_shape_and_passthrough() { + let index_uid = IndexUid::for_test("test-index", 0); + let returned_splits = vec![ + test_split("a", &index_uid, 1000), + test_split("b", &index_uid, 2000), + ]; + let returned_clone = returned_splits.clone(); + let scan_started_at = OffsetDateTime::now_utc().unix_timestamp(); + + let mut mock = MockMetastoreService::new(); + mock.expect_list_splits().returning(move |req| { + let query = req.deserialize_list_splits_query().unwrap(); + + assert_eq!(query.split_states, vec![SplitState::Published]); + assert_eq!(query.limit, Some(SCAN_PAGE_SIZE)); + assert_eq!(query.sort_by, SortBy::MaturityTimestamp); + + let Bound::Excluded(mature_at) = query.mature else { + panic!("expected Excluded mature bound, got {:?}", query.mature); + }; + let now_secs = OffsetDateTime::now_utc().unix_timestamp(); + assert!(mature_at.unix_timestamp() >= scan_started_at); + assert!(mature_at.unix_timestamp() <= now_secs); + + assert_eq!(query.update_timestamp.start, Bound::Unbounded); + assert_eq!(query.update_timestamp.end, Bound::Unbounded); + + // No tracked splits → empty exclude list. The non-empty case is + // covered by `test_scan_metastore_excludes_tracked_splits`. + assert!(query.excluded_split_ids.is_empty()); + + let response = ListSplitsResponse::try_from_splits(returned_clone.clone()).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); + + let planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + let result = planner.scan_metastore().await.unwrap(); + + assert_eq!(result.len(), 2); + assert_eq!(result[0].split_metadata.split_id, "a"); + assert_eq!(result[1].split_metadata.split_id, "b"); + } + + #[tokio::test] + async fn test_scan_metastore_excludes_tracked_splits() { + let index_uid = IndexUid::for_test("test-index", 0); + + let mut mock = MockMetastoreService::new(); + mock.expect_list_splits().returning(move |req| { + let query = req.deserialize_list_splits_query().unwrap(); + let mut excluded = query.excluded_split_ids.clone(); + excluded.sort(); + assert_eq!( + excluded, + vec!["in-flight".to_string(), "tracked".to_string()] + ); + + let response = ListSplitsResponse::try_from_splits(Vec::new()).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); + + let mut planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + planner.state.track_split(SplitMetadata { + split_id: "tracked".to_string(), + index_uid: index_uid.clone(), + ..Default::default() + }); + planner.state.update_heartbeats( + &NodeId::from("test-node"), + &[CompactionInProgress { + task_id: "t-1".to_string(), + index_uid: Some(index_uid.clone()), + source_id: "src".to_string(), + split_ids: vec!["in-flight".to_string()], + }], + ); + + planner.scan_metastore().await.unwrap(); + } + + #[tokio::test] + async fn test_ingest_splits_dedups_and_skips_mature() { + let index_metadata = test_index_metadata(); + let response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata() + .returning(move |_| Ok(response.clone())); + + let mut planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + + // Pre-populate: "in-flight" is already being compacted. + planner.state.track_split(SplitMetadata { + split_id: "in-flight".to_string(), + index_uid: index_uid.clone(), + ..Default::default() + }); + + let mut mature_split = test_split("mature", &index_uid, 4000); + mature_split.split_metadata.num_docs = 20_000_000; + + let splits = vec![ + test_split("in-flight", &index_uid, 1000), + test_split("fresh", &index_uid, 3000), + mature_split, + ]; + + planner.ingest_splits(splits).await; + + assert!(planner.state.is_split_tracked("fresh")); + assert!(planner.state.is_split_tracked("in-flight")); + assert!(!planner.state.is_split_tracked("mature")); + } + + #[tokio::test] + async fn test_scan_and_plan_propagates_metastore_error() { + let mut mock = MockMetastoreService::new(); + mock.expect_list_splits().returning(|_| { + Err(MetastoreError::Internal { + message: "test error".to_string(), + cause: String::new(), + }) + }); + + let mut planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + assert!(planner.scan_and_plan().await.is_err()); + } + + #[tokio::test] + async fn test_ingest_splits_skips_on_config_error() { + let index_uid = IndexUid::for_test("missing-index", 0); + let splits = vec![test_split("orphan", &index_uid, 1000)]; + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata().returning(|_| { + Err(MetastoreError::Internal { + message: "test error".to_string(), + cause: String::new(), + }) + }); + + let mut planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + planner.ingest_splits(splits).await; + + assert!(!planner.state.is_split_tracked("orphan")); + } + + #[tokio::test] + async fn test_scan_and_plan_happy_path() { + let index_metadata = test_index_metadata(); + let index_metadata_response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + + let splits = vec![ + test_split("s1", &index_uid, 5000), + test_split("s2", &index_uid, 6000), + ]; + let splits_clone = splits.clone(); + + let mut mock = MockMetastoreService::new(); + mock.expect_list_splits().returning(move |_| { + let response = ListSplitsResponse::try_from_splits(splits_clone.clone()).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); + mock.expect_index_metadata() + .returning(move |_| Ok(index_metadata_response.clone())); + + let mut planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + planner.scan_and_plan().await.unwrap(); + + assert!(planner.state.is_split_tracked("s1")); + assert!(planner.state.is_split_tracked("s2")); + } + + #[tokio::test] + async fn test_failed_task_is_retracked_on_next_scan() { + // After a worker reports failure (or times out), planner-local + // tracking is cleared. Because there is no cursor, the next scan + // rediscovers the still-Published, still-immature splits and + // re-tracks them. + let index_metadata = test_index_metadata_with_merge_factor_2(); + let index_metadata_response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + + let splits = vec![ + test_split("s1", &index_uid, 1000), + test_split("s2", &index_uid, 2000), + ]; + let splits_clone = splits.clone(); + + let mut mock = MockMetastoreService::new(); + mock.expect_list_splits().returning(move |_| { + let response = ListSplitsResponse::try_from_splits(splits_clone.clone()).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); + mock.expect_index_metadata() + .returning(move |_| Ok(index_metadata_response.clone())); + + let mut planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + let node_id = NodeId::from("worker-1"); + + planner.scan_and_plan().await.unwrap(); + let assignments = planner.assign_tasks(&node_id, 10); + assert_eq!(assignments.len(), 1); + let task_id = assignments[0].task_id.clone(); + assert!(planner.state.is_split_tracked("s1")); + assert!(planner.state.is_split_tracked("s2")); + + // Worker reports failure; planner forgets the splits. + planner + .state + .process_failures(&[quickwit_proto::compaction::CompactionFailure { + task_id, + error_message: "boom".to_string(), + }]); + assert!(!planner.state.is_split_tracked("s1")); + assert!(!planner.state.is_split_tracked("s2")); + + // Next scan rediscovers them and re-tracks them. + planner.scan_and_plan().await.unwrap(); + assert!(planner.state.is_split_tracked("s1")); + assert!(planner.state.is_split_tracked("s2")); + } + + /// Helper: creates a planner with merge_factor=2, ingests the given splits, + /// and runs merge policies. Returns the planner ready for `assign_tasks`. + async fn planner_with_pending_merges(split_ids: &[&str]) -> (CompactionPlanner, IndexUid) { + let index_metadata = test_index_metadata_with_merge_factor_2(); + let index_uid = index_metadata.index_uid.clone(); + let response = test_index_metadata_response(&index_metadata); + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata() + .returning(move |_| Ok(response.clone())); + mock.expect_list_splits().returning(|_| { + Ok(ServiceStream::from(vec![Ok( + ListSplitsResponse::try_from_splits(Vec::new()).unwrap(), + )])) + }); + + let mut planner = CompactionPlanner::new(MetastoreServiceClient::from_mock(mock)); + + let splits: Vec = split_ids + .iter() + .enumerate() + .map(|(i, id)| test_split(id, &index_uid, (i + 1) as i64 * 1000)) + .collect(); + planner.ingest_splits(splits).await; + planner.run_merge_policies(); + (planner, index_uid) + } + + #[tokio::test] + async fn test_assign_tasks_returns_assignments_and_drains_queue() { + let (mut planner, index_uid) = planner_with_pending_merges(&["s1", "s2"]).await; + let node_id = NodeId::from("worker-1"); + + // First call: get the assignment. + let assignments = planner.assign_tasks(&node_id, 10); + assert_eq!(assignments.len(), 1); + + let assignment = &assignments[0]; + assert!(!assignment.task_id.is_empty()); + assert_eq!(assignment.splits_metadata_json.len(), 2); + assert_eq!(assignment.index_uid, Some(index_uid)); + assert_eq!(assignment.source_id, "test-source"); + assert!(!assignment.doc_mapping_json.is_empty()); + assert!(!assignment.index_storage_uri.is_empty()); + + // Second call: queue is drained, no more assignments. + let assignments = planner.assign_tasks(&node_id, 10); + assert!(assignments.is_empty()); + } + + #[tokio::test] + async fn test_assign_tasks_respects_available_slots() { + // 4 splits with merge_factor=2 produces 2 merge operations. + let (mut planner, _) = planner_with_pending_merges(&["s1", "s2", "s3", "s4"]).await; + let node_id = NodeId::from("worker-1"); + + // Request only 1 slot. + let assignments = planner.assign_tasks(&node_id, 1); + assert_eq!(assignments.len(), 1); + + // The remaining operation is still pending. + let assignments = planner.assign_tasks(&node_id, 10); + assert_eq!(assignments.len(), 1); + } + + #[tokio::test] + async fn test_report_status_success_frees_splits_for_future_merges() { + let (mut planner, index_uid) = planner_with_pending_merges(&["s1", "s2"]).await; + let node_id = NodeId::from("worker-1"); + + let assignments = planner.assign_tasks(&node_id, 10); + assert_eq!(assignments.len(), 1); + let task_id = assignments[0].task_id.clone(); + + // Report success for the task. + planner + .state + .process_successes(&[CompactionSuccess { task_id }]); + + // The original splits are no longer tracked. Re-ingesting them + // (simulating the merged output being immature) creates new work. + let new_splits = vec![ + test_split("s5", &index_uid, 5000), + test_split("s6", &index_uid, 6000), + ]; + planner.ingest_splits(new_splits).await; + planner.run_merge_policies(); + + let assignments = planner.assign_tasks(&node_id, 10); + assert_eq!(assignments.len(), 1); + assert_eq!(assignments[0].splits_metadata_json.len(), 2); + } +} diff --git a/quickwit/quickwit-compaction/src/planner/compaction_state.rs b/quickwit/quickwit-compaction/src/planner/compaction_state.rs new file mode 100644 index 00000000000..59113070da8 --- /dev/null +++ b/quickwit/quickwit-compaction/src/planner/compaction_state.rs @@ -0,0 +1,467 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; +use std::fmt::Debug; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use itertools::Itertools; +use quickwit_indexing::merge_policy::MergePolicy; +use quickwit_metastore::SplitMetadata; +use quickwit_metrics::{gauge, label_values}; +use quickwit_proto::compaction::{CompactionFailure, CompactionInProgress, CompactionSuccess}; +use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId, SourceId, SplitId}; +use tracing::{error, info, warn}; + +use crate::planner::metrics::{SOURCE_UID, SPLITS_NEEDING_COMPACTION, TIMED_OUT_OPERATIONS}; +use crate::planner::{PendingMerge, PendingOperations}; +use crate::{TaskId, source_uid_metrics_label}; + +const HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(60); + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct CompactionPartitionKey { + pub index_uid: IndexUid, + pub source_id: SourceId, + pub partition_id: u64, + pub doc_mapping_uid: DocMappingUid, +} + +impl CompactionPartitionKey { + pub fn from_split(split: &SplitMetadata) -> Self { + CompactionPartitionKey { + index_uid: split.index_uid.clone(), + source_id: split.source_id.clone(), + partition_id: split.partition_id, + doc_mapping_uid: split.doc_mapping_uid, + } + } +} + +#[derive(Debug)] +struct InFlightCompaction { + split_ids: Vec, + node_id: NodeId, + last_heartbeat: Instant, +} + +/// Tracks all split-level state for the compaction planner: +/// which splits need compaction, which are in-flight, and which +/// operations are pending assignment to workers. +#[derive(Debug)] +pub struct CompactionState { + needs_compaction: HashMap>, + needs_compaction_split_ids: HashSet, + in_flight: HashMap, + in_flight_split_ids: HashSet, + pending_operations: PendingOperations, +} + +impl CompactionState { + pub fn new() -> Self { + CompactionState { + needs_compaction: HashMap::new(), + needs_compaction_split_ids: HashSet::new(), + in_flight: HashMap::new(), + in_flight_split_ids: HashSet::new(), + pending_operations: PendingOperations::new(), + } + } + + /// Returns true if the split is already tracked (either awaiting compaction or in-flight). + pub fn is_split_tracked(&self, split_id: &str) -> bool { + self.needs_compaction_split_ids.contains(split_id) + || self.in_flight_split_ids.contains(split_id) + } + + /// All split IDs the planner is currently tracking. Used by the metastore + /// scan to exclude already-known splits so each tick returns fresh work. + pub fn tracked_split_ids(&self, max_size: usize) -> Vec { + let mut out = Vec::with_capacity( + (self.needs_compaction_split_ids.len() + self.in_flight_split_ids.len()).min(max_size), + ); + out.extend( + self.needs_compaction_split_ids + .iter() + .chain(self.in_flight_split_ids.iter()) + .take(max_size) + .cloned(), + ); + out + } + + /// Adds a split to the needs_compaction set. + pub fn track_split(&mut self, split: SplitMetadata) { + let split_id = split.split_id().to_string(); + let key = CompactionPartitionKey::from_split(&split); + self.needs_compaction_split_ids.insert(split_id); + self.needs_compaction.entry(key).or_default().push(split); + } + + /// Returns the partition keys for iteration. The caller drives the loop. + pub fn partition_keys(&self) -> Vec { + self.needs_compaction.keys().cloned().collect() + } + + /// Runs a merge policy on a single partition and queues the resulting operations. + pub fn plan_partition( + &mut self, + partition_key: &CompactionPartitionKey, + merge_policy: &Arc, + ) { + let Some(splits) = self.needs_compaction.get_mut(partition_key) else { + return; + }; + // `MergePolicy::operations` emits at most one op per level per call, which under a backlog + // leaves the bulk of `splits` untouched per tick. Loop until no new operations are created. + loop { + let operations = merge_policy.operations(splits); + if operations.is_empty() { + break; + } + for operation in operations { + for split in &operation.splits { + self.needs_compaction_split_ids.remove(split.split_id()); + self.in_flight_split_ids + .insert(split.split_id().to_string()); + } + self.pending_operations.push(PendingMerge::new(operation)); + } + } + if splits.is_empty() { + self.needs_compaction.remove(partition_key); + } + } + + pub fn process_successes(&mut self, successes: &[CompactionSuccess]) { + for success in successes { + if let Some(inflight) = self.in_flight.remove(&success.task_id) { + info!(task_id=%success.task_id, "compaction task completed"); + for split_id in &inflight.split_ids { + self.in_flight_split_ids.remove(split_id.as_str()); + } + } + } + } + + pub fn process_failures(&mut self, failures: &[CompactionFailure]) { + for failure in failures { + if let Some(inflight) = self.in_flight.remove(&failure.task_id) { + warn!(task_id=%failure.task_id, error=%failure.error_message, "compaction task failed"); + for split_id in &inflight.split_ids { + // these splits will be picked up again on the next metastore scan. + self.in_flight_split_ids.remove(split_id.as_str()); + } + } + } + } + + pub fn update_heartbeats(&mut self, node_id: &NodeId, in_progress: &[CompactionInProgress]) { + for task in in_progress { + if let Some(inflight) = self.in_flight.get_mut(&task.task_id) { + inflight.last_heartbeat = Instant::now(); + } else { + // Task not tracked — start tracking it. Happens when + // workers report tasks the planner doesn't know about yet + // (e.g. after planner start). + for split_id in &task.split_ids { + self.in_flight_split_ids.insert(split_id.clone()); + self.needs_compaction_split_ids.remove(split_id.as_str()); + } + self.in_flight.insert( + task.task_id.clone(), + InFlightCompaction { + split_ids: task.split_ids.clone(), + node_id: node_id.clone(), + last_heartbeat: Instant::now(), + }, + ); + } + } + } + + pub fn check_heartbeat_timeouts(&mut self) { + let now = Instant::now(); + let timed_out_task_ids: Vec = self + .in_flight + .iter() + .filter(|(_, inflight)| now.duration_since(inflight.last_heartbeat) > HEARTBEAT_TIMEOUT) + .map(|(task_id, _)| task_id.clone()) + .collect(); + + for task_id in timed_out_task_ids { + if let Some(inflight) = self.in_flight.remove(&task_id) { + error!(%task_id, node_id=%inflight.node_id, "compaction task timed out"); + TIMED_OUT_OPERATIONS.inc(); + for split_id in &inflight.split_ids { + // these splits will be picked up again on the next metastore scan. + self.in_flight_split_ids.remove(split_id.as_str()); + } + } + } + } + + /// Pops up to `count` pending merges for assignment, highest-priority first. + pub fn pop_pending(&mut self, count: usize) -> Vec { + let count = count.min(self.pending_operations.len()); + let mut pending = Vec::with_capacity(count); + for _ in 0..count { + pending.push(self.pending_operations.pop().unwrap()); + } + pending + } + + /// Records that an operation has been assigned to a worker. + pub fn record_assignment(&mut self, task_id: TaskId, split_ids: Vec, node_id: NodeId) { + self.in_flight.insert( + task_id, + InFlightCompaction { + split_ids, + node_id, + last_heartbeat: Instant::now(), + }, + ); + } + + pub fn emit_metrics(&self) { + // total number of splits that need to be merged by compaction partition key + self.needs_compaction + .iter() + .map(|(compaction_partition_key, splits)| { + ( + source_uid_metrics_label( + &compaction_partition_key.index_uid, + &compaction_partition_key.source_id, + ), + splits.len() as i64, + ) + }) + .into_grouping_map() + .sum() + .iter() + .for_each(|(partition_key, &total_splits)| { + let labels = label_values!(SOURCE_UID => partition_key.clone()); + gauge!(parent: SPLITS_NEEDING_COMPACTION, labels: [labels]) + .set(total_splits as f64); + }); + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use quickwit_config::IndexingSettings; + use quickwit_config::merge_policy_config::{ + ConstWriteAmplificationMergePolicyConfig, MergePolicyConfig, + }; + use quickwit_indexing::merge_policy::merge_policy_from_settings; + use quickwit_proto::types::IndexUid; + + use super::*; + + fn test_merge_policy() -> Arc { + let settings = IndexingSettings { + merge_policy: MergePolicyConfig::ConstWriteAmplification( + ConstWriteAmplificationMergePolicyConfig { + merge_factor: 2, + max_merge_factor: 2, + ..Default::default() + }, + ), + ..Default::default() + }; + merge_policy_from_settings(&settings) + } + + fn test_split(split_id: &str, index_uid: &IndexUid) -> SplitMetadata { + SplitMetadata { + split_id: split_id.to_string(), + index_uid: index_uid.clone(), + source_id: "test-source".to_string(), + node_id: "test-node".to_string(), + num_docs: 1000, + create_timestamp: time::OffsetDateTime::now_utc().unix_timestamp(), + maturity: quickwit_metastore::SplitMaturity::Immature { + maturation_period: Duration::from_secs(3600), + }, + ..Default::default() + } + } + + #[test] + fn test_track_and_is_known() { + let index_uid = IndexUid::for_test("test-index", 0); + let mut state = CompactionState::new(); + + assert!(!state.is_split_tracked("s1")); + + state.track_split(test_split("s1", &index_uid)); + assert!(state.is_split_tracked("s1")); + assert!(!state.is_split_tracked("s2")); + } + + #[test] + fn test_track_split_partitions_correctly() { + let index_uid = IndexUid::for_test("test-index", 0); + let mut state = CompactionState::new(); + + state.track_split(test_split("s1", &index_uid)); + state.track_split(test_split("s2", &index_uid)); + + assert_eq!(state.partition_keys().len(), 1); + } + + #[test] + fn test_plan_partition_moves_splits_to_in_flight() { + let index_uid = IndexUid::for_test("test-index", 0); + let merge_policy = test_merge_policy(); + let mut state = CompactionState::new(); + + state.track_split(test_split("s0", &index_uid)); + state.track_split(test_split("s1", &index_uid)); + + let keys = state.partition_keys(); + assert_eq!(keys.len(), 1); + + state.plan_partition(&keys[0], &merge_policy); + + // Splits moved from needs_compaction to in_flight. + assert!(!state.pending_operations.is_empty()); + for pending in state.pending_operations.iter() { + for split in pending.operation.splits_as_slice() { + assert!(!state.needs_compaction_split_ids.contains(split.split_id())); + assert!(state.in_flight_split_ids.contains(split.split_id())); + } + } + } + + #[test] + fn test_process_successes_and_failures_clear_in_flight() { + let node_id = NodeId::from("worker-1"); + let mut state = CompactionState::new(); + + // Simulate what plan_partition + record_assignment does: + // split IDs go into in_flight_split_ids, then into InFlightCompaction. + state.in_flight_split_ids.insert("s1".to_string()); + state.in_flight_split_ids.insert("s2".to_string()); + state.record_assignment( + "task-1".to_string(), + vec!["s1".to_string(), "s2".to_string()], + node_id.clone(), + ); + + state.in_flight_split_ids.insert("s3".to_string()); + state.record_assignment("task-2".to_string(), vec!["s3".to_string()], node_id); + + assert!(state.is_split_tracked("s1")); + assert!(state.is_split_tracked("s3")); + + state.process_successes(&[CompactionSuccess { + task_id: "task-1".to_string(), + }]); + assert!(!state.is_split_tracked("s1")); + assert!(!state.is_split_tracked("s2")); + + state.process_failures(&[CompactionFailure { + task_id: "task-2".to_string(), + error_message: "boom".to_string(), + }]); + assert!(!state.is_split_tracked("s3")); + } + + #[test] + fn test_update_heartbeats_adopts_unknown_tasks() { + let node_id = NodeId::from("worker-1"); + let mut state = CompactionState::new(); + + // Simulate a split that was tracked as needing compaction. + let index_uid = IndexUid::for_test("test-index", 0); + state.track_split(test_split("s1", &index_uid)); + assert!(state.needs_compaction_split_ids.contains("s1")); + + // Worker reports an in-progress task the planner doesn't know about. + state.update_heartbeats( + &node_id, + &[CompactionInProgress { + task_id: "task-1".to_string(), + index_uid: Some(index_uid), + source_id: "test-source".to_string(), + split_ids: vec!["s1".to_string()], + }], + ); + + // Split moved from needs_compaction to in_flight. + assert!(state.in_flight_split_ids.contains("s1")); + assert!(!state.needs_compaction_split_ids.contains("s1")); + } + + #[test] + fn test_pop_pending_and_record_assignment() { + let index_uid = IndexUid::for_test("test-index", 0); + let merge_policy = test_merge_policy(); + let mut state = CompactionState::new(); + + state.track_split(test_split("s0", &index_uid)); + state.track_split(test_split("s1", &index_uid)); + + let keys = state.partition_keys(); + state.plan_partition(&keys[0], &merge_policy); + + let pending = state.pop_pending(1); + assert_eq!(pending.len(), 1); + + let operation = &pending[0].operation; + let split_ids: Vec = operation + .splits_as_slice() + .iter() + .map(|s| s.split_id().to_string()) + .collect(); + + // Splits are already in in_flight_split_ids from plan_partition. + state.record_assignment("task-1".to_string(), split_ids.clone(), NodeId::from("w1")); + + for split_id in &split_ids { + assert!(state.is_split_tracked(split_id)); + } + } + + #[test] + fn test_pop_pending_respects_count() { + let index_uid = IndexUid::for_test("test-index", 0); + let merge_policy = test_merge_policy(); + let mut state = CompactionState::new(); + + // With merge_factor=2, 4 splits produces 2 operations. + for i in 0..4 { + state.track_split(test_split(&format!("s{i}"), &index_uid)); + } + + let keys = state.partition_keys(); + state.plan_partition(&keys[0], &merge_policy); + + let total_pending = state.pending_operations.len(); + assert!(total_pending >= 2); + + let popped = state.pop_pending(1); + assert_eq!(popped.len(), 1); + assert_eq!(state.pending_operations.len(), total_pending - 1); + + // Asking for more than available returns what's there. + let rest = state.pop_pending(100); + assert_eq!(rest.len(), total_pending - 1); + assert!(state.pending_operations.is_empty()); + } +} diff --git a/quickwit/quickwit-compaction/src/planner/index_config_metastore.rs b/quickwit/quickwit-compaction/src/planner/index_config_metastore.rs new file mode 100644 index 00000000000..2c409daea7e --- /dev/null +++ b/quickwit/quickwit-compaction/src/planner/index_config_metastore.rs @@ -0,0 +1,318 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::sync::Arc; + +use quickwit_config::{IndexConfig, build_doc_mapper}; +use quickwit_doc_mapper::DocMapper; +use quickwit_indexing::merge_policy::{MergePolicy, merge_policy_from_settings}; +use quickwit_metastore::{IndexMetadataResponseExt, SplitMaturity, SplitMetadata}; +use quickwit_metrics::{counter, label_values}; +use quickwit_proto::metastore::{IndexMetadataRequest, MetastoreService, MetastoreServiceClient}; +use quickwit_proto::types::{DocMappingUid, IndexUid}; +use tracing::error; + +use crate::planner::metrics::{METASTORE_ERRORS, OPERATION}; + +/// Everything the planner needs to know about a single index. +#[derive(Debug)] +pub struct IndexEntry { + config: IndexConfig, + merge_policy: Arc, + doc_mappers: HashMap>, +} + +impl IndexEntry { + pub fn is_split_mature(&self, split: &SplitMetadata) -> bool { + matches!( + self.merge_policy + .split_maturity(split.num_docs, split.num_merge_ops), + SplitMaturity::Mature + ) + } + + pub fn merge_policy(&self) -> &Arc { + &self.merge_policy + } + + pub fn doc_mapping_json(&self) -> String { + serde_json::to_string(&self.config.doc_mapping) + .expect("doc mapping serialization should not fail") + } + + pub fn search_settings_json(&self) -> String { + serde_json::to_string(&self.config.search_settings) + .expect("search settings serialization should not fail") + } + + pub fn indexing_settings_json(&self) -> String { + serde_json::to_string(&self.config.indexing_settings) + .expect("indexing settings serialization should not fail") + } + + pub fn retention_policy_json(&self) -> String { + match &self.config.retention_policy_opt { + Some(policy) => serde_json::to_string(policy) + .expect("retention policy serialization should not fail"), + None => String::new(), + } + } + + pub fn index_storage_uri(&self) -> String { + self.config.index_uri.to_string() + } +} + +/// Caches per-index configuration, merge policies, and doc mappers. +/// Fetches from the metastore on demand. All accessors panic if called +/// for an index that hasn't been loaded. +#[derive(Debug)] +pub struct IndexConfigMetastore { + indexes: HashMap, + metastore_client: MetastoreServiceClient, +} + +impl IndexConfigMetastore { + pub fn new(metastore_client: MetastoreServiceClient) -> Self { + IndexConfigMetastore { + indexes: HashMap::new(), + metastore_client, + } + } + + /// Fetches an index config from the metastore and builds the derived + /// artifacts. + async fn fetch_index_config( + &mut self, + index_uid: &IndexUid, + doc_mapping_uid: &DocMappingUid, + ) -> anyhow::Result<()> { + let response = self + .metastore_client + .index_metadata(IndexMetadataRequest { + index_uid: Some(index_uid.clone()), + index_id: None, + }) + .await + .inspect_err(|error| { + error!(%error, "[compaction-planner] error getting index metadata from metastore"); + let labels = label_values!(OPERATION => "index_metadata"); + counter!(parent: METASTORE_ERRORS, labels: [labels]).inc(); + })?; + + let index_metadata = response.deserialize_index_metadata()?; + + let doc_mapper = build_doc_mapper( + &index_metadata.index_config.doc_mapping, + &index_metadata.index_config.search_settings, + )?; + let merge_policy = + merge_policy_from_settings(&index_metadata.index_config.indexing_settings); + + let mut doc_mappers = HashMap::new(); + doc_mappers.insert(*doc_mapping_uid, doc_mapper); + let entry = IndexEntry { + config: index_metadata.index_config, + merge_policy, + doc_mappers, + }; + self.indexes.insert(index_uid.clone(), entry); + Ok(()) + } + + /// Gets the index entry, fetching from the metastore if not cached. + pub async fn get_or_fetch( + &mut self, + index_uid: &IndexUid, + doc_mapping_uid: &DocMappingUid, + ) -> anyhow::Result<&IndexEntry> { + match self.indexes.get(index_uid) { + Some(entry) if entry.doc_mappers.contains_key(doc_mapping_uid) => {} + _ => self.fetch_index_config(index_uid, doc_mapping_uid).await?, + } + Ok(&self.indexes[index_uid]) + } + + pub async fn get_for_split(&mut self, split: &SplitMetadata) -> anyhow::Result<&IndexEntry> { + self.get_or_fetch(&split.index_uid, &split.doc_mapping_uid) + .await + } + + pub fn get(&self, index_uid: &IndexUid) -> Option<&IndexEntry> { + self.indexes.get(index_uid) + } +} + +#[cfg(test)] +mod tests { + use quickwit_metastore::{IndexMetadata, IndexMetadataResponseExt}; + use quickwit_proto::metastore::{IndexMetadataResponse, MetastoreError, MockMetastoreService}; + + use super::*; + + fn test_index_metadata() -> IndexMetadata { + IndexMetadata::for_test("test-index", "ram:///test-index") + } + + fn test_index_metadata_response(index_metadata: &IndexMetadata) -> IndexMetadataResponse { + IndexMetadataResponse::try_from_index_metadata(index_metadata).unwrap() + } + + #[tokio::test] + async fn test_get_or_fetch_loads_and_caches() { + let index_metadata = test_index_metadata(); + let response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + let doc_mapping_uid = DocMappingUid::default(); + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata() + .times(1) + .returning(move |_| Ok(response.clone())); + + let mut store = IndexConfigMetastore::new(MetastoreServiceClient::from_mock(mock)); + + // First call fetches from metastore. + let entry = store + .get_or_fetch(&index_uid, &doc_mapping_uid) + .await + .unwrap(); + assert!(!entry.doc_mapping_json().is_empty()); + assert!(!entry.search_settings_json().is_empty()); + assert!(!entry.indexing_settings_json().is_empty()); + assert!(!entry.index_storage_uri().is_empty()); + + // Second call hits cache (times(1) would panic otherwise). + store + .get_or_fetch(&index_uid, &doc_mapping_uid) + .await + .unwrap(); + } + + #[tokio::test] + async fn test_get_or_fetch_metastore_error() { + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata().returning(|_| { + Err(MetastoreError::Internal { + message: "test error".to_string(), + cause: String::new(), + }) + }); + + let mut store = IndexConfigMetastore::new(MetastoreServiceClient::from_mock(mock)); + let result = store + .get_or_fetch(&IndexUid::for_test("missing", 0), &DocMappingUid::default()) + .await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_get_returns_none_before_fetch() { + let mock = MockMetastoreService::new(); + let store = IndexConfigMetastore::new(MetastoreServiceClient::from_mock(mock)); + assert!(store.get(&IndexUid::for_test("test-index", 0)).is_none()); + } + + #[tokio::test] + async fn test_get_returns_some_after_fetch() { + let index_metadata = test_index_metadata(); + let response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata() + .returning(move |_| Ok(response.clone())); + + let mut store = IndexConfigMetastore::new(MetastoreServiceClient::from_mock(mock)); + store + .get_or_fetch(&index_uid, &DocMappingUid::default()) + .await + .unwrap(); + + assert!(store.get(&index_uid).is_some()); + } + + #[tokio::test] + async fn test_get_for_split() { + let index_metadata = test_index_metadata(); + let response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata() + .returning(move |_| Ok(response.clone())); + + let mut store = IndexConfigMetastore::new(MetastoreServiceClient::from_mock(mock)); + let split = SplitMetadata { + split_id: "split-1".to_string(), + index_uid: index_uid.clone(), + ..Default::default() + }; + + let entry = store.get_for_split(&split).await.unwrap(); + assert!(!entry.index_storage_uri().is_empty()); + } + + #[tokio::test] + async fn test_is_split_mature() { + let index_metadata = test_index_metadata(); + let response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata() + .returning(move |_| Ok(response.clone())); + + let mut store = IndexConfigMetastore::new(MetastoreServiceClient::from_mock(mock)); + let entry = store + .get_or_fetch(&index_uid, &DocMappingUid::default()) + .await + .unwrap(); + + let small_split = SplitMetadata { + num_docs: 100, + num_merge_ops: 0, + ..Default::default() + }; + assert!(!entry.is_split_mature(&small_split)); + + let large_split = SplitMetadata { + num_docs: 20_000_000, + num_merge_ops: 0, + ..Default::default() + }; + assert!(entry.is_split_mature(&large_split)); + } + + #[tokio::test] + async fn test_retention_policy_json_empty_when_none() { + let index_metadata = test_index_metadata(); + let response = test_index_metadata_response(&index_metadata); + let index_uid = index_metadata.index_uid.clone(); + + let mut mock = MockMetastoreService::new(); + mock.expect_index_metadata() + .returning(move |_| Ok(response.clone())); + + let mut store = IndexConfigMetastore::new(MetastoreServiceClient::from_mock(mock)); + let entry = store + .get_or_fetch(&index_uid, &DocMappingUid::default()) + .await + .unwrap(); + + // Default test index has no retention policy. + assert!(entry.retention_policy_json().is_empty()); + } +} diff --git a/quickwit/quickwit-compaction/src/planner/metrics.rs b/quickwit/quickwit-compaction/src/planner/metrics.rs new file mode 100644 index 00000000000..2333434b8df --- /dev/null +++ b/quickwit/quickwit-compaction/src/planner/metrics.rs @@ -0,0 +1,49 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use quickwit_metrics::{LabelNames, LazyCounter, LazyGauge, label_names, lazy_counter, lazy_gauge}; + +pub(crate) const SOURCE_UID: LabelNames<1> = label_names!("source_uid"); +pub(crate) const SOURCE_UID_MERGE_LEVEL: LabelNames<2> = label_names!("source_uid", "merge_level"); +pub(crate) const OPERATION: LabelNames<1> = label_names!("operation"); + +pub(crate) static NEW_SPLITS_SCANNED: LazyCounter = lazy_counter!( + name: "new_splits_scanned", + description: "cumulative number of immature splits scanned from the metastore", + subsystem: "compaction_planner", +); + +pub(crate) static SPLITS_NEEDING_COMPACTION: LazyGauge = lazy_gauge!( + name: "splits_needing_compaction", + description: "number of splits currently tracked as needing compaction", + subsystem: "compaction_planner", +); + +pub(crate) static PENDING_MERGE_OPERATIONS: LazyGauge = lazy_gauge!( + name: "pending_merge_operations", + description: "number of pending merge operations awaiting assignment", + subsystem: "compaction_planner", +); + +pub(crate) static TIMED_OUT_OPERATIONS: LazyCounter = lazy_counter!( + name: "timed_out_operations", + description: "cumulative number of merge operations that timed out waiting for a worker heartbeat", + subsystem: "compaction_planner", +); + +pub(crate) static METASTORE_ERRORS: LazyCounter = lazy_counter!( + name: "metastore_errors", + description: "cumulative number of metastore errors encountered by the compaction planner", + subsystem: "compaction_planner", +); diff --git a/quickwit/quickwit-compaction/src/planner/mod.rs b/quickwit/quickwit-compaction/src/planner/mod.rs new file mode 100644 index 00000000000..0cc7c3132b9 --- /dev/null +++ b/quickwit/quickwit-compaction/src/planner/mod.rs @@ -0,0 +1,138 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod compaction_planner; +mod compaction_state; +mod index_config_metastore; +pub(crate) mod metrics; + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +pub use compaction_planner::CompactionPlanner; +use quickwit_indexing::merge_policy::{MergeOperation, compute_merge_score}; +use quickwit_metrics::{gauge, label_values}; +use quickwit_proto::types::{IndexUid, SourceId}; + +use crate::planner::metrics::{PENDING_MERGE_OPERATIONS, SOURCE_UID_MERGE_LEVEL}; +use crate::source_uid_metrics_label; + +/// A `MergeOperation` waiting to be assigned, with `priority_score` used to order the heap. +#[derive(Debug)] +pub(super) struct PendingMerge { + pub(super) operation: MergeOperation, + pub(super) index_uid: IndexUid, + pub(super) source_id: SourceId, + pub(super) priority_score: u64, +} + +impl PendingMerge { + pub(super) fn new(operation: MergeOperation) -> Self { + let first_split = operation + .splits + .first() + .expect("merge operation must have splits"); + let index_uid = first_split.index_uid.clone(); + let source_id = first_split.source_id.clone(); + let total_num_bytes: u64 = operation + .splits + .iter() + .map(|split| split.footer_offsets.end) + .sum(); + let priority_score = compute_merge_score(operation.splits.len(), total_num_bytes); + Self { + operation, + index_uid, + source_id, + priority_score, + } + } +} + +impl PartialEq for PendingMerge { + fn eq(&self, other: &Self) -> bool { + self.operation.merge_split_id == other.operation.merge_split_id + } +} + +impl Eq for PendingMerge {} + +impl PartialOrd for PendingMerge { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for PendingMerge { + /// Higher `priority_score` sorts first (max-heap); ties broken by + /// `merge_split_id` so `Ord::cmp == Equal` agrees with `PartialEq`. + fn cmp(&self, other: &Self) -> Ordering { + self.priority_score + .cmp(&other.priority_score) + .then_with(|| { + self.operation + .merge_split_id + .cmp(&other.operation.merge_split_id) + }) + } +} + +/// Max-heap of pending merges awaiting assignment, ordered by +/// `PendingMerge`'s priority score. The `pending_merge_operations` gauge is +/// maintained inline; push/pop are the only mutation paths so the metric +/// stays consistent with `len()`. +#[derive(Debug)] +pub(super) struct PendingOperations { + inner: BinaryHeap, +} + +impl PendingOperations { + pub(super) fn new() -> Self { + Self { + inner: BinaryHeap::new(), + } + } + + pub(super) fn push(&mut self, pending: PendingMerge) { + Self::adjust_gauge(&pending, 1); + self.inner.push(pending); + } + + pub(super) fn pop(&mut self) -> Option { + let pending = self.inner.pop()?; + Self::adjust_gauge(&pending, -1); + Some(pending) + } + + pub(super) fn len(&self) -> usize { + self.inner.len() + } + + #[cfg(test)] + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + #[cfg(test)] + fn iter(&self) -> impl Iterator { + self.inner.iter() + } + + fn adjust_gauge(pending: &PendingMerge, delta: i64) { + let source_uid_label = source_uid_metrics_label(&pending.index_uid, &pending.source_id); + let merge_level = pending.operation.merge_level().to_string(); + let labels = label_values!(SOURCE_UID_MERGE_LEVEL => source_uid_label, merge_level); + gauge!(parent: PENDING_MERGE_OPERATIONS, labels: [labels]).inc_by(delta as f64); + } +} diff --git a/quickwit/quickwit-config/src/lib.rs b/quickwit/quickwit-config/src/lib.rs index b10afbc8b0b..330c3e9fe8c 100644 --- a/quickwit/quickwit-config/src/lib.rs +++ b/quickwit/quickwit-config/src/lib.rs @@ -74,9 +74,9 @@ pub use crate::metastore_config::{ MetastoreBackend, MetastoreConfig, MetastoreConfigs, PostgresMetastoreConfig, }; pub use crate::node_config::{ - CacheConfig, CachePolicy, DEFAULT_QW_CONFIG_PATH, GrpcConfig, IndexerConfig, IngestApiConfig, - JaegerConfig, KeepAliveConfig, LambdaConfig, LambdaDeployConfig, NodeConfig, RestConfig, - SearcherConfig, SplitCacheLimits, StorageTimeoutPolicy, TlsConfig, + CacheConfig, CachePolicy, CompactorConfig, DEFAULT_QW_CONFIG_PATH, GrpcConfig, IndexerConfig, + IngestApiConfig, JaegerConfig, KeepAliveConfig, LambdaConfig, LambdaDeployConfig, NodeConfig, + RestConfig, SearcherConfig, SplitCacheLimits, StorageTimeoutPolicy, TlsConfig, }; use crate::source_config::serialize::{SourceConfigV0_7, SourceConfigV0_8, VersionedSourceConfig}; pub use crate::storage_config::{ diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index fff19960e57..a66d9ae714a 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -163,6 +163,11 @@ pub struct IndexerConfig { pub enable_cooperative_indexing: bool, #[serde(default = "IndexerConfig::default_cpu_capacity")] pub cpu_capacity: CpuCapacity, + /// When true, the compactor service is not implicitly started on indexer + /// nodes. Dedicated compactor nodes must be deployed separately. + /// When false (default), every indexer node also runs the compactor. + #[serde(default = "IndexerConfig::default_enable_standalone_compactors")] + pub enable_standalone_compactors: bool, /// If true, run Parquet merges through the streaming column-major engine /// (`execute_merge_operation`). If false (default), use the in-memory /// `merge_sorted_parquet_files` engine. The legacy in-memory engine is @@ -211,6 +216,10 @@ impl IndexerConfig { CpuCapacity::one_cpu_thread() * (quickwit_common::num_cpus() as u32) } + fn default_enable_standalone_compactors() -> bool { + false + } + fn default_parquet_merge_use_streaming_engine() -> bool { false } @@ -227,6 +236,7 @@ impl IndexerConfig { cpu_capacity: PIPELINE_FULL_CAPACITY * 4u32, max_merge_write_throughput: None, merge_concurrency: NonZeroUsize::new(3).unwrap(), + enable_standalone_compactors: false, parquet_merge_use_streaming_engine: Self::default_parquet_merge_use_streaming_engine(), }; Ok(indexer_config) @@ -244,11 +254,67 @@ impl Default for IndexerConfig { cpu_capacity: Self::default_cpu_capacity(), merge_concurrency: Self::default_merge_concurrency(), max_merge_write_throughput: None, + enable_standalone_compactors: Self::default_enable_standalone_compactors(), parquet_merge_use_streaming_engine: Self::default_parquet_merge_use_streaming_engine(), } } } +#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct CompactorConfig { + /// Maximum number of concurrent merges, which hold the CPU for + /// a long time. Defaults to `num_cpus - 1`. + #[serde(default = "CompactorConfig::default_max_concurrent_merge_executions")] + pub max_concurrent_merge_executions: NonZeroUsize, + /// Number of pipelines to run per merge executions. Scalar. Since merges perform a lot + /// of IO, multiple concurrent merges can be interleaved. + #[serde(default = "CompactorConfig::default_pipeline_slots_per_merge_execution")] + pub pipeline_slots_per_merge_execution: NonZeroUsize, + /// Maximum number of concurrent split uploads across all pipelines. + #[serde(default = "CompactorConfig::default_max_concurrent_split_uploads")] + pub max_concurrent_split_uploads: usize, + /// Limits the IO throughput of the split downloader and the merge executor. + #[serde(default)] + pub max_merge_write_throughput: Option, +} + +impl CompactorConfig { + fn default_max_concurrent_merge_executions() -> NonZeroUsize { + let cpus = quickwit_common::num_cpus().saturating_sub(1); + NonZeroUsize::new(cpus).unwrap_or(NonZeroUsize::MIN) + } + + fn default_pipeline_slots_per_merge_execution() -> NonZeroUsize { + NonZeroUsize::new(2).unwrap() + } + + fn default_max_concurrent_split_uploads() -> usize { + 12 + } + + #[cfg(any(test, feature = "testsuite"))] + pub fn for_test() -> Self { + CompactorConfig { + max_concurrent_merge_executions: NonZeroUsize::new(2).unwrap(), + pipeline_slots_per_merge_execution: Self::default_pipeline_slots_per_merge_execution(), + max_concurrent_split_uploads: 4, + max_merge_write_throughput: None, + } + } +} + +impl Default for CompactorConfig { + fn default() -> Self { + Self { + max_concurrent_merge_executions: Self::default_max_concurrent_merge_executions(), + pipeline_slots_per_merge_execution: Self::default_pipeline_slots_per_merge_execution(), + max_concurrent_split_uploads: Self::default_max_concurrent_split_uploads(), + max_merge_write_throughput: None, + } + } +} + #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct SplitCacheLimits { @@ -794,6 +860,7 @@ pub struct NodeConfig { pub searcher_config: SearcherConfig, pub ingest_api_config: IngestApiConfig, pub jaeger_config: JaegerConfig, + pub compactor_config: CompactorConfig, } impl NodeConfig { @@ -892,23 +959,6 @@ mod tests { "1500m" ); } - { - let indexer_config: IndexerConfig = - serde_yaml::from_str(r#"merge_concurrency: 5"#).unwrap(); - assert_eq!( - indexer_config.merge_concurrency, - NonZeroUsize::new(5).unwrap() - ); - let indexer_config_json = serde_json::to_value(&indexer_config).unwrap(); - assert_eq!( - indexer_config_json - .get("merge_concurrency") - .unwrap() - .as_u64() - .unwrap(), - 5 - ); - } { let indexer_config: IndexerConfig = serde_yaml::from_str(r#"cpu_capacity: 1500m"#).unwrap(); diff --git a/quickwit/quickwit-config/src/node_config/serialize.rs b/quickwit/quickwit-config/src/node_config/serialize.rs index c7740e7146e..5830c292d47 100644 --- a/quickwit/quickwit-config/src/node_config/serialize.rs +++ b/quickwit/quickwit-config/src/node_config/serialize.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::{IpAddr, SocketAddr}; use std::str::FromStr; use std::time::Duration; @@ -35,8 +35,8 @@ use crate::service::QuickwitService; use crate::storage_config::StorageConfigs; use crate::templating::render_config; use crate::{ - ConfigFormat, IndexerConfig, IngestApiConfig, JaegerConfig, MetastoreConfigs, NodeConfig, - SearcherConfig, TlsConfig, validate_identifier, validate_node_id, + CompactorConfig, ConfigFormat, IndexerConfig, IngestApiConfig, JaegerConfig, MetastoreConfigs, + NodeConfig, SearcherConfig, TlsConfig, validate_identifier, validate_node_id, }; pub const DEFAULT_CLUSTER_ID: &str = "quickwit-default-cluster"; @@ -87,9 +87,12 @@ impl FromStr for List { } fn default_enabled_services() -> ConfigValue { + // The compactor is excluded by default — it only runs in standalone mode, + // which an operator must explicitly opt into via `enable_standalone_compactors`. ConfigValue::with_default(List( QuickwitService::supported_services() .into_iter() + .filter(|service| *service != QuickwitService::Compactor) .map(|service| service.to_string()) .collect(), )) @@ -192,6 +195,8 @@ struct NodeConfigBuilder { data_dir_uri: ConfigValue, metastore_uri: ConfigValue, default_index_root_uri: ConfigValue, + #[serde(default)] + enable_standalone_compactors: ConfigValue, #[serde(rename = "rest")] #[serde(default)] rest_config_builder: RestConfigBuilder, @@ -216,6 +221,9 @@ struct NodeConfigBuilder { #[serde(rename = "jaeger")] #[serde(default)] jaeger_config: JaegerConfig, + #[serde(rename = "compactor")] + #[serde(default)] + compactor_config: CompactorConfig, } impl NodeConfigBuilder { @@ -226,7 +234,10 @@ impl NodeConfigBuilder { let node_id = self.node_id.resolve(env_vars).map(NodeId::new)?; let availability_zone = self.availability_zone.resolve_optional(env_vars)?; - let enabled_services = self + self.indexer_config.enable_standalone_compactors = + self.enable_standalone_compactors.resolve(env_vars)?; + + let enabled_services: HashSet = self .enabled_services .resolve(env_vars)? .0 @@ -331,6 +342,7 @@ impl NodeConfigBuilder { searcher_config: self.searcher_config, ingest_api_config: self.ingest_api_config, jaeger_config: self.jaeger_config, + compactor_config: self.compactor_config, }; validate(&node_config)?; @@ -348,6 +360,15 @@ fn validate(node_config: &NodeConfig) -> anyhow::Result<()> { if node_config.peer_seeds.is_empty() { warn!("peer seeds are empty"); } + if node_config.is_service_enabled(QuickwitService::Compactor) + && !node_config.indexer_config.enable_standalone_compactors + { + bail!( + "the `compactor` service can only be enabled when `enable_standalone_compactors` is \ + true (or `QW_ENABLE_STANDALONE_COMPACTORS=true`). With the default indexer-local \ + merge pipeline, the compactor service must not be enabled." + ); + } validate_disk_usage(node_config); Ok(()) } @@ -421,6 +442,7 @@ impl Default for NodeConfigBuilder { data_dir_uri: default_data_dir_uri(), metastore_uri: ConfigValue::none(), default_index_root_uri: ConfigValue::none(), + enable_standalone_compactors: Default::default(), rest_config_builder: RestConfigBuilder::default(), grpc_config: GrpcConfig::default(), storage_configs: StorageConfigs::default(), @@ -429,6 +451,7 @@ impl Default for NodeConfigBuilder { searcher_config: SearcherConfig::default(), ingest_api_config: IngestApiConfig::default(), jaeger_config: JaegerConfig::default(), + compactor_config: CompactorConfig::default(), } } } @@ -528,6 +551,7 @@ pub fn node_config_for_tests_from_ports( searcher_config: SearcherConfig::default(), ingest_api_config: IngestApiConfig::default(), jaeger_config: JaegerConfig::default(), + compactor_config: CompactorConfig::default(), } } @@ -657,6 +681,7 @@ mod tests { cpu_capacity: IndexerConfig::default_cpu_capacity(), enable_cooperative_indexing: false, max_merge_write_throughput: Some(ByteSize::mb(100)), + enable_standalone_compactors: false, parquet_merge_use_streaming_engine: true, } ); @@ -768,6 +793,9 @@ mod tests { assert_eq!( config.enabled_services, QuickwitService::supported_services() + .into_iter() + .filter(|service| *service != QuickwitService::Compactor) + .collect::>(), ); assert_eq!( config.rest_config.listen_addr, @@ -822,6 +850,10 @@ mod tests { "test-peer-seed-0,test-peer-seed-1".to_string(), ); env_vars.insert("QW_DATA_DIR".to_string(), "test-data-dir".to_string()); + env_vars.insert( + "QW_ENABLE_STANDALONE_COMPACTORS".to_string(), + "true".to_string(), + ); env_vars.insert( "QW_METASTORE_URI".to_string(), "postgresql://test-user:test-password@test-host:4321/test-db".to_string(), @@ -1355,4 +1387,48 @@ mod tests { .to_string(); assert!(error_message.contains("replication factor")); } + + #[tokio::test] + async fn test_standalone_compactors_prevents_implicit_compactor() { + let config_yaml = r#" + version: 0.8 + enabled_services: [indexer] + "#; + let env_vars = HashMap::from([( + "QW_ENABLE_STANDALONE_COMPACTORS".to_string(), + "true".to_string(), + )]); + let config = + load_node_config_with_env(ConfigFormat::Yaml, config_yaml.as_bytes(), &env_vars) + .await + .unwrap(); + assert!(config.enabled_services.contains(&QuickwitService::Indexer)); + assert!( + !config + .enabled_services + .contains(&QuickwitService::Compactor) + ); + } + + #[tokio::test] + async fn test_standalone_compactors_env_var_override() { + let env_vars = HashMap::from([( + "QW_ENABLE_STANDALONE_COMPACTORS".to_string(), + "true".to_string(), + )]); + let config_yaml = r#" + version: 0.8 + enabled_services: [indexer] + "#; + let config = + load_node_config_with_env(ConfigFormat::Yaml, config_yaml.as_bytes(), &env_vars) + .await + .unwrap(); + assert!(config.enabled_services.contains(&QuickwitService::Indexer)); + assert!( + !config + .enabled_services + .contains(&QuickwitService::Compactor) + ); + } } diff --git a/quickwit/quickwit-config/src/qw_env_vars.rs b/quickwit/quickwit-config/src/qw_env_vars.rs index 75f891d332c..4f69448d26c 100644 --- a/quickwit/quickwit-config/src/qw_env_vars.rs +++ b/quickwit/quickwit-config/src/qw_env_vars.rs @@ -55,7 +55,8 @@ qw_env_vars!( QW_PEER_SEEDS, QW_DATA_DIR, QW_METASTORE_URI, - QW_DEFAULT_INDEX_ROOT_URI + QW_DEFAULT_INDEX_ROOT_URI, + QW_ENABLE_STANDALONE_COMPACTORS ); #[cfg(test)] diff --git a/quickwit/quickwit-config/src/service.rs b/quickwit/quickwit-config/src/service.rs index d323331b510..0382231330d 100644 --- a/quickwit/quickwit-config/src/service.rs +++ b/quickwit/quickwit-config/src/service.rs @@ -29,6 +29,7 @@ pub enum QuickwitService { Searcher, Janitor, Metastore, + Compactor, } #[allow(clippy::from_over_into)] @@ -46,6 +47,7 @@ impl QuickwitService { QuickwitService::Searcher => "searcher", QuickwitService::Janitor => "janitor", QuickwitService::Metastore => "metastore", + QuickwitService::Compactor => "compactor", } } @@ -70,6 +72,7 @@ impl FromStr for QuickwitService { "searcher" => Ok(QuickwitService::Searcher), "janitor" => Ok(QuickwitService::Janitor), "metastore" => Ok(QuickwitService::Metastore), + "compactor" => Ok(QuickwitService::Compactor), _ => { bail!( "failed to parse service `{service_str}`. supported services are: `{}`", diff --git a/quickwit/quickwit-indexing/failpoints/mod.rs b/quickwit/quickwit-indexing/failpoints/mod.rs index d8c5ab0e418..25b6522851b 100644 --- a/quickwit/quickwit-indexing/failpoints/mod.rs +++ b/quickwit/quickwit-indexing/failpoints/mod.rs @@ -285,9 +285,10 @@ async fn test_merge_executor_controlled_directory_kill_switch() -> anyhow::Resul tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()); } let merge_operation = MergeOperation::new_merge_operation(split_metadatas); - let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation.clone()); let merge_scratch = MergeScratch { - merge_task, + merge_operation, + merge_task: Some(merge_task), merge_scratch_directory, downloaded_splits_directory, tantivy_dirs, @@ -307,6 +308,7 @@ async fn test_merge_executor_controlled_directory_kill_switch() -> anyhow::Resul doc_mapper, io_controls, merge_packager_mailbox, + None, ); let (merge_executor_mailbox, _merge_executor_handle) = diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 686bd942495..b35bb1901bc 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -304,7 +304,7 @@ impl IndexingPipeline { super::PUBLISHER_NAME, QueueCapacity::Bounded(1), self.params.metastore.clone(), - Some(self.params.merge_planner_mailbox.clone()), + self.params.merge_planner_mailbox_opt.clone(), Some(source_mailbox.clone()), ); let (publisher_mailbox, publisher_handle) = ctx @@ -530,7 +530,7 @@ pub struct IndexingPipelineParams { // Merge-related parameters pub merge_policy: Arc, pub retention_policy: Option, - pub merge_planner_mailbox: Mailbox, + pub merge_planner_mailbox_opt: Option>, pub max_concurrent_split_uploads_merge: usize, // Source-related parameters @@ -668,7 +668,7 @@ mod tests { max_concurrent_split_uploads_index: 4, max_concurrent_split_uploads_merge: 5, cooperative_indexing_permits: None, - merge_planner_mailbox, + merge_planner_mailbox_opt: Some(merge_planner_mailbox), event_broker: EventBroker::default(), params_fingerprint: 42u64, }; @@ -773,8 +773,8 @@ mod tests { let universe = Universe::new(); let storage = Arc::new(RamStorage::default()); - let split_store = IndexingSplitStore::create_without_local_store_for_test(storage.clone()); let (merge_planner_mailbox, _) = universe.create_test_mailbox(); + let split_store = IndexingSplitStore::create_without_local_store_for_test(storage.clone()); let pipeline_params = IndexingPipelineParams { pipeline_id, doc_mapper: Arc::new(default_doc_mapper_for_test()), @@ -792,7 +792,7 @@ mod tests { max_concurrent_split_uploads_index: 4, max_concurrent_split_uploads_merge: 5, cooperative_indexing_permits: None, - merge_planner_mailbox, + merge_planner_mailbox_opt: Some(merge_planner_mailbox), event_broker: Default::default(), params_fingerprint: 42u64, }; @@ -816,7 +816,6 @@ mod tests { async fn test_indexing_pipeline_simple_gz() -> anyhow::Result<()> { indexing_pipeline_simple("data/test_corpus.json.gz").await } - #[tokio::test] async fn test_merge_pipeline_does_not_stop_on_indexing_pipeline_failure() { let node_id = NodeId::from("test-node"); @@ -893,7 +892,7 @@ mod tests { max_concurrent_split_uploads_index: 4, max_concurrent_split_uploads_merge: 5, cooperative_indexing_permits: None, - merge_planner_mailbox: merge_planner_mailbox.clone(), + merge_planner_mailbox_opt: Some(merge_planner_mailbox.clone()), event_broker: Default::default(), params_fingerprint: 42u64, }; @@ -1021,7 +1020,7 @@ mod tests { max_concurrent_split_uploads_index: 4, max_concurrent_split_uploads_merge: 5, cooperative_indexing_permits: None, - merge_planner_mailbox, + merge_planner_mailbox_opt: Some(merge_planner_mailbox), params_fingerprint: 42u64, event_broker: Default::default(), }; diff --git a/quickwit/quickwit-indexing/src/actors/indexing_service.rs b/quickwit/quickwit-indexing/src/actors/indexing_service.rs index 4453ffbfe00..8fe95f79cf7 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_service.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_service.rs @@ -26,8 +26,6 @@ use quickwit_actors::{ Observation, }; use quickwit_cluster::Cluster; -use quickwit_common::fs::get_cache_directory_path; -use quickwit_common::io::Limiter; use quickwit_common::pubsub::EventBroker; use quickwit_common::{io, temp_dir}; use quickwit_config::{ @@ -64,7 +62,7 @@ use super::pipeline_shared::{ActorPipeline, PipelineHandle}; use super::{FinishPendingMergesAndShutdownPipeline, MergePlanner, MergeSchedulerService}; use crate::models::{DetachIndexingPipeline, DetachMergePipeline, ObservePipeline, SpawnPipeline}; use crate::source::{AssignShards, Assignment}; -use crate::split_store::{IndexingSplitCache, SplitStoreQuota}; +use crate::split_store::IndexingSplitCache; use crate::{IndexingPipeline, IndexingPipelineParams, IndexingSplitStore, IndexingStatistics}; /// Name of the indexing directory, usually located at `/indexing`. @@ -106,13 +104,13 @@ pub struct IndexingService { cluster: Cluster, pub(crate) metastore: MetastoreServiceClient, ingest_api_service_opt: Option>, - merge_scheduler_service: Mailbox, pub(crate) ingester_pool: IngesterPool, pub(crate) storage_resolver: StorageResolver, indexing_pipelines: HashMap, counters: IndexingServiceCounters, - local_split_store: Arc, pub(crate) max_concurrent_split_uploads: usize, + merge_scheduler_service_opt: Option>, + split_cache: Arc, /// Cached from `IndexerConfig`. Selects whether new Parquet merge /// pipelines route regular merges through the streaming engine or /// the in-memory fallback. Promotion merges always use the @@ -123,7 +121,7 @@ pub struct IndexingService { #[cfg(feature = "metrics")] parquet_merge_pipeline_handles: HashMap, cooperative_indexing_permits: Option>, - merge_io_throughput_limiter_opt: Option, + merge_io_throughput_limiter_opt: Option, pub(crate) event_broker: EventBroker, } @@ -148,20 +146,14 @@ impl IndexingService { cluster: Cluster, metastore: MetastoreServiceClient, ingest_api_service_opt: Option>, - merge_scheduler_service: Mailbox, + merge_scheduler_service_opt: Option>, ingester_pool: IngesterPool, storage_resolver: StorageResolver, event_broker: EventBroker, + split_cache: Arc, ) -> anyhow::Result { - let split_store_space_quota = SplitStoreQuota::try_new( - indexer_config.split_store_max_num_splits, - indexer_config.split_store_max_num_bytes, - )?; let merge_io_throughput_limiter_opt = indexer_config.max_merge_write_throughput.map(io::limiter); - let split_cache_dir_path = get_cache_directory_path(&data_dir_path); - let local_split_store = - IndexingSplitCache::open(split_cache_dir_path, split_store_space_quota).await?; let indexing_root_directory = temp_dir::create_or_purge_directory(&data_dir_path.join(INDEXING_DIR_NAME)).await?; let queue_dir_path = data_dir_path.join(QUEUES_DIR_NAME); @@ -177,10 +169,10 @@ impl IndexingService { cluster, metastore, ingest_api_service_opt, - merge_scheduler_service, + merge_scheduler_service_opt, ingester_pool, storage_resolver, - local_split_store: Arc::new(local_split_store), + split_cache, indexing_pipelines: Default::default(), counters: Default::default(), max_concurrent_split_uploads: indexer_config.max_concurrent_split_uploads, @@ -365,32 +357,43 @@ impl IndexingService { let merge_policy = crate::merge_policy::merge_policy_from_settings(&index_config.indexing_settings); let retention_policy = index_config.retention_policy_opt.clone(); - let split_store = IndexingSplitStore::new(storage.clone(), self.local_split_store.clone()); + let split_store = IndexingSplitStore::new(storage.clone(), self.split_cache.clone()); let doc_mapper = build_doc_mapper(&index_config.doc_mapping, &index_config.search_settings) .map_err(|error| IndexingError::Internal(error.to_string()))?; - let merge_pipeline_id = indexing_pipeline_id.merge_pipeline_id(); - let merge_pipeline_params = MergePipelineParams { - pipeline_id: merge_pipeline_id.clone(), - doc_mapper: doc_mapper.clone(), - indexing_directory: indexing_directory.clone(), - metastore: self.metastore.clone(), - split_store: split_store.clone(), - merge_scheduler_service: self.merge_scheduler_service.clone(), - merge_policy: merge_policy.clone(), - retention_policy: retention_policy.clone(), - merge_io_throughput_limiter_opt: self.merge_io_throughput_limiter_opt.clone(), - max_concurrent_split_uploads: self.max_concurrent_split_uploads, - event_broker: self.event_broker.clone(), + let merge_planner_mailbox_opt = + if let Some(merge_scheduler_service) = self.merge_scheduler_service_opt.clone() { + let merge_pipeline_id = indexing_pipeline_id.merge_pipeline_id(); + let merge_pipeline_params = MergePipelineParams { + pipeline_id: merge_pipeline_id, + doc_mapper: doc_mapper.clone(), + indexing_directory: indexing_directory.clone(), + metastore: self.metastore.clone(), + split_store: split_store.clone(), + merge_scheduler_service, + merge_policy: merge_policy.clone(), + retention_policy: retention_policy.clone(), + merge_io_throughput_limiter_opt: self.merge_io_throughput_limiter_opt.clone(), + max_concurrent_split_uploads: self.max_concurrent_split_uploads, + event_broker: self.event_broker.clone(), + }; + Some(self.get_or_create_merge_pipeline( + merge_pipeline_params, + immature_splits_opt, + ctx, + )?) + } else { + None + }; + + let max_concurrent_split_uploads_index = if self.merge_scheduler_service_opt.is_some() { + (self.max_concurrent_split_uploads / 2).max(1) + } else { + self.max_concurrent_split_uploads }; - let merge_planner_mailbox = - self.get_or_create_merge_pipeline(merge_pipeline_params, immature_splits_opt, ctx)?; - // The concurrent uploads budget is split in 2: 1/2 for the indexing pipeline, 1/2 for the - // merge pipeline. - let max_concurrent_split_uploads_index = (self.max_concurrent_split_uploads / 2).max(1); let max_concurrent_split_uploads_merge = - (self.max_concurrent_split_uploads - max_concurrent_split_uploads_index).max(1); + self.max_concurrent_split_uploads - max_concurrent_split_uploads_index; let pipeline_params = IndexingPipelineParams { pipeline_id: indexing_pipeline_id.clone(), @@ -405,7 +408,7 @@ impl IndexingService { merge_policy, retention_policy, max_concurrent_split_uploads_merge, - merge_planner_mailbox, + merge_planner_mailbox_opt, source_config, ingester_pool: self.ingester_pool.clone(), queues_dir_path: self.queue_dir_path.clone(), @@ -472,6 +475,9 @@ impl IndexingService { indexing_pipeline_ids: &[IndexingPipelineId], ctx: &ActorContext, ) -> MetastoreResult>> { + if self.merge_scheduler_service_opt.is_none() { + return Ok(Default::default()); + } let mut index_uids = Vec::new(); for indexing_pipeline_id in indexing_pipeline_ids { @@ -685,6 +691,10 @@ impl IndexingService { /// Returns the Parquet merge planner mailbox for the given index, creating /// a new ParquetMergePipeline if one isn't already running. /// + /// Returns `Ok(None)` when there is no `merge_scheduler_service_opt` on the + /// service — mirrors the log pipeline path, which does not spawn a merge + /// pipeline in that case. + /// /// Keyed by IndexUid (not MergePipelineId) because Parquet merge pipelines /// are shared across all sources for the same index — unlike Tantivy merge /// pipelines which are per-source. @@ -697,9 +707,12 @@ impl IndexingService { indexing_directory: quickwit_common::temp_dir::TempDirectory, immature_splits_opt: Option>, ctx: &ActorContext, - ) -> Result, IndexingError> { + ) -> Result>, IndexingError> { + let Some(merge_scheduler_service) = self.merge_scheduler_service_opt.clone() else { + return Ok(None); + }; if let Some(handle) = self.parquet_merge_pipeline_handles.get(&index_uid) { - return Ok(handle.mailbox.clone()); + return Ok(Some(handle.mailbox.clone())); } // Convert the config-crate merge policy into the engine-crate type. @@ -727,7 +740,7 @@ impl IndexingService { metastore: self.metastore.clone(), storage, merge_policy, - merge_scheduler_service: self.merge_scheduler_service.clone(), + merge_scheduler_service, max_concurrent_split_uploads: self.max_concurrent_split_uploads, event_broker: self.event_broker.clone(), writer_config, @@ -748,7 +761,7 @@ impl IndexingService { }; self.parquet_merge_pipeline_handles .insert(index_uid, handle); - Ok(merge_planner_mailbox) + Ok(Some(merge_planner_mailbox)) } /// For all Ingest V2 pipelines, assigns the set of shards they should be working on. @@ -1209,10 +1222,11 @@ mod tests { cluster, metastore, Some(ingest_api_service), - merge_scheduler_mailbox, + Some(merge_scheduler_mailbox), IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + Arc::new(IndexingSplitCache::no_caching()), ) .await .unwrap(); @@ -1384,9 +1398,9 @@ mod tests { // change whenever IndexingSettings fields are added/removed. Recompute // by temporarily adding a test that prints // `indexing_pipeline_params_fingerprint(&index_config, &source_config)`. - const PARAMS_FINGERPRINT_INGEST_API: u64 = 7973087274884969148; - const PARAMS_FINGERPRINT_SOURCE_1: u64 = 9420938500552890840; - const PARAMS_FINGERPRINT_SOURCE_2: u64 = 16199199787360162635; + const PARAMS_FINGERPRINT_INGEST_API: u64 = 1637744865450232394; + const PARAMS_FINGERPRINT_SOURCE_1: u64 = 1705211905504908791; + const PARAMS_FINGERPRINT_SOURCE_2: u64 = 8706667372658059428; quickwit_common::setup_logging_for_tests(); let transport = ChannelTransport::default(); @@ -1428,7 +1442,6 @@ mod tests { .unwrap() .deserialize_index_metadata() .unwrap(); - let source_config_1 = SourceConfig { source_id: "test-indexing-service--source-1".to_string(), num_pipelines: NonZeroUsize::MIN, @@ -1726,10 +1739,11 @@ mod tests { cluster.clone(), metastore.clone(), Some(ingest_api_service), - merge_scheduler_service, + Some(merge_scheduler_service), IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + Arc::new(IndexingSplitCache::no_caching()), ) .await .unwrap(); @@ -1770,6 +1784,167 @@ mod tests { universe.quit().await; } + #[tokio::test] + async fn test_indexing_service_no_merge_pipeline_when_no_merge_scheduler() { + quickwit_common::setup_logging_for_tests(); + let transport = ChannelTransport::default(); + let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, true) + .await + .unwrap(); + let metastore = metastore_for_test(); + + let index_id = append_random_suffix("test-indexing-service-no-merge"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let source_config = SourceConfig { + source_id: "test-source".to_string(), + num_pipelines: NonZeroUsize::MIN, + enabled: true, + source_params: SourceParams::void(), + transform_config: None, + input_format: SourceInputFormat::Json, + }; + let create_index_request = + CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + let add_source_request = + AddSourceRequest::try_from_source_config(index_uid.clone(), &source_config).unwrap(); + metastore.add_source(add_source_request).await.unwrap(); + + let temp_dir = tempfile::tempdir().unwrap(); + let data_dir_path = temp_dir.path().to_path_buf(); + let indexer_config = IndexerConfig::for_test().unwrap(); + let num_blocking_threads = 1; + let storage_resolver = StorageResolver::unconfigured(); + let universe = Universe::with_accelerated_time(); + let queues_dir_path = data_dir_path.join(QUEUES_DIR_NAME); + let ingest_api_service = + init_ingest_api(&universe, &queues_dir_path, &IngestApiConfig::default()) + .await + .unwrap(); + let indexing_server = IndexingService::new( + NodeId::from("test-node"), + data_dir_path, + indexer_config, + num_blocking_threads, + cluster.clone(), + metastore.clone(), + Some(ingest_api_service), + None, // No merge scheduler — external merge service handles compaction. + IngesterPool::default(), + storage_resolver.clone(), + EventBroker::default(), + Arc::new(IndexingSplitCache::no_caching()), + ) + .await + .unwrap(); + let (indexing_server_mailbox, indexing_server_handle) = + universe.spawn_builder().spawn(indexing_server); + + indexing_server_mailbox + .ask_for_res(SpawnPipeline { + index_id: index_id.clone(), + source_config, + pipeline_uid: PipelineUid::default(), + }) + .await + .unwrap(); + + let observation = indexing_server_handle.observe().await; + assert_eq!(observation.num_running_pipelines, 1); + assert_eq!(observation.num_running_merge_pipelines, 0); + assert!(universe.get_one::().is_none()); + + universe.quit().await; + } + + #[tokio::test] + async fn test_indexing_service_spawns_merge_pipeline_with_merge_scheduler() { + quickwit_common::setup_logging_for_tests(); + let transport = ChannelTransport::default(); + let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, true) + .await + .unwrap(); + let metastore = metastore_for_test(); + + let index_id = append_random_suffix("test-indexing-service-with-merge"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let source_config = SourceConfig { + source_id: "test-source".to_string(), + num_pipelines: NonZeroUsize::MIN, + enabled: true, + source_params: SourceParams::void(), + transform_config: None, + input_format: SourceInputFormat::Json, + }; + let create_index_request = + CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + let add_source_request = + AddSourceRequest::try_from_source_config(index_uid.clone(), &source_config).unwrap(); + metastore.add_source(add_source_request).await.unwrap(); + + let temp_dir = tempfile::tempdir().unwrap(); + let data_dir_path = temp_dir.path().to_path_buf(); + let indexer_config = IndexerConfig::for_test().unwrap(); + let num_blocking_threads = 1; + let storage_resolver = StorageResolver::unconfigured(); + let universe = Universe::with_accelerated_time(); + let queues_dir_path = data_dir_path.join(QUEUES_DIR_NAME); + let ingest_api_service = + init_ingest_api(&universe, &queues_dir_path, &IngestApiConfig::default()) + .await + .unwrap(); + let merge_scheduler_mailbox: Mailbox = universe.get_or_spawn_one(); + let indexing_server = IndexingService::new( + NodeId::from("test-node"), + data_dir_path, + indexer_config, + num_blocking_threads, + cluster.clone(), + metastore.clone(), + Some(ingest_api_service), + Some(merge_scheduler_mailbox), + IngesterPool::default(), + storage_resolver.clone(), + EventBroker::default(), + Arc::new(IndexingSplitCache::no_caching()), + ) + .await + .unwrap(); + let (indexing_server_mailbox, indexing_server_handle) = + universe.spawn_builder().spawn(indexing_server); + + indexing_server_mailbox + .ask_for_res(SpawnPipeline { + index_id: index_id.clone(), + source_config, + pipeline_uid: PipelineUid::default(), + }) + .await + .unwrap(); + + let observation = indexing_server_handle.observe().await; + assert_eq!(observation.num_running_pipelines, 1); + assert_eq!(observation.num_running_merge_pipelines, 1); + assert!(universe.get_one::().is_some()); + + universe.quit().await; + } + #[derive(Debug)] struct FreezePipeline; #[async_trait] @@ -1927,10 +2102,11 @@ mod tests { cluster.clone(), metastore.clone(), Some(ingest_api_service.clone()), - merge_scheduler_service, + Some(merge_scheduler_service), IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + Arc::new(IndexingSplitCache::no_caching()), ) .await .unwrap(); @@ -1995,6 +2171,7 @@ mod tests { let response = IndexesMetadataResponse::for_test(indexes_metadata, failures); Ok(response) }); + mock_metastore .expect_list_splits() .withf(|request| { diff --git a/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs b/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs index 02a4bca1661..e3924c0d8a6 100644 --- a/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs +++ b/quickwit/quickwit-indexing/src/actors/log_publisher_impl.rs @@ -28,7 +28,7 @@ use crate::actors::publisher::{ use crate::models::{NewSplits, SplitsUpdate}; pub(crate) const PUBLISHER_NAME: &str = "Publisher"; -pub(crate) const MERGE_PUBLISHER_NAME: &str = "MergePublisher"; +pub const MERGE_PUBLISHER_NAME: &str = "MergePublisher"; #[async_trait] impl Handler for Publisher { diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 39305ca1935..d51796e9774 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -42,6 +42,7 @@ use tantivy::index::SegmentId; use tantivy::tokenizer::TokenizerManager; use tantivy::{DateTime, Directory, Index, IndexMeta, IndexWriter, SegmentReader}; use tokio::runtime::Handle; +use tokio::sync::Semaphore; use tracing::{debug, error, info, instrument, warn}; use crate::actors::Packager; @@ -56,6 +57,9 @@ pub struct MergeExecutor { doc_mapper: Arc, io_controls: IoControls, merge_packager_mailbox: Mailbox, + /// Bounds concurrent Tantivy merges across pipelines on this node. + /// `None` in the legacy indexing-side merge pipeline. + merge_execution_semaphore: Option>, } #[async_trait] @@ -81,22 +85,38 @@ impl Actor for MergeExecutor { impl Handler for MergeExecutor { type Reply = (); - #[instrument(level = "info", name = "merge_executor", parent = merge_scratch.merge_task.merge_parent_span.id(), skip_all)] + #[instrument(level = "info", name = "merge_executor", parent = merge_scratch.merge_operation.merge_parent_span.id(), skip_all)] async fn handle( &mut self, merge_scratch: MergeScratch, ctx: &ActorContext, ) -> Result<(), ActorExitStatus> { let start = Instant::now(); - let merge_task = merge_scratch.merge_task; - let indexed_split_opt: Option = match merge_task.operation_type { + let MergeScratch { + merge_operation, + merge_task, + tantivy_dirs, + merge_scratch_directory, + .. + } = merge_scratch; + // On nodes running the split compaction architecture, merge pipelines are ephemeral, and we + // need to make sure there aren't too many CPU-bound operations occurring concurrently. + let _cpu_permit = match &self.merge_execution_semaphore { + Some(semaphore) => Some( + ctx.protect_future(semaphore.clone().acquire_owned()) + .await + .expect("merge execution semaphore should never be closed"), + ), + None => None, + }; + let indexed_split_opt: Option = match merge_operation.operation_type { MergeOperationType::Merge => { let merge_res = self .process_merge( - merge_task.merge_split_id.clone(), - merge_task.splits.clone(), - merge_scratch.tantivy_dirs, - merge_scratch.merge_scratch_directory, + merge_operation.merge_split_id.clone(), + merge_operation.splits.clone(), + tantivy_dirs, + merge_scratch_directory, ctx, ) .await; @@ -114,24 +134,24 @@ impl Handler for MergeExecutor { // With a merge policy that marks splits as mature after a day or so, this // limits the noise associated to those failed // merges. - error!(task=?merge_task, err=?err, "failed to merge splits"); + error!(task=?merge_operation, err=?err, "failed to merge splits"); return Ok(()); } } } MergeOperationType::DeleteAndMerge => { assert_eq!( - merge_task.splits.len(), + merge_operation.splits.len(), 1, "Delete tasks can be applied only on one split." ); - assert_eq!(merge_scratch.tantivy_dirs.len(), 1); - let split_with_docs_to_delete = merge_task.splits[0].clone(); + assert_eq!(tantivy_dirs.len(), 1); + let split_with_docs_to_delete = merge_operation.splits[0].clone(); self.process_delete_and_merge( - merge_task.merge_split_id.clone(), + merge_operation.merge_split_id.clone(), split_with_docs_to_delete, - merge_scratch.tantivy_dirs, - merge_scratch.merge_scratch_directory, + tantivy_dirs, + merge_scratch_directory, ctx, ) .await? @@ -141,7 +161,7 @@ impl Handler for MergeExecutor { info!( merged_num_docs = %indexed_split.split_attrs.num_docs, elapsed_secs = %start.elapsed().as_secs_f32(), - operation_type = %merge_task.operation_type, + operation_type = %merge_operation.operation_type, "merge-operation-success" ); ctx.send_message( @@ -151,8 +171,8 @@ impl Handler for MergeExecutor { checkpoint_delta_opt: Default::default(), publish_lock: PublishLock::default(), publish_token_opt: None, - batch_parent_span: merge_task.merge_parent_span.clone(), - merge_task_opt: Some(merge_task), + batch_parent_span: merge_operation.merge_parent_span.clone(), + merge_task_opt: merge_task, }, ) .await?; @@ -305,6 +325,7 @@ impl MergeExecutor { doc_mapper: Arc, io_controls: IoControls, merge_packager_mailbox: Mailbox, + merge_execution_semaphore: Option>, ) -> Self { MergeExecutor { pipeline_id, @@ -312,6 +333,7 @@ impl MergeExecutor { doc_mapper, io_controls, merge_packager_mailbox, + merge_execution_semaphore, } } @@ -642,9 +664,10 @@ mod tests { tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()) } let merge_operation = MergeOperation::new_merge_operation(split_metas); - let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation.clone()); let merge_scratch = MergeScratch { - merge_task, + merge_operation, + merge_task: Some(merge_task), tantivy_dirs, merge_scratch_directory, downloaded_splits_directory, @@ -662,6 +685,7 @@ mod tests { test_sandbox.doc_mapper(), IoControls::default(), merge_packager_mailbox, + None, ); let (merge_executor_mailbox, merge_executor_handle) = test_sandbox .universe() @@ -786,9 +810,10 @@ mod tests { .await?; let tantivy_dir = get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap(); let merge_operation = MergeOperation::new_delete_and_merge_operation(new_split_metadata); - let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation.clone()); let merge_scratch = MergeScratch { - merge_task, + merge_operation, + merge_task: Some(merge_task), tantivy_dirs: vec![tantivy_dir], merge_scratch_directory, downloaded_splits_directory, @@ -806,6 +831,7 @@ mod tests { test_sandbox.doc_mapper(), IoControls::default(), merge_packager_mailbox, + None, ); let (delete_task_executor_mailbox, delete_task_executor_handle) = universe.spawn_builder().spawn(delete_task_executor); diff --git a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs index 656200717c2..0788474bb21 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_pipeline.rs @@ -316,6 +316,7 @@ impl MergePipeline { self.params.doc_mapper.clone(), merge_executor_io_controls, merge_packager_mailbox, + None, ); let (merge_executor_mailbox, merge_executor_handle) = ctx .spawn_actor() diff --git a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs index 1e79ed58ea3..dfe832f0c61 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs @@ -29,7 +29,7 @@ use tracing::error; use super::MergeSplitDownloader; #[cfg(feature = "metrics")] use super::parquet_pipeline::{ParquetMergeSplitDownloader, ParquetMergeTask}; -use crate::merge_policy::{MergeOperation, MergeTask}; +use crate::merge_policy::{MergeOperation, MergeTask, compute_merge_score}; use crate::metrics::{ONGOING_MERGE_OPERATIONS, PENDING_MERGE_BYTES, PENDING_MERGE_OPERATIONS}; pub struct MergePermit { @@ -309,36 +309,17 @@ struct ScheduleMerge { split_downloader_mailbox: Mailbox, } -/// Scores a merge operation for priority scheduling. -/// -/// Higher score = scheduled sooner. Prefers merges that strongly reduce -/// split count relative to their total byte cost. Used by both Tantivy -/// and Parquet merge scheduling. -fn score_merge(num_splits: usize, total_num_bytes: u64) -> u64 { - if total_num_bytes == 0 { - return u64::MAX; - } - // We will remove num_splits and add 1 merged split. - let delta_num_splits = (num_splits - 1) as u64; - // Integer arithmetic to avoid `f64 are not ordered` silliness. - (delta_num_splits << 48) - .checked_div(total_num_bytes) - .unwrap_or(1u64) -} - -fn score_merge_operation(merge_operation: &MergeOperation) -> u64 { - score_merge( - merge_operation.splits.len(), - merge_operation.total_num_bytes(), - ) -} - impl ScheduleMerge { pub fn new( merge_operation: TrackedObject, split_downloader_mailbox: Mailbox, ) -> ScheduleMerge { - let score = score_merge_operation(&merge_operation); + let total_num_bytes: u64 = merge_operation + .splits + .iter() + .map(|split| split.footer_offsets.end) + .sum(); + let score = compute_merge_score(merge_operation.splits.len(), total_num_bytes); ScheduleMerge { score, merge_operation, @@ -399,7 +380,7 @@ impl Handler for MergeSchedulerService { #[cfg(feature = "metrics")] fn score_parquet_merge_operation(merge_operation: &ParquetMergeOperation) -> u64 { - score_merge( + compute_merge_score( merge_operation.splits.len(), merge_operation.total_size_bytes(), ) @@ -483,24 +464,6 @@ mod tests { MergeOperation::new_merge_operation(splits) } - #[test] - fn test_score_merge_operation() { - let score_merge_operation_aux = |num_splits, num_bytes_per_split| { - let merge_operation = build_merge_operation(num_splits, num_bytes_per_split); - score_merge_operation(&merge_operation) - }; - assert!(score_merge_operation_aux(10, 10_000_000) < score_merge_operation_aux(10, 999_999)); - assert!( - score_merge_operation_aux(10, 10_000_000) > score_merge_operation_aux(9, 10_000_000) - ); - assert_eq!( - // 9 - 1 = 8 splits removed. - score_merge_operation_aux(9, 10_000_000), - // 5 - 1 = 4 splits removed. - score_merge_operation_aux(5, 10_000_000 * 9 / 10) - ); - } - #[tokio::test] async fn test_merge_schedule_service_prioritize() { let universe = Universe::new(); diff --git a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs index 5d68bb59285..1911ecc0363 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs @@ -23,7 +23,7 @@ use tantivy::Directory; use tracing::{debug, info, instrument}; use super::MergeExecutor; -use crate::merge_policy::MergeTask; +use crate::merge_policy::{MergeOperation, MergeTask}; use crate::models::MergeScratch; use crate::split_store::IndexingSplitStore; @@ -62,6 +62,35 @@ impl Handler for MergeSplitDownloader { merge_task: MergeTask, ctx: &ActorContext, ) -> Result<(), quickwit_actors::ActorExitStatus> { + let merge_operation = merge_task.merge_operation.as_ref().clone(); + self.download_and_send(merge_operation, ctx).await + } +} + +#[async_trait] +impl Handler for MergeSplitDownloader { + type Reply = (); + + #[instrument( + name = "merge_split_downloader", + parent = merge_operation.merge_parent_span.id(), + skip_all, + )] + async fn handle( + &mut self, + merge_operation: MergeOperation, + ctx: &ActorContext, + ) -> Result<(), quickwit_actors::ActorExitStatus> { + self.download_and_send(merge_operation, ctx).await + } +} + +impl MergeSplitDownloader { + async fn download_and_send( + &mut self, + merge_operation: MergeOperation, + ctx: &ActorContext, + ) -> Result<(), ActorExitStatus> { let merge_scratch_directory = temp_dir::Builder::default() .join("merge") .tempdir_in(self.scratch_directory.path()) @@ -73,13 +102,14 @@ impl Handler for MergeSplitDownloader { .map_err(|error| anyhow::anyhow!(error))?; let tantivy_dirs = self .download_splits( - merge_task.splits_as_slice(), + merge_operation.splits_as_slice(), downloaded_splits_directory.path(), ctx, ) .await?; let msg = MergeScratch { - merge_task, + merge_operation, + merge_task: None, merge_scratch_directory, downloaded_splits_directory, tantivy_dirs, @@ -96,32 +126,30 @@ impl MergeSplitDownloader { download_directory: &Path, ctx: &ActorContext, ) -> Result>, quickwit_actors::ActorExitStatus> { - // we download all of the split files in the scratch directory. - let mut tantivy_dirs = Vec::new(); - for split in splits { - if ctx.kill_switch().is_dead() { - debug!( - split_id = split.split_id(), - "Kill switch was activated. Cancelling download." - ); - return Err(ActorExitStatus::Killed); - } - let io_controls = self - .io_controls - .clone() - .set_progress(ctx.progress().clone()) - .set_kill_switch(ctx.kill_switch().clone()); - let _protect_guard = ctx.protect_zone(); - let tantivy_dir = self - .split_store - .fetch_and_open_split(split.split_id(), download_directory, &io_controls) - .await - .map_err(|error| { - let split_id = split.split_id(); - anyhow::anyhow!(error).context(format!("failed to download split `{split_id}`")) - })?; - tantivy_dirs.push(tantivy_dir); + if ctx.kill_switch().is_dead() { + debug!("kill switch was activated, cancelling download"); + return Err(ActorExitStatus::Killed); } + let io_controls = self + .io_controls + .clone() + .set_progress(ctx.progress().clone()) + .set_kill_switch(ctx.kill_switch().clone()); + let _protect_guard = ctx.protect_zone(); + let download_futures = splits.iter().map(|split| { + let io_controls = io_controls.clone(); + async move { + self.split_store + .fetch_and_open_split(split.split_id(), download_directory, &io_controls) + .await + .map_err(|error| { + let split_id = split.split_id(); + anyhow::anyhow!(error) + .context(format!("failed to download split `{split_id}`")) + }) + } + }); + let tantivy_dirs = futures::future::try_join_all(download_futures).await?; Ok(tantivy_dirs) } } @@ -190,8 +218,8 @@ mod tests { .unwrap() .downcast::() .unwrap(); - assert_eq!(merge_scratch.merge_task.splits_as_slice().len(), 10); - for split in merge_scratch.merge_task.splits_as_slice() { + assert_eq!(merge_scratch.merge_operation.splits_as_slice().len(), 10); + for split in merge_scratch.merge_operation.splits_as_slice() { let split_filename = split_file(split.split_id()); let split_filepath = merge_scratch .downloaded_splits_directory diff --git a/quickwit/quickwit-indexing/src/actors/mod.rs b/quickwit/quickwit-indexing/src/actors/mod.rs index f7a69265662..fdab5df4bcc 100644 --- a/quickwit/quickwit-indexing/src/actors/mod.rs +++ b/quickwit/quickwit-indexing/src/actors/mod.rs @@ -41,7 +41,8 @@ pub use indexing_pipeline::{IndexingPipeline, IndexingPipelineParams}; pub use indexing_service::{ BoxedPipelineHandle, INDEXING_DIR_NAME, IndexingService, IndexingServiceCounters, }; -pub(crate) use log_publisher_impl::{MERGE_PUBLISHER_NAME, PUBLISHER_NAME}; +pub use log_publisher_impl::MERGE_PUBLISHER_NAME; +pub(crate) use log_publisher_impl::PUBLISHER_NAME; pub use merge_executor::{MergeExecutor, combine_partition_ids, merge_split_attrs}; pub use merge_pipeline::{ FinishPendingMergesAndShutdownPipeline, MergePipeline, MergePipelineParams, diff --git a/quickwit/quickwit-indexing/src/actors/parquet_pipeline/indexing_service_impl.rs b/quickwit/quickwit-indexing/src/actors/parquet_pipeline/indexing_service_impl.rs index e39a7e4989f..9d5b82b9a44 100644 --- a/quickwit/quickwit-indexing/src/actors/parquet_pipeline/indexing_service_impl.rs +++ b/quickwit/quickwit-indexing/src/actors/parquet_pipeline/indexing_service_impl.rs @@ -70,7 +70,9 @@ impl IndexingService { // Spawn the Parquet merge pipeline (or reuse an existing one for this // index). The planner mailbox is wired into the MetricsPipeline's // Publisher so newly ingested splits are fed back for merging. - let merge_planner_mailbox = self.get_or_create_parquet_merge_pipeline( + // Returns `None` when there is no local merge scheduler — the metrics + // pipeline then runs without local merging (mirrors the log path). + let merge_planner_mailbox_opt = self.get_or_create_parquet_merge_pipeline( indexing_pipeline_id.index_uid.clone(), &index_config, storage.clone(), @@ -97,7 +99,7 @@ impl IndexingService { use_sketch_processors, partition_key, max_num_partitions: index_config.doc_mapping.max_num_partitions, - parquet_merge_planner_mailbox_opt: Some(merge_planner_mailbox), + parquet_merge_planner_mailbox_opt: merge_planner_mailbox_opt, }; let pipeline = MetricsPipeline::new(pipeline_params); let (mailbox, handle) = ctx.spawn_actor().spawn(pipeline); diff --git a/quickwit/quickwit-indexing/src/lib.rs b/quickwit/quickwit-indexing/src/lib.rs index f0e79499b8a..1888bfaffa7 100644 --- a/quickwit/quickwit-indexing/src/lib.rs +++ b/quickwit/quickwit-indexing/src/lib.rs @@ -14,6 +14,8 @@ #![deny(clippy::disallowed_methods)] +use std::sync::Arc; + use quickwit_actors::{Mailbox, Universe}; use quickwit_cluster::Cluster; use quickwit_common::pubsub::EventBroker; @@ -27,11 +29,14 @@ use tracing::info; use crate::actors::MergeSchedulerService; pub use crate::actors::{ BoxedPipelineHandle, FinishPendingMergesAndShutdownPipeline, IndexingError, IndexingPipeline, - IndexingPipelineParams, IndexingService, Sequencer, SplitsUpdateMailbox, + IndexingPipelineParams, IndexingService, MERGE_PUBLISHER_NAME, Sequencer, SplitsUpdateMailbox, }; pub use crate::controlled_directory::ControlledDirectory; use crate::models::IndexingStatistics; -pub use crate::split_store::{IndexingSplitStore, get_tantivy_directory_from_split_bundle}; +pub use crate::split_store::{ + IndexingSplitCache, IndexingSplitStore, SplitStoreQuota, + get_tantivy_directory_from_split_bundle, +}; pub mod actors; mod controlled_directory; @@ -69,13 +74,11 @@ pub async fn start_indexing_service( ingester_pool: IngesterPool, storage_resolver: StorageResolver, event_broker: EventBroker, + merge_scheduler_mailbox_opt: Option>, + indexing_split_cache: Arc, ) -> anyhow::Result> { info!("starting indexer service"); let ingest_api_service_mailbox = universe.get_one::(); - let (merge_scheduler_mailbox, _) = universe.spawn_builder().spawn(MergeSchedulerService::new( - config.indexer_config.merge_concurrency.get(), - )); - // Spawn indexing service. let indexing_service = IndexingService::new( config.node_id.clone(), config.data_dir_path.to_path_buf(), @@ -84,10 +87,11 @@ pub async fn start_indexing_service( cluster, metastore.clone(), ingest_api_service_mailbox, - merge_scheduler_mailbox, + merge_scheduler_mailbox_opt, ingester_pool, storage_resolver, event_broker, + indexing_split_cache, ) .await?; let (indexing_service, _) = universe.spawn_builder().spawn(indexing_service); diff --git a/quickwit/quickwit-indexing/src/merge_policy/mod.rs b/quickwit/quickwit-indexing/src/merge_policy/mod.rs index e94e70d3370..408dc7cd798 100644 --- a/quickwit/quickwit-indexing/src/merge_policy/mod.rs +++ b/quickwit/quickwit-indexing/src/merge_policy/mod.rs @@ -121,6 +121,31 @@ impl MergeOperation { pub fn splits_as_slice(&self) -> &[SplitMetadata] { self.splits.as_slice() } + + pub fn merge_level(&self) -> usize { + self.splits + .iter() + .map(|s| s.num_merge_ops) + .max() + .unwrap_or(0) + } +} + +// The higher, the sooner we will execute the merge operation. +// A good merge operation: +// - strongly reduces the number of splits +// - is light. +pub fn compute_merge_score(num_splits: usize, total_num_bytes: u64) -> u64 { + if total_num_bytes == 0 { + // Silly corner case that should never happen. + return u64::MAX; + } + // We will remove num_splits and add 1 merge split. + let delta_num_splits = num_splits.saturating_sub(1) as u64; + // Integer arithmetic to avoid `f64 are not ordered` silliness. + (delta_num_splits << 48) + .checked_div(total_num_bytes) + .unwrap_or(1u64) } impl fmt::Debug for MergeOperation { @@ -254,6 +279,21 @@ pub mod tests { }; use crate::models::{NewSplits, create_split_metadata}; + #[test] + fn test_score() { + // Lighter merge at the same split count scores higher. + assert!(compute_merge_score(10, 100_000_000) < compute_merge_score(10, 9_999_990)); + // More splits removed at the same total bytes scores higher. + assert!(compute_merge_score(10, 100_000_000) > compute_merge_score(9, 100_000_000)); + // Equal `(delta_splits / total_bytes)` ratios yield equal scores. + assert_eq!( + // delta=8, 90M bytes. + compute_merge_score(9, 90_000_000), + // delta=4, 45M bytes (same 8/90M ratio). + compute_merge_score(5, 45_000_000), + ); + } + fn pow_of_10(n: usize) -> usize { 10usize.pow(n as u32) } diff --git a/quickwit/quickwit-indexing/src/models/merge_scratch.rs b/quickwit/quickwit-indexing/src/models/merge_scratch.rs index 392ca60b42c..a0296f69d2d 100644 --- a/quickwit/quickwit-indexing/src/models/merge_scratch.rs +++ b/quickwit/quickwit-indexing/src/models/merge_scratch.rs @@ -15,14 +15,15 @@ use quickwit_common::temp_dir::TempDirectory; use tantivy::Directory; -use crate::merge_policy::MergeTask; +use crate::merge_policy::{MergeOperation, MergeTask}; #[derive(Debug)] pub struct MergeScratch { - /// A [`MergeTask`] tracked by either the `MergePlanner` or the `DeleteTaskPlanner` - /// See planners docs to understand the usage. - pub merge_task: MergeTask, - /// Scratch directory for computing the merge. + /// The merge operation data (splits, merge_split_id, operation type). + pub merge_operation: MergeOperation, + // TODO: remove once the old MergePipeline is deleted and the + // DeleteTaskPipeline no longer routes through MergeSchedulerService. + pub merge_task: Option, pub merge_scratch_directory: TempDirectory, pub downloaded_splits_directory: TempDirectory, pub tantivy_dirs: Vec>, diff --git a/quickwit/quickwit-indexing/src/split_store/indexing_split_cache.rs b/quickwit/quickwit-indexing/src/split_store/indexing_split_cache.rs index 70f19410ecf..47676669487 100644 --- a/quickwit/quickwit-indexing/src/split_store/indexing_split_cache.rs +++ b/quickwit/quickwit-indexing/src/split_store/indexing_split_cache.rs @@ -21,11 +21,13 @@ use std::time::{Duration, SystemTime}; use anyhow::Context; use bytesize::ByteSize; +use quickwit_common::fs::get_cache_directory_path; use quickwit_common::split_file; +use quickwit_config::IndexerConfig; use quickwit_directories::BundleDirectory; use quickwit_storage::StorageResult; use tantivy::Directory; -use tantivy::directory::MmapDirectory; +use tantivy::directory::{Advice, MmapDirectory}; use tokio::sync::Mutex; use tracing::{debug, error, warn}; use ulid::Ulid; @@ -38,12 +40,15 @@ const SPLIT_MAX_AGE: Duration = Duration::from_secs(2 * 24 * 3_600); // 2 days pub fn get_tantivy_directory_from_split_bundle( split_file: &Path, ) -> StorageResult> { - let mmap_directory = MmapDirectory::open(split_file.parent().ok_or_else(|| { - io::Error::new( - io::ErrorKind::NotFound, - format!("couldn't find parent for {}", split_file.display()), - ) - })?)?; + let mmap_directory = MmapDirectory::open_with_madvice( + split_file.parent().ok_or_else(|| { + io::Error::new( + io::ErrorKind::NotFound, + format!("couldn't find parent for {}", split_file.display()), + ) + })?, + Advice::Sequential, + )?; let split_fileslice = mmap_directory.open_read(Path::new(&split_file))?; Ok(Box::new(BundleDirectory::open_split(split_fileslice)?)) } @@ -361,6 +366,22 @@ impl IndexingSplitCache { IndexingSplitCache { inner } } + /// Builds an [`IndexingSplitCache`] from an [`IndexerConfig`], rooted at + /// `/indexer-split-cache/splits`. + pub async fn from_config( + indexer_config: &IndexerConfig, + data_dir_path: &Path, + ) -> anyhow::Result { + let cache_path = get_cache_directory_path(data_dir_path); + let quota = SplitStoreQuota::try_new( + indexer_config.split_store_max_num_splits, + indexer_config.split_store_max_num_bytes, + )?; + IndexingSplitCache::open(cache_path, quota) + .await + .context("failed to open indexing split cache") + } + /// Try to open an existing local split store directory. /// /// If the directory does not exists, it will be created. @@ -508,6 +529,7 @@ mod tests { use std::time::Duration; use bytesize::ByteSize; + use quickwit_config::IndexerConfig; use quickwit_directories::BundleDirectory; use quickwit_storage::{PutPayload, SplitPayloadBuilder}; use tantivy::Directory; @@ -530,6 +552,20 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_from_config() { + let data_dir = tempdir().unwrap(); + let config = IndexerConfig::for_test().unwrap(); + let _cache = IndexingSplitCache::from_config(&config, data_dir.path()) + .await + .unwrap(); + let cache_dir = data_dir.path().join("indexer-split-cache").join("splits"); + assert!( + cache_dir.is_dir(), + "from_config must open (and create) the cache directory", + ); + } + #[tokio::test] async fn test_local_split_store_load_existing_splits() -> anyhow::Result<()> { let temp_dir = tempfile::tempdir()?; diff --git a/quickwit/quickwit-indexing/src/test_utils.rs b/quickwit/quickwit-indexing/src/test_utils.rs index 82198c820f4..e6c824ce788 100644 --- a/quickwit/quickwit-indexing/src/test_utils.rs +++ b/quickwit/quickwit-indexing/src/test_utils.rs @@ -40,6 +40,7 @@ use serde_json::Value as JsonValue; use crate::actors::IndexingService; use crate::models::{DetachIndexingPipeline, IndexingStatistics, SpawnPipeline}; +use crate::split_store::IndexingSplitCache; /// Creates a Test environment. /// @@ -123,10 +124,11 @@ impl TestSandbox { cluster, metastore.clone(), Some(ingest_api_service), - merge_scheduler_mailbox, + Some(merge_scheduler_mailbox), IngesterPool::default(), storage_resolver.clone(), EventBroker::default(), + Arc::new(IndexingSplitCache::no_caching()), ) .await?; let (indexing_service, _indexing_service_handle) = diff --git a/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs b/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs index a7385ca0946..9c2fe3effde 100644 --- a/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs +++ b/quickwit/quickwit-integration-tests/src/test_utils/cluster_sandbox.rs @@ -76,6 +76,11 @@ impl TestNodeConfig { tcp_listener_resolver.add_listener(grpc_tcp_listener).await; config.indexer_config.enable_otlp_endpoint = self.enable_otlp; config.enabled_services.clone_from(&self.services); + if config.enabled_services.contains(&QuickwitService::Indexer) + && !config.indexer_config.enable_standalone_compactors + { + config.enabled_services.insert(QuickwitService::Compactor); + } config.jaeger_config.enable_endpoint = true; config.cluster_id.clone_from(&cluster_id); config.node_id = NodeId::new(format!("test-node-{node_idx}")); @@ -678,7 +683,11 @@ impl ClusterSandbox { #[tokio::test] async fn test_sandbox_happy_path() { let sandbox = ClusterSandboxBuilder::default() - .add_node([QuickwitService::ControlPlane, QuickwitService::Metastore]) + .add_node([ + QuickwitService::ControlPlane, + QuickwitService::Metastore, + QuickwitService::Janitor, + ]) .add_node([QuickwitService::Searcher]) .add_node([QuickwitService::Indexer]) .build_and_start() @@ -691,7 +700,11 @@ async fn test_sandbox_happy_path() { #[tokio::test] async fn test_sandbox_add_node_dynamically() { let mut sandbox = ClusterSandboxBuilder::default() - .add_node([QuickwitService::ControlPlane, QuickwitService::Metastore]) + .add_node([ + QuickwitService::ControlPlane, + QuickwitService::Metastore, + QuickwitService::Janitor, + ]) .add_node([QuickwitService::Searcher]) .build_and_start() .await; diff --git a/quickwit/quickwit-integration-tests/src/tests/basic_tests.rs b/quickwit/quickwit-integration-tests/src/tests/basic_tests.rs index 91c6b0ebeb1..96bf311cc01 100644 --- a/quickwit/quickwit-integration-tests/src/tests/basic_tests.rs +++ b/quickwit/quickwit-integration-tests/src/tests/basic_tests.rs @@ -162,3 +162,47 @@ async fn test_multi_nodes_cluster() { sandbox.shutdown().await.unwrap(); } + +#[tokio::test] +async fn test_indexer_implicitly_runs_compactor() { + quickwit_common::setup_logging_for_tests(); + let sandbox = ClusterSandboxBuilder::default() + .add_node([QuickwitService::Searcher]) + .add_node([QuickwitService::Metastore]) + .add_node([QuickwitService::ControlPlane]) + .add_node([QuickwitService::Janitor]) + .add_node([QuickwitService::Indexer]) + .build_and_start() + .await; + + let cluster_snapshot = sandbox + .rest_client(QuickwitService::Searcher) + .cluster() + .snapshot() + .await + .unwrap(); + assert_eq!(cluster_snapshot.ready_nodes.len(), 5); + + // The indexer node should also advertise the compactor service. + let indexer_node = cluster_snapshot + .chitchat_state_snapshot + .node_states + .iter() + .find(|node_state| { + node_state + .get("enabled_services") + .map(|s| s.contains("indexer")) + .unwrap_or(false) + }) + .expect("indexer node not found in cluster state"); + let services = indexer_node.get("enabled_services").unwrap(); + assert!( + services.contains("compactor"), + "indexer node should implicitly run the compactor, got: {services}" + ); + + sandbox.shutdown().await.unwrap(); +} + +// TODO: add test_standalone_compactor_node once the test config builder +// resolves QW_ENABLE_STANDALONE_COMPACTORS from the environment. diff --git a/quickwit/quickwit-janitor/Cargo.toml b/quickwit/quickwit-janitor/Cargo.toml index e8063895f24..c633c8e4aac 100644 --- a/quickwit/quickwit-janitor/Cargo.toml +++ b/quickwit/quickwit-janitor/Cargo.toml @@ -27,6 +27,7 @@ utoipa = { workspace = true } quickwit-actors = { workspace = true } quickwit-common = { workspace = true } quickwit-metrics = { workspace = true } +quickwit-compaction = { workspace = true } quickwit-config = { workspace = true } quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } diff --git a/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs b/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs index be11509c3d7..37884040715 100644 --- a/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs +++ b/quickwit/quickwit-janitor/src/actors/delete_task_pipeline.rs @@ -207,6 +207,7 @@ impl DeleteTaskPipeline { doc_mapper.clone(), delete_executor_io_controls, packager_mailbox, + None, ); let (delete_executor_mailbox, task_executor_supervisor_handler) = ctx.spawn_actor().supervise(delete_executor); diff --git a/quickwit/quickwit-janitor/src/janitor_service.rs b/quickwit/quickwit-janitor/src/janitor_service.rs index b8bfe5e54b0..1e843f57b9b 100644 --- a/quickwit/quickwit-janitor/src/janitor_service.rs +++ b/quickwit/quickwit-janitor/src/janitor_service.rs @@ -16,6 +16,7 @@ use async_trait::async_trait; use quickwit_actors::{ Actor, ActorContext, ActorExitStatus, ActorHandle, ActorState, Handler, Healthz, }; +use quickwit_compaction::planner::CompactionPlanner; use serde_json::{Value as JsonValue, json}; use crate::actors::{DeleteTaskService, GarbageCollector, RetentionPolicyExecutor}; @@ -24,6 +25,7 @@ pub struct JanitorService { delete_task_service_handle: Option>, garbage_collector_handle: ActorHandle, retention_policy_executor_handle: ActorHandle, + compaction_planner_handle: Option>, } impl JanitorService { @@ -31,11 +33,13 @@ impl JanitorService { delete_task_service_handle: Option>, garbage_collector_handle: ActorHandle, retention_policy_executor_handle: ActorHandle, + compaction_planner_handle: Option>, ) -> Self { Self { delete_task_service_handle, garbage_collector_handle, retention_policy_executor_handle, + compaction_planner_handle, } } @@ -46,9 +50,16 @@ impl JanitorService { } else { true }; + let compaction_planner_is_not_failure: bool = + if let Some(compaction_planner_handle) = &self.compaction_planner_handle { + compaction_planner_handle.state() != ActorState::Failure + } else { + true + }; delete_task_is_not_failure && self.garbage_collector_handle.state() != ActorState::Failure && self.retention_policy_executor_handle.state() != ActorState::Failure + && compaction_planner_is_not_failure } } diff --git a/quickwit/quickwit-janitor/src/lib.rs b/quickwit/quickwit-janitor/src/lib.rs index bef73160377..7e6734d9016 100644 --- a/quickwit/quickwit-janitor/src/lib.rs +++ b/quickwit/quickwit-janitor/src/lib.rs @@ -14,8 +14,9 @@ #![deny(clippy::disallowed_methods)] -use quickwit_actors::{Mailbox, Universe}; +use quickwit_actors::{ActorHandle, Mailbox, Universe}; use quickwit_common::pubsub::EventBroker; +use quickwit_compaction::planner::CompactionPlanner; use quickwit_config::NodeConfig; use quickwit_indexing::actors::MergeSchedulerService; use quickwit_metastore::SplitInfo; @@ -39,6 +40,7 @@ use crate::actors::{DeleteTaskService, GarbageCollector, RetentionPolicyExecutor /// Schema used for the OpenAPI generation which are apart of this crate. pub struct JanitorApiSchemas; +#[allow(clippy::too_many_arguments)] pub async fn start_janitor_service( universe: &Universe, config: &NodeConfig, @@ -47,6 +49,7 @@ pub async fn start_janitor_service( storage_resolver: StorageResolver, event_broker: EventBroker, run_delete_task_service: bool, + compaction_planner_handle: Option>, ) -> anyhow::Result> { info!("starting janitor service"); let garbage_collector = GarbageCollector::new(metastore.clone(), storage_resolver.clone()); @@ -77,6 +80,7 @@ pub async fn start_janitor_service( delete_task_service_handle, garbage_collector_handle, retention_policy_executor_handle, + compaction_planner_handle, ); let (janitor_service_mailbox, _janitor_service_handle) = universe.spawn_builder().spawn(janitor_service); diff --git a/quickwit/quickwit-metastore/migrations/postgresql/29_create-index-splits-published-maturity-timestamp.down.sql b/quickwit/quickwit-metastore/migrations/postgresql/29_create-index-splits-published-maturity-timestamp.down.sql new file mode 100644 index 00000000000..c8bd22af8c3 --- /dev/null +++ b/quickwit/quickwit-metastore/migrations/postgresql/29_create-index-splits-published-maturity-timestamp.down.sql @@ -0,0 +1,4 @@ +-- no-transaction +-- CONCURRENTLY matches the up migration: avoids blocking writes on `splits` +-- while the index is being dropped. Cannot run inside a transaction. +DROP INDEX CONCURRENTLY IF EXISTS splits_published_maturity_timestamp_idx; diff --git a/quickwit/quickwit-metastore/migrations/postgresql/29_create-index-splits-published-maturity-timestamp.up.sql b/quickwit/quickwit-metastore/migrations/postgresql/29_create-index-splits-published-maturity-timestamp.up.sql new file mode 100644 index 00000000000..f6a4c7a53d0 --- /dev/null +++ b/quickwit/quickwit-metastore/migrations/postgresql/29_create-index-splits-published-maturity-timestamp.up.sql @@ -0,0 +1,24 @@ +-- no-transaction +-- Index for the compaction planner's scan, which on every tick reads up to +-- LIMIT splits matching: +-- split_state = 'Published' +-- maturity_timestamp > now() +-- ordered by maturity_timestamp ascending. The planner keeps a local set of +-- already-tracked splits and dedups against it, so re-reading the immature +-- set every tick is intentional -- it's how the planner recovers splits +-- whose merge timed out or failed. +-- +-- The btree on (maturity_timestamp, split_id) lets postgres seek to the live +-- "still immature" range in index order, satisfying both the filter and the +-- ORDER BY without an extra sort. The split_id column is included as a +-- tiebreaker so postgres returns deterministic pages under LIMIT. +-- +-- The partial predicate is restricted to split_state = 'Published' because +-- partial-index predicates must be IMMUTABLE; "now()" cannot appear here. +-- +-- CONCURRENTLY avoids taking a SHARE lock on splits, which would block all +-- writes for the duration of the build. CONCURRENTLY cannot run inside a +-- transaction block, hence the `-- no-transaction` directive above. +CREATE INDEX CONCURRENTLY IF NOT EXISTS splits_published_maturity_timestamp_idx + ON splits (maturity_timestamp, split_id) + WHERE split_state = 'Published'; diff --git a/quickwit/quickwit-metastore/src/lib.rs b/quickwit/quickwit-metastore/src/lib.rs index b437a1c1bef..7e02fcd0ec9 100644 --- a/quickwit/quickwit-metastore/src/lib.rs +++ b/quickwit/quickwit-metastore/src/lib.rs @@ -48,7 +48,7 @@ pub use metastore::{ ListParquetSplitsQuery, ListParquetSplitsRequestExt, ListParquetSplitsResponseExt, ListSplitsQuery, ListSplitsRequestExt, ListSplitsResponseExt, MetastoreServiceExt, MetastoreServiceStreamSplitsExt, ParquetSplitRecord, PublishParquetSplitsRequestExt, - PublishSplitsRequestExt, StageParquetSplitsRequestExt, StageSplitsRequestExt, + PublishSplitsRequestExt, SortBy, StageParquetSplitsRequestExt, StageSplitsRequestExt, UpdateIndexRequestExt, UpdateSourceRequestExt, file_backed, }; pub use metastore_factory::{MetastoreFactory, UnsupportedMetastore}; diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index 88b95d003ee..444032b8ae2 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -1290,6 +1290,14 @@ fn split_query_predicate(split: &&Split, query: &ListSplitsQuery) -> bool { } } + if query + .excluded_split_ids + .iter() + .any(|excluded| excluded == &split.split_metadata.split_id) + { + return false; + } + true } diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs index 8752d348e02..6ebef78e583 100644 --- a/quickwit/quickwit-metastore/src/metastore/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/mod.rs @@ -992,13 +992,28 @@ pub struct ListSplitsQuery { /// Only return splits whose (index_uid, split_id) are lexicographically after this split pub after_split: Option<(IndexUid, SplitId)>, + + /// Exclude any split whose `split_id` appears in this list. Empty means no + /// exclusion. + pub excluded_split_ids: Vec, } #[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] +/// Ordering applied to the result of a [`ListSplitsQuery`]. pub enum SortBy { + /// No ordering — the metastore may return splits in any order. None, + /// Order by `(delete_opstamp ASC, publish_timestamp ASC)`. Used by the + /// delete pipeline to process the splits with the most pending delete + /// work first. Staleness, + /// Order by `(index_uid ASC, split_id ASC)`, matching the splits-table + /// primary key. Used for stable pagination across all indexes. IndexUid, + /// Order by `(maturity_timestamp ASC, split_id ASC)`. Used by the + /// compaction planner so that under a backlog the splits closest to + /// becoming mature are processed first. + MaturityTimestamp, } impl SortBy { @@ -1024,6 +1039,16 @@ impl SortBy { .split_id .cmp(&right_split.split_metadata.split_id) }), + SortBy::MaturityTimestamp => left_split + .split_metadata + .maturity_unix_timestamp() + .cmp(&right_split.split_metadata.maturity_unix_timestamp()) + .then_with(|| { + left_split + .split_metadata + .split_id + .cmp(&right_split.split_metadata.split_id) + }), } } } @@ -1047,6 +1072,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + excluded_split_ids: Vec::new(), } } @@ -1071,6 +1097,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + excluded_split_ids: Vec::new(), }) } @@ -1091,6 +1118,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + excluded_split_ids: Vec::new(), } } @@ -1274,12 +1302,25 @@ impl ListSplitsQuery { self } + /// Sorts the splits by maturity_timestamp ascending, with split_id as a tiebreaker. + pub fn sort_by_maturity_timestamp(mut self) -> Self { + self.sort_by = SortBy::MaturityTimestamp; + self + } + /// Only return splits whose (index_uid, split_id) are lexicographically after this split. /// This is only useful if results are sorted by index_uid and split_id. pub fn after_split(mut self, split_meta: &SplitMetadata) -> Self { self.after_split = Some((split_meta.index_uid.clone(), split_meta.split_id.clone())); self } + + /// Excludes splits whose `split_id` is in the provided list. Used by the + /// compaction planner to skip splits it is already tracking locally. + pub fn with_excluded_split_ids(mut self, excluded_split_ids: Vec) -> Self { + self.excluded_split_ids = excluded_split_ids; + self + } } #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index e9d0b1ed8fa..9675c372102 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -873,7 +873,7 @@ impl MetastoreService for PostgresqlMetastore { let list_splits_query = request.deserialize_list_splits_query()?; let mut sql_query_builder = Query::select(); sql_query_builder.column(Asterisk).from(Splits::Table); - append_query_filters_and_order_by(&mut sql_query_builder, &list_splits_query); + append_query_filters_and_order_by(&mut sql_query_builder, list_splits_query); let (sql_query, values) = sql_query_builder.build_sqlx(PostgresQueryBuilder); let pg_split_stream = SplitStream::new( @@ -3194,7 +3194,7 @@ mod tests { let index_uid = IndexUid::new_with_random_ulid("test-index"); let query = ListSplitsQuery::for_index(index_uid.clone()).with_split_state(SplitState::Staged); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3208,7 +3208,7 @@ mod tests { let query = ListSplitsQuery::for_index(index_uid.clone()).with_split_state(SplitState::Published); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3222,7 +3222,7 @@ mod tests { let query = ListSplitsQuery::for_index(index_uid.clone()) .with_split_states([SplitState::Published, SplitState::MarkedForDeletion]); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3234,7 +3234,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_index(index_uid.clone()).with_update_timestamp_lt(51); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3246,7 +3246,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_index(index_uid.clone()).with_create_timestamp_lte(55); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3260,7 +3260,7 @@ mod tests { let maturity_evaluation_datetime = OffsetDateTime::from_unix_timestamp(55).unwrap(); let query = ListSplitsQuery::for_index(index_uid.clone()) .retain_mature(maturity_evaluation_datetime); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3274,7 +3274,7 @@ mod tests { let query = ListSplitsQuery::for_index(index_uid.clone()) .retain_immature(maturity_evaluation_datetime); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3286,7 +3286,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_index(index_uid.clone()).with_delete_opstamp_gte(4); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3298,7 +3298,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_index(index_uid.clone()).with_time_range_start_gt(45); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3310,7 +3310,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_index(index_uid.clone()).with_time_range_end_lt(45); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3326,7 +3326,7 @@ mod tests { is_present: false, tag: "tag-2".to_string(), }); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3339,7 +3339,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_index(index_uid.clone()).with_offset(4); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3352,7 +3352,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_index(index_uid.clone()).sort_by_index_uid(); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3370,7 +3370,7 @@ mod tests { split_id: "my_split".to_string(), ..Default::default() }); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3383,7 +3383,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_all_indexes().with_split_state(SplitState::Staged); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3394,7 +3394,7 @@ mod tests { let sql = select_statement.column(Asterisk).from(Splits::Table); let query = ListSplitsQuery::for_all_indexes().with_max_time_range_end(42); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), @@ -3411,7 +3411,8 @@ mod tests { let query = ListSplitsQuery::for_index(index_uid.clone()) .with_time_range_start_gt(0) .with_time_range_end_lt(40); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); + assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3425,7 +3426,7 @@ mod tests { let query = ListSplitsQuery::for_index(index_uid.clone()) .with_time_range_start_gt(45) .with_delete_opstamp_gt(0); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3439,7 +3440,7 @@ mod tests { let query = ListSplitsQuery::for_index(index_uid.clone()) .with_update_timestamp_lt(51) .with_create_timestamp_lte(63); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3456,7 +3457,7 @@ mod tests { is_present: true, tag: "tag-1".to_string(), }); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3471,7 +3472,7 @@ mod tests { let query = ListSplitsQuery::try_from_index_uids(vec![index_uid.clone(), index_uid_2.clone()]) .unwrap(); - append_query_filters_and_order_by(sql, &query); + append_query_filters_and_order_by(sql, query); assert_eq!( sql.to_string(PostgresQueryBuilder), format!( @@ -3480,6 +3481,20 @@ mod tests { ); } + #[test] + fn test_list_splits_query_excluded_split_ids() { + let mut select_statement = Query::select(); + let sql = select_statement.column(Asterisk).from(Splits::Table); + + let query = ListSplitsQuery::for_all_indexes() + .with_excluded_split_ids(vec!["s1".to_string(), "s2".to_string()]); + append_query_filters_and_order_by(sql, query); + assert_eq!( + sql.to_string(PostgresQueryBuilder), + r#"SELECT * FROM "splits" WHERE split_id <> ALL(ARRAY ['s1','s2']::text[])"# + ); + } + #[test] fn test_index_id_pattern_like_query() { assert_eq!( diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/migrator.rs b/quickwit/quickwit-metastore/src/metastore/postgres/migrator.rs index 03c40eec666..aa51a42c43e 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/migrator.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/migrator.rs @@ -27,40 +27,45 @@ fn get_migrations() -> Migrator { /// Initializes the database and runs the SQL migrations stored in the /// `quickwit-metastore/migrations` directory. +/// +/// Runs on a raw pooled connection -- not wrapped in an outer transaction. +/// sqlx's `Migrator::run_direct` handles per-migration transactionality +/// itself, honoring each migration's `no_tx` flag (set by a +/// `-- no-transaction` directive as the first line of the migration file). +/// Wrapping the run in our own transaction would defeat that for migrations +/// that must execute outside a transaction block (e.g. `CREATE INDEX +/// CONCURRENTLY`). +/// +/// Atomicity is per-migration, not per-run: a failure on migration N leaves +/// migrations 1..N-1 applied and committed in `_sqlx_migrations`. The +/// operator fixes the failing migration and re-runs. #[instrument(skip_all)] pub(super) async fn run_migrations( pool: &TrackedPool, skip_migrations: bool, skip_locking: bool, ) -> MetastoreResult<()> { - let mut tx = pool.begin().await?; - let conn = tx.acquire().await?; - + let mut conn = pool.acquire().await?; let mut migrator = get_migrations(); if skip_locking { migrator.set_locking(false); } - if !skip_migrations { - // this is an hidden function, made to get "around the annoying "implementation of `Acquire` - // is not general enough" error", which is the error we get otherwise. - let migrate_result = migrator.run_direct(conn).await; + if skip_migrations { + return check_migrations(migrator, &mut conn).await; + } - let Err(migrate_error) = migrate_result else { - tx.commit().await?; - return Ok(()); - }; - tx.rollback().await?; + // this is an hidden function, made to get "around the annoying "implementation of `Acquire` + // is not general enough" error", which is the error we get otherwise. + if let Err(migrate_error) = migrator.run_direct(&mut *conn).await { error!(error=%migrate_error, "failed to run PostgreSQL migrations"); - - Err(MetastoreError::Internal { + return Err(MetastoreError::Internal { message: "failed to run PostgreSQL migrations".to_string(), cause: migrate_error.to_string(), - }) - } else { - check_migrations(migrator, conn).await + }); } + Ok(()) } async fn check_migrations(migrator: Migrator, conn: &mut PgConnection) -> MetastoreResult<()> { diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs index 28519b8a294..d63755a06f6 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs @@ -19,7 +19,7 @@ use std::time::Duration; use quickwit_common::uri::Uri; use quickwit_proto::metastore::{MetastoreError, MetastoreResult}; -use sea_query::{Expr, Func, Order, SelectStatement, any}; +use sea_query::{ArrayType, Expr, Func, Order, SelectStatement, Value, any}; use sqlx::postgres::{PgConnectOptions, PgPoolOptions}; use sqlx::{ConnectOptions, Postgres}; use tracing::error; @@ -96,10 +96,7 @@ pub(super) fn append_range_filters( }; } -pub(super) fn append_query_filters_and_order_by( - sql: &mut SelectStatement, - query: &ListSplitsQuery, -) { +pub(super) fn append_query_filters_and_order_by(sql: &mut SelectStatement, query: ListSplitsQuery) { if let Some(index_uids) = &query.index_uids { // Note: `ListSplitsQuery` builder enforces a non empty `index_uids` list. // TODO we should explore IN VALUES, = ANY and similar constructs in case they perform @@ -202,6 +199,23 @@ pub(super) fn append_query_filters_and_order_by( ); } + if !query.excluded_split_ids.is_empty() { + // One bind regardless of list length: avoids postgres' 65535 param + // ceiling and keeps parse-time O(1) in the exclude size. The `$1` is + // postgres' placeholder; sea-query substitutes it with the next bind + // slot at build time. + let excluded_values: Vec = query + .excluded_split_ids + .into_iter() + .map(|split_id| Value::String(Some(Box::new(split_id)))) + .collect(); + let excluded_array = Value::Array(ArrayType::String, Some(Box::new(excluded_values))); + sql.cond_where(Expr::cust_with_values( + "split_id <> ALL($1::text[])", + [excluded_array], + )); + } + match query.sort_by { SortBy::Staleness => { sql.order_by(Splits::DeleteOpstamp, Order::Asc) @@ -211,6 +225,10 @@ pub(super) fn append_query_filters_and_order_by( sql.order_by(Splits::IndexUid, Order::Asc) .order_by(Splits::SplitId, Order::Asc); } + SortBy::MaturityTimestamp => { + sql.order_by(Splits::MaturityTimestamp, Order::Asc) + .order_by(Splits::SplitId, Order::Asc); + } SortBy::None => (), } diff --git a/quickwit/quickwit-metastore/src/split_metadata.rs b/quickwit/quickwit-metastore/src/split_metadata.rs index fe88fe379d3..95066e87838 100644 --- a/quickwit/quickwit-metastore/src/split_metadata.rs +++ b/quickwit/quickwit-metastore/src/split_metadata.rs @@ -218,6 +218,18 @@ impl SplitMetadata { } } + /// Returns the unix timestamp at which the split becomes mature, or 0 if + /// the split is already mature (matching the metastore's stored + /// `maturity_timestamp` column). + pub fn maturity_unix_timestamp(&self) -> i64 { + match self.maturity { + SplitMaturity::Mature => 0, + SplitMaturity::Immature { maturation_period } => { + self.create_timestamp + maturation_period.as_secs() as i64 + } + } + } + #[cfg(any(test, feature = "testsuite"))] /// Returns an instance of `SplitMetadata` for testing. pub fn for_test(split_id: SplitId) -> SplitMetadata { diff --git a/quickwit/quickwit-proto/build.rs b/quickwit/quickwit-proto/build.rs index a5f2e1f2b53..20724113bb2 100644 --- a/quickwit/quickwit-proto/build.rs +++ b/quickwit/quickwit-proto/build.rs @@ -35,6 +35,22 @@ fn main() -> Result<(), Box> { .run() .unwrap(); + // Compaction service. + let mut prost_config = prost_build::Config::default(); + prost_config.file_descriptor_set_path("src/codegen/quickwit/compaction_descriptor.bin"); + prost_config.extern_path(".quickwit.common.IndexUid", "crate::types::IndexUid"); + + Codegen::builder() + .with_prost_config(prost_config) + .with_protos(&["protos/quickwit/compaction.proto"]) + .with_includes(&["protos"]) + .with_output_dir("src/codegen/quickwit") + .with_result_type_path("crate::compaction::CompactionResult") + .with_error_type_path("crate::compaction::CompactionError") + .generate_rpc_name_impls() + .run() + .unwrap(); + // Control plane. let mut prost_config = prost_build::Config::default(); prost_config.file_descriptor_set_path("src/codegen/quickwit/control_plane_descriptor.bin"); diff --git a/quickwit/quickwit-proto/protos/quickwit/compaction.proto b/quickwit/quickwit-proto/protos/quickwit/compaction.proto new file mode 100644 index 00000000000..3c82d6b34c7 --- /dev/null +++ b/quickwit/quickwit-proto/protos/quickwit/compaction.proto @@ -0,0 +1,64 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package quickwit.compaction; + +import "quickwit/common.proto"; + +service CompactionPlannerService { + rpc ReportStatus(ReportStatusRequest) returns (ReportStatusResponse); +} + +message ReportStatusRequest { + string node_id = 1; + uint32 available_slots = 2; + repeated CompactionInProgress in_progress = 3; + repeated CompactionSuccess successes = 4; + repeated CompactionFailure failures = 5; +} + +message CompactionInProgress { + string task_id = 1; + quickwit.common.IndexUid index_uid = 2; + string source_id = 3; + repeated string split_ids = 4; +} + +message CompactionSuccess { + string task_id = 1; +} + +message CompactionFailure { + string task_id = 1; + string error_message = 2; +} + +message ReportStatusResponse { + repeated MergeTaskAssignment new_tasks = 1; +} + +message MergeTaskAssignment { + string task_id = 1; + repeated string splits_metadata_json = 2; + string doc_mapping_json = 3; + string search_settings_json = 4; + string indexing_settings_json = 5; + string retention_policy_json = 6; + quickwit.common.IndexUid index_uid = 7; + string source_id = 8; + string index_storage_uri = 9; + uint64 merge_level = 10; +} \ No newline at end of file diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.compaction.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.compaction.rs new file mode 100644 index 00000000000..708c35fa5af --- /dev/null +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.compaction.rs @@ -0,0 +1,901 @@ +// This file is @generated by prost-build. +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ReportStatusRequest { + #[prost(string, tag = "1")] + pub node_id: ::prost::alloc::string::String, + #[prost(uint32, tag = "2")] + pub available_slots: u32, + #[prost(message, repeated, tag = "3")] + pub in_progress: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "4")] + pub successes: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "5")] + pub failures: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct CompactionInProgress { + #[prost(string, tag = "1")] + pub task_id: ::prost::alloc::string::String, + #[prost(message, optional, tag = "2")] + pub index_uid: ::core::option::Option, + #[prost(string, tag = "3")] + pub source_id: ::prost::alloc::string::String, + #[prost(string, repeated, tag = "4")] + pub split_ids: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct CompactionSuccess { + #[prost(string, tag = "1")] + pub task_id: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct CompactionFailure { + #[prost(string, tag = "1")] + pub task_id: ::prost::alloc::string::String, + #[prost(string, tag = "2")] + pub error_message: ::prost::alloc::string::String, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ReportStatusResponse { + #[prost(message, repeated, tag = "1")] + pub new_tasks: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct MergeTaskAssignment { + #[prost(string, tag = "1")] + pub task_id: ::prost::alloc::string::String, + #[prost(string, repeated, tag = "2")] + pub splits_metadata_json: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + #[prost(string, tag = "3")] + pub doc_mapping_json: ::prost::alloc::string::String, + #[prost(string, tag = "4")] + pub search_settings_json: ::prost::alloc::string::String, + #[prost(string, tag = "5")] + pub indexing_settings_json: ::prost::alloc::string::String, + #[prost(string, tag = "6")] + pub retention_policy_json: ::prost::alloc::string::String, + #[prost(message, optional, tag = "7")] + pub index_uid: ::core::option::Option, + #[prost(string, tag = "8")] + pub source_id: ::prost::alloc::string::String, + #[prost(string, tag = "9")] + pub index_storage_uri: ::prost::alloc::string::String, + #[prost(uint64, tag = "10")] + pub merge_level: u64, +} +/// BEGIN quickwit-codegen +#[allow(unused_imports)] +use std::str::FromStr; +use tower::{Layer, Service, ServiceExt}; +use quickwit_common::tower::RpcName; +impl RpcName for ReportStatusRequest { + fn rpc_name() -> &'static str { + "report_status" + } +} +#[cfg_attr(any(test, feature = "testsuite"), mockall::automock)] +#[async_trait::async_trait] +pub trait CompactionPlannerService: std::fmt::Debug + Send + Sync + 'static { + async fn report_status( + &self, + request: ReportStatusRequest, + ) -> crate::compaction::CompactionResult; +} +#[derive(Debug, Clone)] +pub struct CompactionPlannerServiceClient { + inner: InnerCompactionPlannerServiceClient, +} +#[derive(Debug, Clone)] +struct InnerCompactionPlannerServiceClient(std::sync::Arc); +impl CompactionPlannerServiceClient { + pub fn new(instance: T) -> Self + where + T: CompactionPlannerService, + { + #[cfg(any(test, feature = "testsuite"))] + assert!( + std::any::TypeId::of:: < T > () != std::any::TypeId::of:: < + MockCompactionPlannerService > (), + "`MockCompactionPlannerService` must be wrapped in a `MockCompactionPlannerServiceWrapper`: use `CompactionPlannerServiceClient::from_mock(mock)` to instantiate the client" + ); + Self { + inner: InnerCompactionPlannerServiceClient(std::sync::Arc::new(instance)), + } + } + pub fn as_grpc_service( + &self, + max_message_size: bytesize::ByteSize, + ) -> compaction_planner_service_grpc_server::CompactionPlannerServiceGrpcServer< + CompactionPlannerServiceGrpcServerAdapter, + > { + let adapter = CompactionPlannerServiceGrpcServerAdapter::new(self.clone()); + compaction_planner_service_grpc_server::CompactionPlannerServiceGrpcServer::new( + adapter, + ) + .accept_compressed(tonic::codec::CompressionEncoding::Gzip) + .accept_compressed(tonic::codec::CompressionEncoding::Zstd) + .send_compressed(tonic::codec::CompressionEncoding::Gzip) + .send_compressed(tonic::codec::CompressionEncoding::Zstd) + .max_decoding_message_size(max_message_size.0 as usize) + .max_encoding_message_size(max_message_size.0 as usize) + } + pub fn from_channel( + addr: std::net::SocketAddr, + channel: tonic::transport::Channel, + max_message_size: bytesize::ByteSize, + compression_encoding_opt: Option, + ) -> Self { + let (_, connection_keys_watcher) = tokio::sync::watch::channel( + std::collections::HashSet::from_iter([addr]), + ); + let mut client = compaction_planner_service_grpc_client::CompactionPlannerServiceGrpcClient::new( + channel, + ) + .max_decoding_message_size(max_message_size.0 as usize) + .max_encoding_message_size(max_message_size.0 as usize); + if let Some(compression_encoding) = compression_encoding_opt { + client = client + .accept_compressed(compression_encoding) + .send_compressed(compression_encoding); + } + let adapter = CompactionPlannerServiceGrpcClientAdapter::new( + client, + connection_keys_watcher, + ); + Self::new(adapter) + } + pub fn from_balance_channel( + balance_channel: quickwit_common::tower::BalanceChannel, + max_message_size: bytesize::ByteSize, + compression_encoding_opt: Option, + ) -> CompactionPlannerServiceClient { + let connection_keys_watcher = balance_channel.connection_keys_watcher(); + let mut client = compaction_planner_service_grpc_client::CompactionPlannerServiceGrpcClient::new( + balance_channel, + ) + .max_decoding_message_size(max_message_size.0 as usize) + .max_encoding_message_size(max_message_size.0 as usize); + if let Some(compression_encoding) = compression_encoding_opt { + client = client + .accept_compressed(compression_encoding) + .send_compressed(compression_encoding); + } + let adapter = CompactionPlannerServiceGrpcClientAdapter::new( + client, + connection_keys_watcher, + ); + Self::new(adapter) + } + pub fn from_mailbox(mailbox: quickwit_actors::Mailbox) -> Self + where + A: quickwit_actors::Actor + std::fmt::Debug + Send + 'static, + CompactionPlannerServiceMailbox: CompactionPlannerService, + { + CompactionPlannerServiceClient::new( + CompactionPlannerServiceMailbox::new(mailbox), + ) + } + pub fn tower() -> CompactionPlannerServiceTowerLayerStack { + CompactionPlannerServiceTowerLayerStack::default() + } + #[cfg(any(test, feature = "testsuite"))] + pub fn from_mock(mock: MockCompactionPlannerService) -> Self { + let mock_wrapper = mock_compaction_planner_service::MockCompactionPlannerServiceWrapper { + inner: tokio::sync::Mutex::new(mock), + }; + Self::new(mock_wrapper) + } + #[cfg(any(test, feature = "testsuite"))] + pub fn mocked() -> Self { + Self::from_mock(MockCompactionPlannerService::new()) + } +} +#[async_trait::async_trait] +impl CompactionPlannerService for CompactionPlannerServiceClient { + #[tracing::instrument(skip_all, name = "compaction.report_status")] + async fn report_status( + &self, + request: ReportStatusRequest, + ) -> crate::compaction::CompactionResult { + self.inner.0.report_status(request).await + } +} +#[cfg(any(test, feature = "testsuite"))] +pub mod mock_compaction_planner_service { + use super::*; + #[derive(Debug)] + pub struct MockCompactionPlannerServiceWrapper { + pub(super) inner: tokio::sync::Mutex, + } + #[async_trait::async_trait] + impl CompactionPlannerService for MockCompactionPlannerServiceWrapper { + async fn report_status( + &self, + request: super::ReportStatusRequest, + ) -> crate::compaction::CompactionResult { + self.inner.lock().await.report_status(request).await + } + } +} +pub type BoxFuture = std::pin::Pin< + Box> + Send + 'static>, +>; +impl tower::Service for InnerCompactionPlannerServiceClient { + type Response = ReportStatusResponse; + type Error = crate::compaction::CompactionError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: ReportStatusRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.report_status(request).await }; + Box::pin(fut) + } +} +/// A tower service stack is a set of tower services. +#[derive(Debug)] +struct CompactionPlannerServiceTowerServiceStack { + #[allow(dead_code)] + inner: InnerCompactionPlannerServiceClient, + report_status_svc: quickwit_common::tower::BoxService< + ReportStatusRequest, + ReportStatusResponse, + crate::compaction::CompactionError, + >, +} +#[async_trait::async_trait] +impl CompactionPlannerService for CompactionPlannerServiceTowerServiceStack { + async fn report_status( + &self, + request: ReportStatusRequest, + ) -> crate::compaction::CompactionResult { + self.report_status_svc.clone().ready().await?.call(request).await + } +} +type ReportStatusLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + ReportStatusRequest, + ReportStatusResponse, + crate::compaction::CompactionError, + >, + ReportStatusRequest, + ReportStatusResponse, + crate::compaction::CompactionError, +>; +#[derive(Debug, Default)] +pub struct CompactionPlannerServiceTowerLayerStack { + report_status_layers: Vec, +} +impl CompactionPlannerServiceTowerLayerStack { + pub fn stack_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + ReportStatusRequest, + ReportStatusResponse, + crate::compaction::CompactionError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + ReportStatusRequest, + Response = ReportStatusResponse, + Error = crate::compaction::CompactionError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service>::Future: Send + 'static, + { + self.report_status_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self + } + pub fn stack_report_status_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + ReportStatusRequest, + ReportStatusResponse, + crate::compaction::CompactionError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + ReportStatusRequest, + Response = ReportStatusResponse, + Error = crate::compaction::CompactionError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.report_status_layers.push(quickwit_common::tower::BoxLayer::new(layer)); + self + } + pub fn build(self, instance: T) -> CompactionPlannerServiceClient + where + T: CompactionPlannerService, + { + let inner_client = InnerCompactionPlannerServiceClient( + std::sync::Arc::new(instance), + ); + self.build_from_inner_client(inner_client) + } + pub fn build_from_channel( + self, + addr: std::net::SocketAddr, + channel: tonic::transport::Channel, + max_message_size: bytesize::ByteSize, + compression_encoding_opt: Option, + ) -> CompactionPlannerServiceClient { + let client = CompactionPlannerServiceClient::from_channel( + addr, + channel, + max_message_size, + compression_encoding_opt, + ); + let inner_client = client.inner; + self.build_from_inner_client(inner_client) + } + pub fn build_from_balance_channel( + self, + balance_channel: quickwit_common::tower::BalanceChannel, + max_message_size: bytesize::ByteSize, + compression_encoding_opt: Option, + ) -> CompactionPlannerServiceClient { + let client = CompactionPlannerServiceClient::from_balance_channel( + balance_channel, + max_message_size, + compression_encoding_opt, + ); + let inner_client = client.inner; + self.build_from_inner_client(inner_client) + } + pub fn build_from_mailbox( + self, + mailbox: quickwit_actors::Mailbox, + ) -> CompactionPlannerServiceClient + where + A: quickwit_actors::Actor + std::fmt::Debug + Send + 'static, + CompactionPlannerServiceMailbox: CompactionPlannerService, + { + let inner_client = InnerCompactionPlannerServiceClient( + std::sync::Arc::new(CompactionPlannerServiceMailbox::new(mailbox)), + ); + self.build_from_inner_client(inner_client) + } + #[cfg(any(test, feature = "testsuite"))] + pub fn build_from_mock( + self, + mock: MockCompactionPlannerService, + ) -> CompactionPlannerServiceClient { + let client = CompactionPlannerServiceClient::from_mock(mock); + let inner_client = client.inner; + self.build_from_inner_client(inner_client) + } + fn build_from_inner_client( + self, + inner_client: InnerCompactionPlannerServiceClient, + ) -> CompactionPlannerServiceClient { + let report_status_svc = self + .report_status_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); + let tower_svc_stack = CompactionPlannerServiceTowerServiceStack { + inner: inner_client, + report_status_svc, + }; + CompactionPlannerServiceClient::new(tower_svc_stack) + } +} +#[derive(Debug, Clone)] +struct MailboxAdapter { + inner: quickwit_actors::Mailbox, + phantom: std::marker::PhantomData, +} +impl std::ops::Deref for MailboxAdapter +where + A: quickwit_actors::Actor, +{ + type Target = quickwit_actors::Mailbox; + fn deref(&self) -> &Self::Target { + &self.inner + } +} +#[derive(Debug)] +pub struct CompactionPlannerServiceMailbox { + inner: MailboxAdapter, +} +impl CompactionPlannerServiceMailbox { + pub fn new(instance: quickwit_actors::Mailbox) -> Self { + let inner = MailboxAdapter { + inner: instance, + phantom: std::marker::PhantomData, + }; + Self { inner } + } +} +impl Clone for CompactionPlannerServiceMailbox { + fn clone(&self) -> Self { + let inner = MailboxAdapter { + inner: self.inner.clone(), + phantom: std::marker::PhantomData, + }; + Self { inner } + } +} +impl tower::Service for CompactionPlannerServiceMailbox +where + A: quickwit_actors::Actor + + quickwit_actors::DeferableReplyHandler> + Send + + 'static, + M: std::fmt::Debug + Send + 'static, + T: Send + 'static, + E: std::fmt::Debug + Send + 'static, + crate::compaction::CompactionError: From>, +{ + type Response = T; + type Error = crate::compaction::CompactionError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + //! This does not work with balance middlewares such as `tower::balance::pool::Pool` because + //! this always returns `Poll::Ready`. The fix is to acquire a permit from the + //! mailbox in `poll_ready` and consume it in `call`. + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, message: M) -> Self::Future { + let mailbox = self.inner.clone(); + let fut = async move { + mailbox.ask_for_res(message).await.map_err(|error| error.into()) + }; + Box::pin(fut) + } +} +#[async_trait::async_trait] +impl CompactionPlannerService for CompactionPlannerServiceMailbox +where + A: quickwit_actors::Actor + std::fmt::Debug, + CompactionPlannerServiceMailbox< + A, + >: tower::Service< + ReportStatusRequest, + Response = ReportStatusResponse, + Error = crate::compaction::CompactionError, + Future = BoxFuture, + >, +{ + async fn report_status( + &self, + request: ReportStatusRequest, + ) -> crate::compaction::CompactionResult { + self.clone().call(request).await + } +} +#[derive(Debug, Clone)] +pub struct CompactionPlannerServiceGrpcClientAdapter { + inner: T, + #[allow(dead_code)] + connection_addrs_rx: tokio::sync::watch::Receiver< + std::collections::HashSet, + >, +} +impl CompactionPlannerServiceGrpcClientAdapter { + pub fn new( + instance: T, + connection_addrs_rx: tokio::sync::watch::Receiver< + std::collections::HashSet, + >, + ) -> Self { + Self { + inner: instance, + connection_addrs_rx, + } + } +} +#[async_trait::async_trait] +impl CompactionPlannerService +for CompactionPlannerServiceGrpcClientAdapter< + compaction_planner_service_grpc_client::CompactionPlannerServiceGrpcClient, +> +where + T: tonic::client::GrpcService + std::fmt::Debug + Clone + Send + + Sync + 'static, + T::ResponseBody: tonic::codegen::Body + Send + 'static, + ::Error: Into + + Send, + T::Future: Send, +{ + async fn report_status( + &self, + request: ReportStatusRequest, + ) -> crate::compaction::CompactionResult { + let mut tonic_request = tonic::Request::new(request); + quickwit_common::tracing_utils::inject_current_context( + tonic_request.metadata_mut(), + ); + self.inner + .clone() + .report_status(tonic_request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + ReportStatusRequest::rpc_name(), + )) + } +} +#[derive(Debug)] +pub struct CompactionPlannerServiceGrpcServerAdapter { + inner: InnerCompactionPlannerServiceClient, +} +impl CompactionPlannerServiceGrpcServerAdapter { + pub fn new(instance: T) -> Self + where + T: CompactionPlannerService, + { + Self { + inner: InnerCompactionPlannerServiceClient(std::sync::Arc::new(instance)), + } + } +} +#[async_trait::async_trait] +impl compaction_planner_service_grpc_server::CompactionPlannerServiceGrpc +for CompactionPlannerServiceGrpcServerAdapter { + async fn report_status( + &self, + tonic_request: tonic::Request, + ) -> Result, tonic::Status> { + let parent_context = quickwit_common::tracing_utils::extract_context( + tonic_request.metadata(), + ); + let request = tonic_request.into_inner(); + let span = tracing::info_span!("compaction.report_status"); + let _ = ::set_parent( + &span, + parent_context, + ); + let fut = async move { + self.inner + .0 + .report_status(request) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + }; + <_ as tracing::Instrument>::instrument(fut, span).await + } +} +/// Generated client implementations. +pub mod compaction_planner_service_grpc_client { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + use tonic::codegen::http::Uri; + #[derive(Debug, Clone)] + pub struct CompactionPlannerServiceGrpcClient { + inner: tonic::client::Grpc, + } + impl CompactionPlannerServiceGrpcClient { + /// Attempt to create a new client by connecting to a given endpoint. + pub async fn connect(dst: D) -> Result + where + D: TryInto, + D::Error: Into, + { + let conn = tonic::transport::Endpoint::new(dst)?.connect().await?; + Ok(Self::new(conn)) + } + } + impl CompactionPlannerServiceGrpcClient + where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: Body + std::marker::Send + 'static, + ::Error: Into + std::marker::Send, + { + pub fn new(inner: T) -> Self { + let inner = tonic::client::Grpc::new(inner); + Self { inner } + } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> CompactionPlannerServiceGrpcClient> + where + F: tonic::service::Interceptor, + T::ResponseBody: Default, + T: tonic::codegen::Service< + http::Request, + Response = http::Response< + >::ResponseBody, + >, + >, + , + >>::Error: Into + std::marker::Send + std::marker::Sync, + { + CompactionPlannerServiceGrpcClient::new( + InterceptedService::new(inner, interceptor), + ) + } + /// Compress requests with the given encoding. + /// + /// This requires the server to support it otherwise it might respond with an + /// error. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); + self + } + /// Enable decompressing responses. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_decoding_message_size(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.inner = self.inner.max_encoding_message_size(limit); + self + } + pub async fn report_status( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.compaction.CompactionPlannerService/ReportStatus", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.compaction.CompactionPlannerService", + "ReportStatus", + ), + ); + self.inner.unary(req, path, codec).await + } + } +} +/// Generated server implementations. +pub mod compaction_planner_service_grpc_server { + #![allow( + unused_variables, + dead_code, + missing_docs, + clippy::wildcard_imports, + clippy::let_unit_value, + )] + use tonic::codegen::*; + /// Generated trait containing gRPC methods that should be implemented for use with CompactionPlannerServiceGrpcServer. + #[async_trait] + pub trait CompactionPlannerServiceGrpc: std::marker::Send + std::marker::Sync + 'static { + async fn report_status( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; + } + #[derive(Debug)] + pub struct CompactionPlannerServiceGrpcServer { + inner: Arc, + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, + max_decoding_message_size: Option, + max_encoding_message_size: Option, + } + impl CompactionPlannerServiceGrpcServer { + pub fn new(inner: T) -> Self { + Self::from_arc(Arc::new(inner)) + } + pub fn from_arc(inner: Arc) -> Self { + Self { + inner, + accept_compression_encodings: Default::default(), + send_compression_encodings: Default::default(), + max_decoding_message_size: None, + max_encoding_message_size: None, + } + } + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService + where + F: tonic::service::Interceptor, + { + InterceptedService::new(Self::new(inner), interceptor) + } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. + #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } + /// Limits the maximum size of a decoded message. + /// + /// Default: `4MB` + #[must_use] + pub fn max_decoding_message_size(mut self, limit: usize) -> Self { + self.max_decoding_message_size = Some(limit); + self + } + /// Limits the maximum size of an encoded message. + /// + /// Default: `usize::MAX` + #[must_use] + pub fn max_encoding_message_size(mut self, limit: usize) -> Self { + self.max_encoding_message_size = Some(limit); + self + } + } + impl tonic::codegen::Service> + for CompactionPlannerServiceGrpcServer + where + T: CompactionPlannerServiceGrpc, + B: Body + std::marker::Send + 'static, + B::Error: Into + std::marker::Send + 'static, + { + type Response = http::Response; + type Error = std::convert::Infallible; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + fn call(&mut self, req: http::Request) -> Self::Future { + match req.uri().path() { + "/quickwit.compaction.CompactionPlannerService/ReportStatus" => { + #[allow(non_camel_case_types)] + struct ReportStatusSvc(pub Arc); + impl< + T: CompactionPlannerServiceGrpc, + > tonic::server::UnaryService + for ReportStatusSvc { + type Response = super::ReportStatusResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::report_status( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = ReportStatusSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } + _ => { + Box::pin(async move { + let mut response = http::Response::new( + tonic::body::Body::default(), + ); + let headers = response.headers_mut(); + headers + .insert( + tonic::Status::GRPC_STATUS, + (tonic::Code::Unimplemented as i32).into(), + ); + headers + .insert( + http::header::CONTENT_TYPE, + tonic::metadata::GRPC_CONTENT_TYPE, + ); + Ok(response) + }) + } + } + } + } + impl Clone for CompactionPlannerServiceGrpcServer { + fn clone(&self) -> Self { + let inner = self.inner.clone(); + Self { + inner, + accept_compression_encodings: self.accept_compression_encodings, + send_compression_encodings: self.send_compression_encodings, + max_decoding_message_size: self.max_decoding_message_size, + max_encoding_message_size: self.max_encoding_message_size, + } + } + } + /// Generated gRPC service name + pub const SERVICE_NAME: &str = "quickwit.compaction.CompactionPlannerService"; + impl tonic::server::NamedService for CompactionPlannerServiceGrpcServer { + const NAME: &'static str = SERVICE_NAME; + } +} diff --git a/quickwit/quickwit-proto/src/compaction/mod.rs b/quickwit/quickwit-proto/src/compaction/mod.rs new file mode 100644 index 00000000000..0eee7adb6b0 --- /dev/null +++ b/quickwit/quickwit-proto/src/compaction/mod.rs @@ -0,0 +1,91 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use quickwit_actors::AskError; +use quickwit_common::rate_limited_error; +use quickwit_common::tower::MakeLoadShedError; +use serde::{Deserialize, Serialize}; + +use crate::GrpcServiceError; +use crate::error::{ServiceError, ServiceErrorCode}; + +include!("../codegen/quickwit/quickwit.compaction.rs"); + +pub const COMPACTION_FILE_DESCRIPTOR_SET: &[u8] = + include_bytes!("../codegen/quickwit/compaction_descriptor.bin"); + +pub type CompactionResult = std::result::Result; + +#[derive(Debug, thiserror::Error, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum CompactionError { + #[error("internal error: {0}")] + Internal(String), + #[error("request timed out: {0}")] + Timeout(String), + #[error("too many requests")] + TooManyRequests, + #[error("service unavailable: {0}")] + Unavailable(String), +} + +impl ServiceError for CompactionError { + fn error_code(&self) -> ServiceErrorCode { + match self { + Self::Internal(err_msg) => { + rate_limited_error!(limit_per_min = 6, "compaction error: {err_msg}"); + ServiceErrorCode::Internal + } + Self::Timeout(_) => ServiceErrorCode::Timeout, + Self::TooManyRequests => ServiceErrorCode::TooManyRequests, + Self::Unavailable(_) => ServiceErrorCode::Unavailable, + } + } +} + +// Required by the codegen tower layers. All four constructors are mandatory. +impl GrpcServiceError for CompactionError { + fn new_internal(message: String) -> Self { + Self::Internal(message) + } + fn new_timeout(message: String) -> Self { + Self::Timeout(message) + } + fn new_too_many_requests() -> Self { + Self::TooManyRequests + } + fn new_unavailable(message: String) -> Self { + Self::Unavailable(message) + } +} + +impl MakeLoadShedError for CompactionError { + fn make_load_shed_error() -> Self { + CompactionError::TooManyRequests + } +} + +impl From> for CompactionError { + fn from(error: AskError) -> Self { + match error { + AskError::ErrorReply(error) => error, + AskError::MessageNotDelivered => { + Self::new_unavailable("request could not be delivered to actor".to_string()) + } + AskError::ProcessMessageError => { + Self::new_internal("an error occurred while processing the request".to_string()) + } + } + } +} diff --git a/quickwit/quickwit-proto/src/indexing/mod.rs b/quickwit/quickwit-proto/src/indexing/mod.rs index bf0d147b685..281bc73e7df 100644 --- a/quickwit/quickwit-proto/src/indexing/mod.rs +++ b/quickwit/quickwit-proto/src/indexing/mod.rs @@ -138,6 +138,7 @@ impl Display for IndexingPipelineId { /// Uniquely identifies a merge pipeline. There exists at most one merge pipeline per /// `(index_uid, source_id)` running on indexer at any given time fed by one or more indexing /// pipelines. +/// TODO: Rework/remove this as part of splitting up merges. #[derive(Clone, Debug, Hash, Eq, PartialEq)] pub struct MergePipelineId { pub node_id: NodeId, diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index 428387c6353..fb21c378a18 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -21,6 +21,7 @@ use std::cmp::Ordering; pub mod cluster; +pub mod compaction; pub mod control_plane; pub use bytes; pub use tonic; diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 8bf45a20aba..6e78cf48c2b 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -41,7 +41,7 @@ use quickwit_query::query_ast::{ }; use quickwit_query::tokenizers::TokenizerManager; use quickwit_storage::{ - BundleStorage, ByteRangeCache, CountingStorage, MemorySizedCache, OwnedBytes, SplitCache, + BundleStorage, ByteRangeCache, CountingStorage, MemorySizedCache, OwnedBytes, SearchSplitCache, Storage, StorageResolver, TimeoutAndRetryStorage, wrap_storage_with_cache, }; use tantivy::aggregation::AggContextParams; @@ -174,7 +174,7 @@ pub(crate) async fn open_split_bundle( // This is before the bundle storage: at this point, this storage is reading `.split` files. let index_storage_with_split_cache = if let Some(split_cache) = searcher_context.split_cache_opt.as_ref() { - SplitCache::wrap_storage(split_cache.clone(), index_storage.clone()) + SearchSplitCache::wrap_storage(split_cache.clone(), index_storage.clone()) } else { index_storage.clone() }; diff --git a/quickwit/quickwit-search/src/service.rs b/quickwit/quickwit-search/src/service.rs index 930ae5096fb..2d340a0dd16 100644 --- a/quickwit/quickwit-search/src/service.rs +++ b/quickwit/quickwit-search/src/service.rs @@ -29,7 +29,7 @@ use quickwit_proto::search::{ SearchPlanResponse, SearchRequest, SearchResponse, SnippetRequest, }; use quickwit_storage::{ - MemorySizedCache, QuickwitCache, SplitCache, StorageCache, StorageResolver, + MemorySizedCache, QuickwitCache, SearchSplitCache, StorageCache, StorageResolver, }; use tantivy::aggregation::AggregationLimitsGuard; @@ -417,7 +417,7 @@ pub struct SearcherContext { /// Per-split and per-predicate cache. pub predicate_cache: Arc, /// Search split cache. `None` if no split cache is configured. - pub split_cache_opt: Option>, + pub split_cache_opt: Option>, /// List fields cache. Caches the raw fields-metadata blob for a given split. pub list_fields_cache: ListFieldsCache, /// The aggregation limits are passed to limit the memory usage. @@ -446,7 +446,7 @@ impl SearcherContext { /// Creates a new searcher context without a lambda invoker. pub fn new_without_invoker( searcher_config: SearcherConfig, - split_cache_opt: Option>, + split_cache_opt: Option>, ) -> Self { Self::new( searcher_config, @@ -458,7 +458,7 @@ impl SearcherContext { /// Creates a new searcher context, given a searcher config, and an optional `SplitCache`. pub fn new( searcher_config: SearcherConfig, - split_cache_opt: Option>, + split_cache_opt: Option>, lambda_invoker: Option, ) -> Self { let global_split_footer_cache = MemorySizedCache::from_config( diff --git a/quickwit/quickwit-serve/Cargo.toml b/quickwit/quickwit-serve/Cargo.toml index bd3ee2006a0..0d892796fef 100644 --- a/quickwit/quickwit-serve/Cargo.toml +++ b/quickwit/quickwit-serve/Cargo.toml @@ -68,6 +68,7 @@ quickwit-datafusion = { workspace = true, optional = true } quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } +quickwit-compaction = { workspace = true } quickwit-ingest = { workspace = true } quickwit-jaeger = { workspace = true } quickwit-janitor = { workspace = true } diff --git a/quickwit/quickwit-serve/src/grpc.rs b/quickwit/quickwit-serve/src/grpc.rs index e7bf1245792..5107ad8ea79 100644 --- a/quickwit/quickwit-serve/src/grpc.rs +++ b/quickwit/quickwit-serve/src/grpc.rs @@ -141,6 +141,21 @@ pub(crate) async fn start_grpc_server( None }; + // Mount gRPC compaction service if this node is a janitor with the compaction service enabled. + let compaction_grpc_service = if services + .node_config + .is_service_enabled(QuickwitService::Janitor) + { + if let Some(compaction_service) = &services.compaction_service_client_opt { + enabled_grpc_services.insert("compaction"); + file_descriptor_sets.push(quickwit_proto::compaction::COMPACTION_FILE_DESCRIPTOR_SET); + Some(compaction_service.as_grpc_service(grpc_config.max_message_size)) + } else { + None + } + } else { + None + }; // Mount gRPC control plane service if `QuickwitService::ControlPlane` is enabled on node. let control_plane_grpc_service = if services .node_config @@ -250,6 +265,7 @@ pub(crate) async fn start_grpc_server( .add_service(developer_grpc_service) .add_service(health_service) .add_service(reflection_service) + .add_optional_service(compaction_grpc_service) .add_optional_service(control_plane_grpc_service) .add_optional_service(indexing_grpc_service) .add_optional_service(ingest_api_grpc_service) diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 04ab93cf2e7..f2c9c5659b3 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -59,7 +59,7 @@ pub(crate) use decompression::Body; pub use format::BodyFormat; use futures::StreamExt; use itertools::Itertools; -use quickwit_actors::{ActorExitStatus, Mailbox, SpawnContext, Universe}; +use quickwit_actors::{ActorExitStatus, ActorHandle, Mailbox, SpawnContext, Universe}; use quickwit_cluster::{ Cluster, ClusterChange, ClusterChangeStream, ClusterNode, ListenerHandle, start_cluster_service, }; @@ -74,14 +74,16 @@ use quickwit_common::tower::{ }; use quickwit_common::uri::Uri; use quickwit_common::{get_bool_from_env, spawn_named_task}; +use quickwit_compaction::planner::CompactionPlanner; +use quickwit_compaction::{CompactorSupervisor, start_compactor_service}; use quickwit_config::service::QuickwitService; use quickwit_config::{ClusterConfig, IngestApiConfig, NodeConfig}; use quickwit_control_plane::control_plane::{ControlPlane, ControlPlaneEventSubscriber}; use quickwit_control_plane::{IndexerNodeInfo, IndexerPool}; use quickwit_index_management::{IndexService as IndexManager, IndexServiceError}; -use quickwit_indexing::actors::IndexingService; +use quickwit_indexing::actors::{IndexingService, MergeSchedulerService}; use quickwit_indexing::models::ShardPositionsService; -use quickwit_indexing::start_indexing_service; +use quickwit_indexing::{IndexingSplitCache, start_indexing_service}; use quickwit_ingest::{ GetMemoryCapacity, IngestRequest, IngestRouter, IngestServiceClient, Ingester, IngesterPool, IngesterPoolEntry, LocalShardsUpdate, get_idle_shard_timeout, @@ -95,6 +97,7 @@ use quickwit_metastore::{ ControlPlaneMetastore, ListIndexesMetadataResponseExt, MetastoreResolver, }; use quickwit_opentelemetry::otlp::{OtlpGrpcLogsService, OtlpGrpcTracesService}; +use quickwit_proto::compaction::CompactionPlannerServiceClient; use quickwit_proto::control_plane::ControlPlaneServiceClient; use quickwit_proto::indexing::{IndexingServiceClient, ShardPositionsUpdate}; use quickwit_proto::ingest::ingester::{ @@ -113,7 +116,7 @@ use quickwit_search::{ SearchJobPlacer, SearchService, SearchServiceClient, SearcherContext, SearcherPool, create_search_client_from_channel, start_searcher_service, }; -use quickwit_storage::{SplitCache, StorageResolver}; +use quickwit_storage::{SearchSplitCache, StorageResolver}; pub use quickwit_telemetry_exporters::{EnvFilterReloadFn, do_nothing_env_filter_reload_fn}; use tcp_listener::TcpListenerResolver; use tokio::sync::oneshot; @@ -140,6 +143,12 @@ const READINESS_REPORTING_INTERVAL: Duration = if cfg!(any(test, feature = "test Duration::from_secs(10) }; +const COMPACTION_SERVICE_DISCOVERY_TIMEOUT: Duration = if cfg!(any(test, feature = "testsuite")) { + Duration::from_millis(100) +} else { + Duration::from_secs(300) +}; + const METASTORE_CLIENT_MAX_CONCURRENCY_ENV_KEY: &str = "QW_METASTORE_CLIENT_MAX_CONCURRENCY"; const DEFAULT_METASTORE_CLIENT_MAX_CONCURRENCY: usize = 6; const DISABLE_DELETE_TASK_SERVICE_ENV_KEY: &str = "QW_DISABLE_DELETE_TASK_SERVICE"; @@ -192,6 +201,8 @@ struct QuickwitServices { pub ingest_router_service: IngestRouterServiceClient, ingester_opt: Option, + pub compaction_service_client_opt: Option, + pub _compactor_supervisor_opt: Option>, pub janitor_service_opt: Option>, pub jaeger_service_opt: Option, pub otlp_logs_service_opt: Option, @@ -265,6 +276,71 @@ async fn balance_channel_for_service( BalanceChannel::from_stream(service_change_stream) } +/// Builds a `CompactionPlannerServiceClient` if standalone compactors are enabled +/// and the node runs the janitor or compactor. +/// +/// On janitor nodes, spawns a `CompactionPlanner` actor and builds the client from +/// its mailbox. On compactor-only nodes, connects to a remote janitor via gRPC. +/// +/// The second tuple element is the local planner's `ActorHandle`, returned only +/// on janitor nodes so the caller can attach it to the janitor liveness probe. +async fn get_compaction_planner_client_if_needed( + node_config: &NodeConfig, + cluster: &Cluster, + universe: &Universe, + metastore_client: &MetastoreServiceClient, +) -> anyhow::Result<( + Option, + Option>, +)> { + if !node_config.indexer_config.enable_standalone_compactors { + return Ok((None, None)); + } + let is_janitor = node_config.is_service_enabled(QuickwitService::Janitor); + let is_compactor = node_config.is_service_enabled(QuickwitService::Compactor); + if !is_janitor && !is_compactor { + return Ok((None, None)); + } + if is_janitor { + let planner = CompactionPlanner::new(metastore_client.clone()); + let (mailbox, handle) = universe.spawn_builder().spawn(planner); + info!("compaction planner actor started on janitor node"); + return Ok(( + Some(CompactionPlannerServiceClient::from_mailbox(mailbox)), + Some(handle), + )); + } + // Compactor-only node: connect to the planner on a remote janitor. + let balance_channel = balance_channel_for_service(cluster, QuickwitService::Janitor).await; + let found = balance_channel + .wait_for(COMPACTION_SERVICE_DISCOVERY_TIMEOUT, |connections| { + !connections.is_empty() + }) + .await; + if !found { + bail!("compactor is enabled but no janitor node was found in the cluster") + } + info!("remote compaction planner detected on janitor node"); + Ok(( + Some(CompactionPlannerServiceClient::from_balance_channel( + balance_channel, + node_config.grpc_config.max_message_size, + None, + )), + None, + )) +} + +fn spawn_merge_scheduler_service( + universe: &Universe, + node_config: &NodeConfig, +) -> Mailbox { + let (mailbox, _) = universe.spawn_builder().spawn(MergeSchedulerService::new( + node_config.indexer_config.merge_concurrency.get(), + )); + mailbox +} + async fn start_ingest_client_if_needed( node_config: &NodeConfig, universe: &Universe, @@ -535,7 +611,7 @@ pub async fn serve_quickwit( // Set up the "control plane proxy" for the metastore. let metastore_through_control_plane = MetastoreServiceClient::new(ControlPlaneMetastore::new( control_plane_client.clone(), - metastore_client, + metastore_client.clone(), )); // Setup ingest service v1. @@ -543,7 +619,44 @@ pub async fn serve_quickwit( .await .context("failed to start ingest v1 service")?; + let (compaction_service_client_opt, compaction_planner_handle_opt) = + get_compaction_planner_client_if_needed( + &node_config, + &cluster, + &universe, + &metastore_client, + ) + .await + .context("failed to initialize compaction service client")?; + + // In the case where a compactor and indexer run on the same node (in a single node deployment, + // for example), they should share the split cache so that downloads/new splits end up in + // predictable locations. If there's only one service enabled, no harm, no foul. + let indexing_split_cache: Arc = if node_config + .is_service_enabled(QuickwitService::Indexer) + && node_config.is_service_enabled(QuickwitService::Compactor) + { + let cache = IndexingSplitCache::from_config( + &node_config.indexer_config, + &node_config.data_dir_path, + ) + .await?; + Arc::new(cache) + } else { + Arc::new(IndexingSplitCache::no_caching()) + }; + let indexing_service_opt = if node_config.is_service_enabled(QuickwitService::Indexer) { + // if standalone compactors is enabled, indexing pipelines don't perform any merges. + // if standalone compactors is disabled, indexing pipelines perform all merges as before. + let merge_scheduler_mailbox_opt = + if !node_config.indexer_config.enable_standalone_compactors { + Some(spawn_merge_scheduler_service(&universe, &node_config)) + } else { + None + }; + + let split_cache = indexing_split_cache.clone(); let indexing_service = start_indexing_service( &universe, &node_config, @@ -553,6 +666,8 @@ pub async fn serve_quickwit( ingester_pool.clone(), storage_resolver.clone(), event_broker.clone(), + merge_scheduler_mailbox_opt, + split_cache, ) .await .context("failed to start indexing service")?; @@ -623,15 +738,15 @@ pub async fn serve_quickwit( } } - let split_cache_opt: Option> = + let search_split_cache_opt: Option> = if let Some(split_cache_limits) = node_config.searcher_config.split_cache { - let split_cache = SplitCache::with_root_path( + let search_split_cache = SearchSplitCache::with_root_path( node_config.data_dir_path.join("searcher-split-cache"), storage_resolver.clone(), split_cache_limits, ) .context("failed to load searcher split cache")?; - Some(split_cache) + Some(search_split_cache) } else { None }; @@ -647,7 +762,7 @@ pub async fn serve_quickwit( quickwit_lambda_client::try_get_or_deploy_invoker(lambda_config).await?; Arc::new(SearcherContext::new( node_config.searcher_config.clone(), - split_cache_opt, + search_split_cache_opt, Some(invoker), )) } @@ -659,13 +774,13 @@ pub async fn serve_quickwit( } else { Arc::new(SearcherContext::new_without_invoker( node_config.searcher_config.clone(), - split_cache_opt, + search_split_cache_opt, )) } } else { Arc::new(SearcherContext::new_without_invoker( node_config.searcher_config.clone(), - split_cache_opt, + search_split_cache_opt, )) }; @@ -727,6 +842,7 @@ pub async fn serve_quickwit( storage_resolver.clone(), event_broker.clone(), !get_bool_from_env(DISABLE_DELETE_TASK_SERVICE_ENV_KEY, false), + compaction_planner_handle_opt, ) .await .context("failed to start janitor service")?; @@ -735,6 +851,36 @@ pub async fn serve_quickwit( None }; + let compactor_supervisor_opt = if node_config.is_service_enabled(QuickwitService::Compactor) + && node_config.indexer_config.enable_standalone_compactors + { + let compaction_dir = node_config.data_dir_path.join("compaction"); + fs::create_dir_all(&compaction_dir)?; + let compaction_root_directory = quickwit_common::temp_dir::Builder::default() + .tempdir_in(&compaction_dir) + .context("failed to create compaction temp directory")?; + let compaction_client = compaction_service_client_opt + .clone() + .expect("compactor service enabled but no compaction client available"); + let split_cache = indexing_split_cache.clone(); + let compactor_mailbox = start_compactor_service( + &universe, + cluster.self_node_id().into(), + compaction_client, + &node_config.compactor_config, + metastore_client.clone(), + storage_resolver.clone(), + split_cache, + event_broker.clone(), + compaction_root_directory, + ) + .await + .context("failed to start compactor service")?; + Some(compactor_mailbox) + } else { + None + }; + let jaeger_service_opt = if node_config.jaeger_config.enable_endpoint && node_config.is_service_enabled(QuickwitService::Searcher) { @@ -783,6 +929,8 @@ pub async fn serve_quickwit( ingest_router_service, ingest_service, ingester_opt: ingester_opt.clone(), + compaction_service_client_opt, + _compactor_supervisor_opt: compactor_supervisor_opt, janitor_service_opt, jaeger_service_opt, otlp_logs_service_opt, @@ -1833,4 +1981,84 @@ mod tests { assert!(ingester_pool.is_empty()); } + + #[tokio::test] + async fn test_compaction_service_on_janitor_node() { + let transport = ChannelTransport::default(); + let cluster = + create_cluster_for_test(Vec::new(), &["janitor", "indexer"], &transport, true) + .await + .unwrap(); + let universe = Universe::new(); + let metastore = MetastoreServiceClient::from_mock(MockMetastoreService::new()); + + // Janitor + indexer with standalone compactors enabled: planner client is returned. + let mut node_config = NodeConfig::for_test(); + node_config.indexer_config.enable_standalone_compactors = true; + node_config.enabled_services = + HashSet::from([QuickwitService::Janitor, QuickwitService::Indexer]); + let (client_opt, handle_opt) = + get_compaction_planner_client_if_needed(&node_config, &cluster, &universe, &metastore) + .await + .unwrap(); + assert!(client_opt.is_some()); + assert!(handle_opt.is_some()); + + // With compactor + janitor enabled, planner client is also returned. + node_config.enabled_services = HashSet::from([ + QuickwitService::Janitor, + QuickwitService::Indexer, + QuickwitService::Compactor, + ]); + let (client_opt, handle_opt) = + get_compaction_planner_client_if_needed(&node_config, &cluster, &universe, &metastore) + .await + .unwrap(); + assert!(client_opt.is_some()); + assert!(handle_opt.is_some()); + + // Neither janitor nor compactor: no client, no handle. + node_config.enabled_services = HashSet::from([QuickwitService::Indexer]); + let (client_opt, handle_opt) = + get_compaction_planner_client_if_needed(&node_config, &cluster, &universe, &metastore) + .await + .unwrap(); + assert!(client_opt.is_none()); + assert!(handle_opt.is_none()); + + // Standalone compactors disabled: short-circuit returns (None, None) regardless of + // which services are enabled. + node_config.indexer_config.enable_standalone_compactors = false; + node_config.enabled_services = + HashSet::from([QuickwitService::Janitor, QuickwitService::Indexer]); + let (client_opt, handle_opt) = + get_compaction_planner_client_if_needed(&node_config, &cluster, &universe, &metastore) + .await + .unwrap(); + assert!(client_opt.is_none()); + assert!(handle_opt.is_none()); + + universe.assert_quit().await; + } + + #[tokio::test] + async fn test_compaction_service_returns_error_when_no_janitor() { + let transport = ChannelTransport::default(); + let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, false) + .await + .unwrap(); + let universe = Universe::new(); + let metastore = MetastoreServiceClient::from_mock(MockMetastoreService::new()); + + let mut node_config = NodeConfig::for_test(); + node_config.indexer_config.enable_standalone_compactors = true; + node_config.enabled_services = + HashSet::from([QuickwitService::Indexer, QuickwitService::Compactor]); + let result = + get_compaction_planner_client_if_needed(&node_config, &cluster, &universe, &metastore) + .await; + assert!(result.is_err()); + + universe.assert_quit().await; + } } diff --git a/quickwit/quickwit-serve/src/rest.rs b/quickwit/quickwit-serve/src/rest.rs index 082bdf6d3ad..38c16b38a85 100644 --- a/quickwit/quickwit-serve/src/rest.rs +++ b/quickwit/quickwit-serve/src/rest.rs @@ -859,6 +859,7 @@ mod tests { _report_splits_subscription_handle_opt: None, _local_shards_update_listener_handle_opt: None, cluster, + compaction_service_client_opt: None, control_plane_server_opt: None, control_plane_client, indexing_service_opt: None, @@ -875,6 +876,7 @@ mod tests { node_config: Arc::new(node_config.clone()), search_service: Arc::new(MockSearchService::new()), jaeger_service_opt: None, + _compactor_supervisor_opt: None, env_filter_reload_fn: crate::do_nothing_env_filter_reload_fn(), #[cfg(feature = "datafusion")] datafusion_session_builder: None, diff --git a/quickwit/quickwit-storage/src/lib.rs b/quickwit/quickwit-storage/src/lib.rs index 5f2c6057b5c..2a6338fc33c 100644 --- a/quickwit/quickwit-storage/src/lib.rs +++ b/quickwit/quickwit-storage/src/lib.rs @@ -56,7 +56,7 @@ mod storage_resolver; mod versioned_component; use quickwit_common::uri::Uri; -pub use split_cache::SplitCache; +pub use split_cache::SearchSplitCache; pub use tantivy::directory::OwnedBytes; pub use versioned_component::VersionedComponent; diff --git a/quickwit/quickwit-storage/src/metrics.rs b/quickwit/quickwit-storage/src/metrics.rs index 520d5bd2bcf..34d199b470c 100644 --- a/quickwit/quickwit-storage/src/metrics.rs +++ b/quickwit/quickwit-storage/src/metrics.rs @@ -25,6 +25,10 @@ use quickwit_metrics::{ }; const ACTION_DELETE_OBJECT: Labels<1> = labels!("action" => "delete_object"); +const ACTION_DELETE_OBJECTS: Labels<1> = labels!("action" => "delete_objects"); +const ACTION_GET_OBJECT: Labels<1> = labels!("action" => "get_object"); +const ACTION_PUT_OBJECT: Labels<1> = labels!("action" => "put_object"); +const ACTION_UPLOAD_PART: Labels<1> = labels!("action" => "upload_part"); const OUTCOME_NAME: LabelNames<1> = label_names!("outcome"); const COMPONENT_NAME: LabelNames<1> = label_names!("component_name"); const COMPONENT_CAPACITY_POLICY: LabelNames<3> = @@ -73,13 +77,22 @@ pub(crate) static OBJECT_STORAGE_DELETE_REQUESTS_TOTAL: LazyCounter = lazy_counter!(parent: OBJECT_STORAGE_REQUESTS_TOTAL, labels: [ACTION_DELETE_OBJECT]); pub(crate) static OBJECT_STORAGE_BULK_DELETE_REQUESTS_TOTAL: LazyCounter = - lazy_counter!(parent: OBJECT_STORAGE_REQUESTS_TOTAL, labels: [ACTION_DELETE_OBJECT]); + lazy_counter!(parent: OBJECT_STORAGE_REQUESTS_TOTAL, labels: [ACTION_DELETE_OBJECTS]); pub(crate) static OBJECT_STORAGE_DELETE_REQUEST_DURATION: LazyHistogram = lazy_histogram!(parent: OBJECT_STORAGE_REQUEST_DURATION, labels: [ACTION_DELETE_OBJECT]); pub(crate) static OBJECT_STORAGE_BULK_DELETE_REQUEST_DURATION: LazyHistogram = - lazy_histogram!(parent: OBJECT_STORAGE_REQUEST_DURATION, labels: [ACTION_DELETE_OBJECT]); + lazy_histogram!(parent: OBJECT_STORAGE_REQUEST_DURATION, labels: [ACTION_DELETE_OBJECTS]); + +pub(crate) static OBJECT_STORAGE_GET_OBJECT_DURATION: LazyHistogram = + lazy_histogram!(parent: OBJECT_STORAGE_REQUEST_DURATION, labels: [ACTION_GET_OBJECT]); + +pub(crate) static OBJECT_STORAGE_PUT_OBJECT_DURATION: LazyHistogram = + lazy_histogram!(parent: OBJECT_STORAGE_REQUEST_DURATION, labels: [ACTION_PUT_OBJECT]); + +pub(crate) static OBJECT_STORAGE_UPLOAD_PART_DURATION: LazyHistogram = + lazy_histogram!(parent: OBJECT_STORAGE_REQUEST_DURATION, labels: [ACTION_UPLOAD_PART]); pub(crate) static OBJECT_STORAGE_GET_TOTAL: LazyCounter = lazy_counter!( name: "object_storage_gets_total", diff --git a/quickwit/quickwit-storage/src/object_storage/azure_blob_storage.rs b/quickwit/quickwit-storage/src/object_storage/azure_blob_storage.rs index 0a7cb969639..d8c0d0346f4 100644 --- a/quickwit/quickwit-storage/src/object_storage/azure_blob_storage.rs +++ b/quickwit/quickwit-storage/src/object_storage/azure_blob_storage.rs @@ -27,6 +27,7 @@ use azure_storage::prelude::*; use azure_storage_blobs::blob::operations::GetBlobResponse; use azure_storage_blobs::prelude::*; use bytes::{Bytes, BytesMut}; +use bytesize::ByteSize; use futures::io::Error as FutureError; use futures::stream::{StreamExt, TryStreamExt}; use md5::Digest; @@ -34,6 +35,7 @@ use quickwit_common::retry::{RetryParams, Retryable, retry}; use quickwit_common::uri::Uri; use quickwit_common::{chunk_range, ignore_error_kind, into_u64_range}; use quickwit_config::{AzureStorageConfig, StorageBackend}; +use quickwit_metrics::HistogramTimer; use regex::Regex; use tantivy::directory::OwnedBytes; use thiserror::Error; @@ -110,10 +112,11 @@ impl AzureBlobStorage { multipart_policy: MultiPartPolicy { // Azure max part size is 100MB // https://azure.microsoft.com/en-us/blog/general-availability-larger-block-blobs-in-azure-storage/ - target_part_num_bytes: 100_000_000, - multipart_threshold_num_bytes: 100_000_000, + target_part_num_bytes: ByteSize::mb(100), + multipart_threshold_num_bytes: ByteSize::mb(100), max_num_parts: 50_000, // Azure allows up to 50,000 blocks - max_object_num_bytes: 4_770_000_000_000u64, // Azure allows up to 4.77TB objects + max_object_num_bytes: ByteSize::b(4_770_000_000_000), /* Azure allows up to + * 4.77TB objects */ max_concurrent_uploads: 100, }, retry_params: RetryParams::aggressive(), @@ -213,6 +216,7 @@ impl AzureBlobStorage { let name = self.blob_name(path); let capacity = range_opt.as_ref().map(Range::len).unwrap_or(0); retry(&self.retry_params, || async { + let _timer = HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_GET_OBJECT_DURATION); let (mut response_stream, _in_flight_guards) = if let Some(range) = range_opt.as_ref() { let stream = self .container_client @@ -242,6 +246,7 @@ impl AzureBlobStorage { ) -> StorageResult<()> { crate::metrics::OBJECT_STORAGE_PUT_PARTS.inc(); crate::metrics::OBJECT_STORAGE_UPLOAD_NUM_BYTES.inc_by(payload.len()); + let _timer = HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_PUT_OBJECT_DURATION); retry(&self.retry_params, || async { let data = Bytes::from(payload.read_all().await?.to_vec()); let hash = azure_storage_blobs::prelude::Hash::from(md5::compute(&data[..]).0); @@ -277,6 +282,8 @@ impl AzureBlobStorage { crate::metrics::OBJECT_STORAGE_PUT_PARTS.inc(); crate::metrics::OBJECT_STORAGE_UPLOAD_NUM_BYTES.inc_by(range.end - range.start); async move { + let _timer = + HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_UPLOAD_PART_DURATION); retry(&self.retry_params, || async { // zero pad block ids to make them sortable as strings let block_id = format!("block:{:05}", num); diff --git a/quickwit/quickwit-storage/src/object_storage/policy.rs b/quickwit/quickwit-storage/src/object_storage/policy.rs index 6ce48ab7a94..bfb6baf3f12 100644 --- a/quickwit/quickwit-storage/src/object_storage/policy.rs +++ b/quickwit/quickwit-storage/src/object_storage/policy.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use bytesize::ByteSize; + /// The multipart policy defines when and how multipart upload / download should happen. /// /// The right settings might be vendor specific, but if not available the default values @@ -20,13 +22,13 @@ pub struct MultiPartPolicy { /// Ideal part size. /// Since S3 has a constraint on the number of parts, it cannot always be /// respected. - pub target_part_num_bytes: usize, + pub target_part_num_bytes: ByteSize, /// Maximum number of parts allowed. pub max_num_parts: usize, /// Threshold above which multipart is triggered. - pub multipart_threshold_num_bytes: u64, + pub multipart_threshold_num_bytes: ByteSize, /// Maximum size allowed for an object. - pub max_object_num_bytes: u64, + pub max_object_num_bytes: ByteSize, /// Maximum number of parts to be upload concurrently. pub max_concurrent_uploads: usize, } @@ -39,22 +41,22 @@ impl MultiPartPolicy { /// will not be used. pub fn part_num_bytes(&self, len: u64) -> u64 { assert!( - len < self.max_object_num_bytes, + len < self.max_object_num_bytes.as_u64(), "This object storage does not support object of that size {}", - self.max_object_num_bytes + self.max_object_num_bytes.as_u64() ); assert!( self.max_num_parts > 0, "Misconfiguration: max_num_parts == 0 makes no sense." ); - if len < self.multipart_threshold_num_bytes || self.max_num_parts == 1 { + if len < self.multipart_threshold_num_bytes.as_u64() || self.max_num_parts == 1 { return len; } let max_num_parts = self.max_num_parts as u64; // complete part is the smallest integer such that // * >= len. let min_part_len = 1u64 + (len - 1u64) / max_num_parts; - (min_part_len).max(self.target_part_num_bytes as u64) + min_part_len.max(self.target_part_num_bytes.as_u64()) } /// Limits the number of parts that can be concurrently uploaded. @@ -69,10 +71,10 @@ impl Default for MultiPartPolicy { MultiPartPolicy { // S3 limits part size from 5M to 5GB, we want to end up with as few parts as possible // since each part is charged as a put request. - target_part_num_bytes: 5_000_000_000, // 5GB - multipart_threshold_num_bytes: 128 * 1_024 * 1_024, // 128 MiB + target_part_num_bytes: ByteSize::gib(5), + multipart_threshold_num_bytes: ByteSize::mib(128), max_num_parts: 10_000, - max_object_num_bytes: 5_000_000_000_000u64, // S3 allows up to 5TB objects + max_object_num_bytes: ByteSize::tb(5), // S3 allows up to 5TB objects max_concurrent_uploads: 100, } } diff --git a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs index a407ccdaeca..c2c8f0a68ff 100644 --- a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs +++ b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs @@ -303,6 +303,7 @@ impl S3CompatibleObjectStorage { crate::metrics::OBJECT_STORAGE_PUT_PARTS.inc(); crate::metrics::OBJECT_STORAGE_UPLOAD_NUM_BYTES.inc_by(len); + let _timer = HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_PUT_OBJECT_DURATION); self.s3_client .put_object() @@ -420,6 +421,7 @@ impl S3CompatibleObjectStorage { Ok(delete_requests) } + #[tracing::instrument(skip_all, fields(part_number = part.part_number, num_bytes=part.len()))] async fn upload_part<'a>( &'a self, upload_id: MultipartUploadId, @@ -436,6 +438,7 @@ impl S3CompatibleObjectStorage { crate::metrics::OBJECT_STORAGE_PUT_PARTS.inc(); crate::metrics::OBJECT_STORAGE_UPLOAD_NUM_BYTES.inc_by(part.len()); + let _timer = HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_UPLOAD_PART_DURATION); let upload_part_output = self .s3_client @@ -556,6 +559,7 @@ impl S3CompatibleObjectStorage { let range_str = range_opt.map(|range| format!("bytes={}-{}", range.start, range.end - 1)); crate::metrics::OBJECT_STORAGE_GET_TOTAL.inc(); + let _timer = HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_GET_OBJECT_DURATION); let get_object_output = self .s3_client diff --git a/quickwit/quickwit-storage/src/opendal_storage/base.rs b/quickwit/quickwit-storage/src/opendal_storage/base.rs index a6933db9178..3fe53881e7f 100644 --- a/quickwit/quickwit-storage/src/opendal_storage/base.rs +++ b/quickwit/quickwit-storage/src/opendal_storage/base.rs @@ -84,6 +84,7 @@ impl Storage for OpendalStorage { #[instrument(name = "storage.gcs.put", level = "debug", skip(self, payload), fields(payload_len = payload.len()))] async fn put(&self, path: &Path, payload: Box) -> StorageResult<()> { crate::metrics::OBJECT_STORAGE_PUT_TOTAL.inc(); + let _timer = HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_PUT_OBJECT_DURATION); let path = path.as_os_str().to_string_lossy(); let mut payload_reader = payload.byte_stream().await?.into_async_read(); @@ -125,6 +126,7 @@ impl Storage for OpendalStorage { // recorded before issuing the query to the object store. let _inflight_guards = object_storage_get_slice_in_flight_guards(size); crate::metrics::OBJECT_STORAGE_GET_TOTAL.inc(); + let _timer = HistogramTimer::new(&crate::metrics::OBJECT_STORAGE_GET_OBJECT_DURATION); // `Buffer::to_bytes` is zero-copy when the underlying buffer is contiguous, and coalesces // into a single `Bytes` otherwise — avoiding the extra `Vec` round-trip `to_vec` would // perform. diff --git a/quickwit/quickwit-storage/src/split_cache/download_task.rs b/quickwit/quickwit-storage/src/split_cache/download_task.rs index 9af0390bfb5..50631836203 100644 --- a/quickwit/quickwit-storage/src/split_cache/download_task.rs +++ b/quickwit/quickwit-storage/src/split_cache/download_task.rs @@ -21,7 +21,7 @@ use quickwit_common::split_file; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use crate::split_cache::split_table::{CandidateSplit, DownloadOpportunity}; -use crate::{SplitCache, StorageResolver}; +use crate::{SearchSplitCache, StorageResolver}; async fn download_split( root_path: &Path, @@ -44,7 +44,7 @@ async fn download_split( async fn perform_eviction_and_download( download_opportunity: DownloadOpportunity, - split_cache: Arc, + split_cache: Arc, storage_resolver: StorageResolver, _download_permit: OwnedSemaphorePermit, ) -> anyhow::Result<()> { @@ -67,7 +67,7 @@ async fn perform_eviction_and_download( } pub(crate) fn spawn_download_task( - split_cache: Arc, + split_cache: Arc, storage_resolver: StorageResolver, num_concurrent_downloads: NonZeroU32, ) { diff --git a/quickwit/quickwit-storage/src/split_cache/mod.rs b/quickwit/quickwit-storage/src/split_cache/mod.rs index 1c0de124d73..ca6b914d8d1 100644 --- a/quickwit/quickwit-storage/src/split_cache/mod.rs +++ b/quickwit/quickwit-storage/src/split_cache/mod.rs @@ -40,7 +40,7 @@ use crate::{Storage, StorageCache, wrap_storage_with_cache}; /// On disk Cache of splits for searchers. /// /// The search acts receives reports of splits. -pub struct SplitCache { +pub struct SearchSplitCache { // Directory containing the cached split files. // Split ids are universally unique, so we all put them in the same directory. root_path: PathBuf, @@ -50,14 +50,14 @@ pub struct SplitCache { fd_cache: FileDescriptorCache, } -impl SplitCache { +impl SearchSplitCache { /// Creates a new SplitCache and spawns the task that will continuously search for /// download opportunities. pub fn with_root_path( root_path: PathBuf, storage_resolver: crate::StorageResolver, limits: SplitCacheLimits, - ) -> io::Result> { + ) -> io::Result> { std::fs::create_dir_all(&root_path)?; let mut existing_splits: BTreeMap = Default::default(); for dir_entry_res in std::fs::read_dir(&root_path)? { @@ -103,7 +103,7 @@ impl SplitCache { delete_evicted_splits(&root_path, &splits_to_remove[..]); } let fd_cache = FileDescriptorCache::with_fd_cache_capacity(limits.max_file_descriptors); - let split_cache = Arc::new(SplitCache { + let split_cache = Arc::new(SearchSplitCache { root_path, split_table: Mutex::new(split_table), fd_cache, @@ -193,7 +193,7 @@ fn split_id_from_path(split_path: &Path) -> Option { } struct SplitCacheBackingStorage { - split_cache: Arc, + split_cache: Arc, storage_root_uri: Uri, } diff --git a/quickwit/quickwit-storage/tests/azure_storage.rs b/quickwit/quickwit-storage/tests/azure_storage.rs index c5d4937220a..1619596a822 100644 --- a/quickwit/quickwit-storage/tests/azure_storage.rs +++ b/quickwit/quickwit-storage/tests/azure_storage.rs @@ -45,10 +45,10 @@ async fn azure_storage_test_suite() -> anyhow::Result<()> { object_storage.set_policy(MultiPartPolicy { // On azure, block size is limited between 64KB and 100MB. - target_part_num_bytes: 5 * 1_024 * 1_024, // 5MiB + target_part_num_bytes: bytesize::ByteSize::mib(5), max_num_parts: 10_000, - multipart_threshold_num_bytes: 10_000_000, - max_object_num_bytes: 5_000_000_000_000, + multipart_threshold_num_bytes: bytesize::ByteSize::mb(10), + max_object_num_bytes: bytesize::ByteSize::tb(5), max_concurrent_uploads: 100, }); quickwit_storage::storage_test_multi_part_upload(&mut object_storage) diff --git a/quickwit/quickwit-storage/tests/s3_storage.rs b/quickwit/quickwit-storage/tests/s3_storage.rs index 73e0454af77..428093fb093 100644 --- a/quickwit/quickwit-storage/tests/s3_storage.rs +++ b/quickwit/quickwit-storage/tests/s3_storage.rs @@ -72,10 +72,10 @@ pub mod s3_storage_test_suite { .unwrap(); object_storage.set_policy(MultiPartPolicy { - target_part_num_bytes: 5 * 1_024 * 1_024, //< the minimum on S3 is 5MB. + target_part_num_bytes: bytesize::ByteSize::mib(5), //< the minimum on S3 is 5MB. max_num_parts: 10_000, - multipart_threshold_num_bytes: 10_000_000, - max_object_num_bytes: 5_000_000_000_000, + multipart_threshold_num_bytes: bytesize::ByteSize::mb(10), + max_object_num_bytes: bytesize::ByteSize::tb(5), max_concurrent_uploads: 100, });