+ * This CLI serves as the primary entry point for command-line benchmark execution and
+ * delegates to {@link BenchFrame} for actual benchmark orchestration.
+ *
+ */
+@CommandLine.Command(name = "benchframe",
+        subcommands = {BenchCommand.class, BenchYAMLCommand.class, AutoBenchYAMLCommand.class,
+                DatasetsCommand.class, NBVectorsCommand.class})
+public class BenchFrameCLI implements Callable<Integer>
+{
+
+ /**
+ * Called when no subcommand is specified. Displays help information.
+ *
+ * @return exit code 0
+ */
+ @Override
+ public Integer call() {
+ // If no subcommand, show help
+ CommandLine.usage(this, System.out);
+ return 0;
+ }
+
+ /**
+ * Subcommand for running Bench-style benchmarks with hardcoded grid parameters.
+ * Provides compatibility with the original Bench.java behavior.
+ *
+ * Uses fixed default parameters (M=32, efConstruction=100, etc.) and loads
+ * datasets from the DatasetCollection.
+ */
+ @CommandLine.Command(
+ name = "bench",
+ description = "Run benchmarks with hardcoded grid parameters (original Bench.java style)"
+ )
+ static class BenchCommand implements Callable<Integer> {
+ @CommandLine.Parameters(
+ arity = "0..*",
+ description = "Dataset name patterns (regex). If not specified, matches all datasets."
+ )
+ private String[] datasets = new String[0];
+
+ @Override
+ public Integer call() throws IOException {
+ System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
+ BenchFrame.likeBench().execute(datasets);
+ return 0;
+ }
+ }
+
+ /**
+ * Subcommand for running BenchYAML-style benchmarks with YAML-based configuration.
+ * Provides compatibility with the original BenchYAML.java behavior.
+ *
+ * Loads benchmark parameters from YAML files per dataset, allowing different
+ * configurations for different datasets.
+ */
+ @CommandLine.Command(
+ name = "benchyaml",
+ description = "Run benchmarks with YAML-based configuration (original BenchYAML.java style)"
+ )
+ static class BenchYAMLCommand implements Callable<Integer> {
+ @CommandLine.Parameters(
+ arity = "0..*",
+ description = "Dataset name patterns (regex) or YAML config files. If not specified, matches all datasets."
+ )
+ private String[] datasets = new String[0];
+
+ @Override
+ public Integer call() throws IOException {
+ System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
+ BenchFrame.likeBenchYAML().execute(datasets);
+ return 0;
+ }
+ }
+
+ /**
+ * Subcommand for running AutoBench-style benchmarks in CI/CD mode with checkpointing.
+ * Provides compatibility with the original AutoBenchYAML.java behavior.
+ *
+ * Features:
+ *
+ * - File-based checkpointing for resumption after failures
+ * - CSV summary and JSON detail output
+ * - Hardcoded dataset list for consistent CI/CD runs
+ * - Configurable diagnostic output level
+ *
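+ * A typical CI invocation might look like the following (the output base path and dataset
+ * pattern here are illustrative only):
+ * {@code benchframe autobenchyaml -o results/nightly -d 1 "cohere.*"}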
+ */
+ @CommandLine.Command(
+ name = "autobenchyaml",
+ description = "Run benchmarks for CI/CD with checkpointing and file output (original AutoBenchYAML.java style)"
+ )
+ static class AutoBenchYAMLCommand implements Callable<Integer> {
+ @CommandLine.Parameters(
+ arity = "0..*",
+ description = "Dataset name patterns (regex). If not specified, matches all datasets."
+ )
+ private String[] datasets = new String[0];
+
+ @CommandLine.Option(
+ names = {"-o", "--output"},
+ required = true,
+ description = "Base path for output files (.csv, .json, .checkpoint.json)"
+ )
+ private String outputPath;
+
+ @CommandLine.Option(
+ names = {"-d", "--diag"},
+ description = "Diagnostic level: 0=none, 1=basic, 2=detailed, 3=verbose (default: ${DEFAULT-VALUE})",
+ defaultValue = "0"
+ )
+ private int diagnosticLevel;
+
+ @Override
+ public Integer call() throws IOException {
+ System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
+ BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(datasets);
+ return 0;
+ }
+ }
+
+ /**
+ * Subcommand that delegates to the datatools-nbvectors datasets command.
+ * Provides access to dataset listing and management functionality.
+ */
+ @CommandLine.Command(
+ name = "datasets",
+ description = "List and manage vector datasets (delegates to nbvectors datasets command)"
+ )
+ static class DatasetsCommand implements Callable<Integer> {
+ @CommandLine.Parameters(
+ arity = "0..*",
+ description = "Arguments to pass to the nbvectors datasets command"
+ )
+ private String[] args = new String[0];
+
+ @Override
+ public Integer call() throws Exception {
+ // Delegate to CommandBundler with datasets subcommand
+ String[] nbvectorArgs = new String[args.length + 1];
+ nbvectorArgs[0] = "datasets";
+ System.arraycopy(args, 0, nbvectorArgs, 1, args.length);
+
+ io.nosqlbench.commands.CommandBundler.main(nbvectorArgs);
+ return 0;
+ }
+ }
+
+ /**
+ * Subcommand that delegates to the datatools-nbvectors main CLI.
+ * Provides access to the full nbvectors command-line functionality.
+ */
+ @CommandLine.Command(
+ name = "nbvectors",
+ description = "Access full nbvectors CLI functionality (delegates to CommandBundler)"
+ )
+ static class NBVectorsCommand implements Callable<Integer> {
+ @CommandLine.Parameters(
+ arity = "0..*",
+ description = "Arguments to pass to the nbvectors CLI"
+ )
+ private String[] args = new String[0];
+
+ @Override
+ public Integer call() throws Exception {
+ // Delegate to CommandBundler
+ io.nosqlbench.commands.CommandBundler.main(args);
+ return 0;
+ }
+ }
+
+ /**
+ * Main entry point for command-line execution.
+ *
+ * @param args command-line arguments
+ */
+ public static void main(String[] args) {
+ int exitCode = new CommandLine(new BenchFrameCLI()).execute(args);
+ System.exit(exitCode);
+ }
+
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java
new file mode 100644
index 000000000..edfc556d6
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java
@@ -0,0 +1,490 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.benchframe;
+
+import io.github.jbellis.jvector.example.util.CompressorParameters;
+import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
+import io.github.jbellis.jvector.example.util.DataSet;
+import io.github.jbellis.jvector.example.yaml.MultiConfig;
+import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
+import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
+
+import java.util.*;
+import java.util.function.Function;
+
+import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED;
+
+/**
+/**
+ * Typesafe configuration class for benchmark execution. Provides a unified, immutable configuration
+ * model that encapsulates all parameters needed to run a benchmark including graph construction
+ * parameters, search parameters, and feature sets.
+ *
+ * This class can be used programmatically through its {@link Builder} or constructed from
+ * YAML-based {@link MultiConfig} using {@link #fromMultiConfig(MultiConfig)}.
+ *
+ * All collections returned by getter methods are unmodifiable to maintain immutability.
+ *
+ *
+ * Usage Examples
+ * {@code
+ * // Create from YAML MultiConfig
+ * MultiConfig yaml = MultiConfig.getDefaultConfig("dataset-name");
+ * BenchFrameConfig config = BenchFrameConfig.fromMultiConfig(yaml);
+ *
+ * // Create with Builder
+ * BenchFrameConfig config = new BenchFrameConfig.Builder()
+ * .withDatasetName("my-dataset")
+ * .withMGrid(List.of(16, 32, 64))
+ * .withEfConstructionGrid(List.of(100, 200))
+ * .build();
+ *
+ * // Use default Bench-style configuration
+ * BenchFrameConfig defaults = BenchFrameConfig.createBenchDefaults();
+ * }
+ *
+ * @see MultiConfig
+ * @see BenchFrame
+ */
+public class BenchFrameConfig {
+ // Dataset identification
+ private final String datasetName;
+
+ // Graph construction parameters
+ private final List<Integer> mGrid;
+ private final List<Integer> efConstructionGrid;
+ private final List<Float> neighborOverflowGrid;
+ private final List<Boolean> addHierarchyGrid;
+ private final List<Boolean> refineFinalGraphGrid;
+ private final List<? extends Set<FeatureId>> featureSets;
+ private final List<Function<DataSet, CompressorParameters>> buildCompressors;
+
+ // Search parameters
+ private final List<Function<DataSet, CompressorParameters>> searchCompressors;
+ private final Map<Integer, List<Double>> topKOverqueryGrid;
+ private final List<Boolean> usePruningGrid;
+
+ // Benchmark selection
+ private final Map<String, Map<String, Object>> benchmarkSpec;
+
+ // Result collection mode
+ private final boolean collectResults;
+
+ private BenchFrameConfig(Builder builder) {
+ this.datasetName = builder.datasetName;
+ this.mGrid = Collections.unmodifiableList(builder.mGrid);
+ this.efConstructionGrid = Collections.unmodifiableList(builder.efConstructionGrid);
+ this.neighborOverflowGrid = Collections.unmodifiableList(builder.neighborOverflowGrid);
+ this.addHierarchyGrid = Collections.unmodifiableList(builder.addHierarchyGrid);
+ this.refineFinalGraphGrid = Collections.unmodifiableList(builder.refineFinalGraphGrid);
+ this.featureSets = Collections.unmodifiableList(builder.featureSets);
+ this.buildCompressors = Collections.unmodifiableList(builder.buildCompressors);
+ this.searchCompressors = Collections.unmodifiableList(builder.searchCompressors);
+ this.topKOverqueryGrid = Collections.unmodifiableMap(builder.topKOverqueryGrid);
+ this.usePruningGrid = Collections.unmodifiableList(builder.usePruningGrid);
+ this.benchmarkSpec = builder.benchmarkSpec == null ? null : Collections.unmodifiableMap(builder.benchmarkSpec);
+ this.collectResults = builder.collectResults;
+ }
+
+ /**
+ * Returns the dataset name associated with this configuration.
+ *
+ * @return the dataset name, may be null if not specified
+ */
+ public String getDatasetName() { return datasetName; }
+
+ /**
+ * Returns the grid of M (max connections per node) values to test.
+ *
+ * @return unmodifiable list of M values
+ */
+ public List<Integer> getMGrid() { return mGrid; }
+
+ /**
+ * Returns the grid of efConstruction values to test during graph construction.
+ *
+ * @return unmodifiable list of efConstruction values
+ */
+ public List<Integer> getEfConstructionGrid() { return efConstructionGrid; }
+
+ /**
+ * Returns the grid of neighbor overflow multipliers to test. This controls how many
+ * candidate neighbors are considered relative to M during graph construction.
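+ * For example, with M = 32 and an overflow of 1.2, roughly 38 candidate neighbors may be
+ * retained per node during construction before being pruned back toward M (illustrative
+ * arithmetic only).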
+ *
+ * @return unmodifiable list of neighbor overflow multipliers
+ */
+ public List<Float> getNeighborOverflowGrid() { return neighborOverflowGrid; }
+
+ /**
+ * Returns the grid of add hierarchy boolean values indicating whether to use hierarchical
+ * graph construction.
+ *
+ * @return unmodifiable list of boolean values
+ */
+ public List<Boolean> getAddHierarchyGrid() { return addHierarchyGrid; }
+
+ /**
+ * Returns the grid of refine final graph boolean values indicating whether to perform
+ * final graph refinement after construction.
+ *
+ * @return unmodifiable list of boolean values
+ */
+ public List<Boolean> getRefineFinalGraphGrid() { return refineFinalGraphGrid; }
+
+ /**
+ * Returns the feature sets to test. Each set contains {@link FeatureId}s that enable
+ * specific features like inline vectors or NVQ vectors.
+ *
+ * @return unmodifiable list of feature sets
+ */
+ public List<? extends Set<FeatureId>> getFeatureSets() { return featureSets; }
+
+ /**
+ * Returns the compressor functions to use during graph construction. Each function takes
+ * a {@link DataSet} and returns appropriate {@link CompressorParameters}.
+ *
+ * @return unmodifiable list of compressor parameter functions
+ */
+ public List<Function<DataSet, CompressorParameters>> getBuildCompressors() { return buildCompressors; }
+
+ /**
+ * Returns the compressor functions to use during search. Each function takes
+ * a {@link DataSet} and returns appropriate {@link CompressorParameters}.
+ *
+ * @return unmodifiable list of compressor parameter functions
+ */
+ public List<Function<DataSet, CompressorParameters>> getSearchCompressors() { return searchCompressors; }
+
+ /**
+ * Returns the grid of topK overquery multipliers mapped by K value. For example,
+ * a map entry of (10, [1.0, 2.0, 5.0]) means for top-10 queries, test overquery
+ * factors of 1.0x, 2.0x, and 5.0x.
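+ * Concretely, an entry of (10, [2.0]) asks the search phase to examine roughly
+ * 2.0 × 10 = 20 candidates before returning the best 10 (illustrative arithmetic; the exact
+ * behavior is defined by the search implementation).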
+ *
+ * @return unmodifiable map of K values to overquery multipliers
+ */
+ public Map<Integer, List<Double>> getTopKOverqueryGrid() { return topKOverqueryGrid; }
+
+ /**
+ * Returns the grid of boolean values indicating whether to use search pruning.
+ *
+ * @return unmodifiable list of boolean values
+ */
+ public List<Boolean> getUsePruningGrid() { return usePruningGrid; }
+
+ /**
+ * Returns the benchmark specification mapping benchmark types to their configurations.
+ * A null value indicates all default benchmarks should be run.
+ *
+ * @return unmodifiable map of benchmark specifications, or null for default benchmarks
+ */
+ public Map<String, Map<String, Object>> getBenchmarkSpec() { return benchmarkSpec; }
+
+ /**
+ * Returns whether results should be collected and returned from benchmark execution.
+ *
+ * @return true if results should be collected, false otherwise
+ */
+ public boolean shouldCollectResults() { return collectResults; }
+
+ /**
+ * Creates a new {@link Builder} initialized with this configuration's values.
+ * This is useful for creating modified copies of existing configurations.
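+ * For example, given an existing {@code config} instance (the values shown are arbitrary):
+ * {@code BenchFrameConfig wider = config.toBuilder().withMGrid(List.of(16, 32, 64)).build();}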
+ *
+ * @return a new Builder with this configuration's values
+ */
+ public Builder toBuilder() {
+ return new Builder()
+ .withDatasetName(datasetName)
+ .withMGrid(mGrid)
+ .withEfConstructionGrid(efConstructionGrid)
+ .withNeighborOverflowGrid(neighborOverflowGrid)
+ .withAddHierarchyGrid(addHierarchyGrid)
+ .withRefineFinalGraphGrid(refineFinalGraphGrid)
+ .withFeatureSets(featureSets)
+ .withBuildCompressors(buildCompressors)
+ .withSearchCompressors(searchCompressors)
+ .withTopKOverqueryGrid(topKOverqueryGrid)
+ .withUsePruningGrid(usePruningGrid)
+ .withBenchmarkSpec(benchmarkSpec)
+ .collectResults(collectResults);
+ }
+
+ /**
+ * Creates a BenchFrameConfig from a YAML-based {@link MultiConfig}. This factory method
+ * provides compatibility with the existing YAML configuration system.
+ *
+ * @param config the MultiConfig to convert
+ * @return a new BenchFrameConfig with values from the MultiConfig
+ */
+ public static BenchFrameConfig fromMultiConfig(MultiConfig config) {
+ return new Builder()
+ .withDatasetName(config.dataset)
+ .withMGrid(config.construction.outDegree)
+ .withEfConstructionGrid(config.construction.efConstruction)
+ .withNeighborOverflowGrid(config.construction.neighborOverflow)
+ .withAddHierarchyGrid(config.construction.addHierarchy)
+ .withRefineFinalGraphGrid(config.construction.refineFinalGraph)
+ .withFeatureSets(config.construction.getFeatureSets())
+ .withBuildCompressors(config.construction.getCompressorParameters())
+ .withSearchCompressors(config.search.getCompressorParameters())
+ .withTopKOverqueryGrid(config.search.topKOverquery)
+ .withUsePruningGrid(config.search.useSearchPruning)
+ .withBenchmarkSpec(config.search.benchmarks)
+ .build();
+ }
+
+ /**
+ * Creates a default configuration matching the original Bench.java's hardcoded parameters.
+ * This provides a baseline configuration suitable for most benchmark scenarios.
+ *
+ * Default values include:
+ *
+ * - M: 32
+ * - efConstruction: 100
+ * - neighborOverflow: 1.2
+ * - addHierarchy: true
+ * - refineFinalGraph: true
+ * - usePruning: true
+ * - topK overquery: 10 -> [1.0, 2.0, 5.0, 10.0], 100 -> [1.0, 2.0]
+ * - Feature sets: NVQ_VECTORS and INLINE_VECTORS
+ * - Compressors: PQ for build, both none and PQ for search
+ *
+ *
+ * @return a new BenchFrameConfig with default Bench.java values
+ */
+ public static BenchFrameConfig createBenchDefaults() {
+ return new Builder()
+ .withMGrid(List.of(32))
+ .withEfConstructionGrid(List.of(100))
+ .withNeighborOverflowGrid(List.of(1.2f))
+ .withAddHierarchyGrid(List.of(true))
+ .withRefineFinalGraphGrid(List.of(true))
+ .withUsePruningGrid(List.of(true))
+ .withTopKOverqueryGrid(Map.of(
+ 10, List.of(1.0, 2.0, 5.0, 10.0),
+ 100, List.of(1.0, 2.0)
+ ))
+ .withFeatureSets(Arrays.asList(
+ EnumSet.of(FeatureId.NVQ_VECTORS),
+ EnumSet.of(FeatureId.INLINE_VECTORS)
+ ))
+ .withBuildCompressors(Arrays.asList(
+ ds -> new PQParameters(ds.getDimension() / 8,
+ 256,
+ ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN,
+ UNWEIGHTED),
+ __ -> CompressorParameters.NONE
+ ))
+ .withSearchCompressors(Arrays.asList(
+ __ -> CompressorParameters.NONE,
+ ds -> new PQParameters(ds.getDimension() / 8,
+ 256,
+ ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN,
+ UNWEIGHTED)
+ ))
+ .build();
+ }
+
+ /**
+ * Builder for fluent BenchFrameConfig construction. All builder methods return the builder
+ * instance for method chaining. Collections provided to builder methods are defensively
+ * copied to prevent external modification.
+ *
+ * Default values provide sensible single-value grids:
+ *
+ * - mGrid: [32]
+ * - efConstructionGrid: [100]
+ * - neighborOverflowGrid: [1.2]
+ * - addHierarchyGrid: [true]
+ * - refineFinalGraphGrid: [true]
+ * - featureSets: [INLINE_VECTORS]
+ * - buildCompressors: [NONE]
+ * - searchCompressors: [NONE]
+ * - topKOverqueryGrid: {10: [1.0]}
+ * - usePruningGrid: [true]
+ * - benchmarkSpec: null (use default benchmarks)
+ * - collectResults: false
+ *
+ */
+ public static class Builder {
+ private String datasetName;
+ private List<Integer> mGrid = List.of(32);
+ private List<Integer> efConstructionGrid = List.of(100);
+ private List<Float> neighborOverflowGrid = List.of(1.2f);
+ private List<Boolean> addHierarchyGrid = List.of(true);
+ private List<Boolean> refineFinalGraphGrid = List.of(true);
+ private List<? extends Set<FeatureId>> featureSets = List.of(EnumSet.of(FeatureId.INLINE_VECTORS));
+ private List<Function<DataSet, CompressorParameters>> buildCompressors =
+ List.of(__ -> CompressorParameters.NONE);
+ private List<Function<DataSet, CompressorParameters>> searchCompressors =
+ List.of(__ -> CompressorParameters.NONE);
+ private Map<Integer, List<Double>> topKOverqueryGrid = Map.of(10, List.of(1.0));
+ private List<Boolean> usePruningGrid = List.of(true);
+ private Map<String, Map<String, Object>> benchmarkSpec = null; // null means use default benchmarks
+ private boolean collectResults = false;
+
+ /**
+ * Sets the dataset name.
+ *
+ * @param datasetName the dataset name to associate with this configuration
+ * @return this builder for method chaining
+ */
+ public Builder withDatasetName(String datasetName) {
+ this.datasetName = datasetName;
+ return this;
+ }
+
+ /**
+ * Sets the grid of M (max connections per node) values to test.
+ *
+ * @param mGrid list of M values, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withMGrid(List<Integer> mGrid) {
+ this.mGrid = new ArrayList<>(mGrid);
+ return this;
+ }
+
+ /**
+ * Sets the grid of efConstruction values to test during graph construction.
+ *
+ * @param efConstructionGrid list of efConstruction values, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withEfConstructionGrid(List<Integer> efConstructionGrid) {
+ this.efConstructionGrid = new ArrayList<>(efConstructionGrid);
+ return this;
+ }
+
+ /**
+ * Sets the grid of neighbor overflow multipliers to test.
+ *
+ * @param neighborOverflowGrid list of overflow multipliers, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withNeighborOverflowGrid(List<Float> neighborOverflowGrid) {
+ this.neighborOverflowGrid = new ArrayList<>(neighborOverflowGrid);
+ return this;
+ }
+
+ /**
+ * Sets the grid of add hierarchy boolean values.
+ *
+ * @param addHierarchyGrid list of boolean values, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withAddHierarchyGrid(List<Boolean> addHierarchyGrid) {
+ this.addHierarchyGrid = new ArrayList<>(addHierarchyGrid);
+ return this;
+ }
+
+ /**
+ * Sets the grid of refine final graph boolean values.
+ *
+ * @param refineFinalGraphGrid list of boolean values, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withRefineFinalGraphGrid(List<Boolean> refineFinalGraphGrid) {
+ this.refineFinalGraphGrid = new ArrayList<>(refineFinalGraphGrid);
+ return this;
+ }
+
+ /**
+ * Sets the feature sets to test.
+ *
+ * @param featureSets list of feature sets, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withFeatureSets(List<? extends Set<FeatureId>> featureSets) {
+ this.featureSets = new ArrayList<>(featureSets);
+ return this;
+ }
+
+ /**
+ * Sets the compressor functions to use during graph construction.
+ *
+ * @param buildCompressors list of compressor parameter functions, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withBuildCompressors(List<Function<DataSet, CompressorParameters>> buildCompressors) {
+ this.buildCompressors = new ArrayList<>(buildCompressors);
+ return this;
+ }
+
+ /**
+ * Sets the compressor functions to use during search.
+ *
+ * @param searchCompressors list of compressor parameter functions, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withSearchCompressors(List<Function<DataSet, CompressorParameters>> searchCompressors) {
+ this.searchCompressors = new ArrayList<>(searchCompressors);
+ return this;
+ }
+
+ /**
+ * Sets the grid of topK overquery multipliers.
+ *
+ * @param topKOverqueryGrid map of K values to overquery multipliers, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withTopKOverqueryGrid(Map<Integer, List<Double>> topKOverqueryGrid) {
+ this.topKOverqueryGrid = new HashMap<>(topKOverqueryGrid);
+ return this;
+ }
+
+ /**
+ * Sets the grid of use pruning boolean values.
+ *
+ * @param usePruningGrid list of boolean values, defensively copied
+ * @return this builder for method chaining
+ */
+ public Builder withUsePruningGrid(List<Boolean> usePruningGrid) {
+ this.usePruningGrid = new ArrayList<>(usePruningGrid);
+ return this;
+ }
+
+ /**
+ * Sets the benchmark specification. A null value indicates default benchmarks should be used.
+ *
+ * @param benchmarkSpec map of benchmark specifications, defensively copied if not null
+ * @return this builder for method chaining
+ */
+ public Builder withBenchmarkSpec(Map<String, Map<String, Object>> benchmarkSpec) {
+ this.benchmarkSpec = benchmarkSpec == null ? null : new HashMap<>(benchmarkSpec);
+ return this;
+ }
+
+ /**
+ * Sets whether to collect results.
+ *
+ * @param collectResults true to collect results, false otherwise
+ * @return this builder for method chaining
+ */
+ public Builder collectResults(boolean collectResults) {
+ this.collectResults = collectResults;
+ return this;
+ }
+
+ /**
+ * Builds and returns a configured BenchFrameConfig instance with immutable collections.
+ *
+ * @return a new BenchFrameConfig with the configured values
+ */
+ public BenchFrameConfig build() {
+ return new BenchFrameConfig(this);
+ }
+ }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java
new file mode 100644
index 000000000..6abc0352f
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.github.jbellis.jvector.benchframe;
+
+import java.util.Map;
+
+/**
+/**
+ * Result model for a single benchmark execution. Encapsulates the dataset identifier,
+ * configuration parameters, and performance metrics from a benchmark run.
+ *
+ * This class is designed for serialization to JSON and CSV formats through {@link ResultHandler}
+ * implementations. All fields are public for compatibility with Jackson and other serialization
+ * libraries.
+ *
+ * Typical parameter keys include:
+ *
+ * - {@code M} - max connections per node
+ * - {@code efConstruction} - construction-time search depth
+ * - {@code buildCompressor} - compression used during construction
+ * - {@code searchCompressor} - compression used during search
+ * - {@code featureSet} - enabled feature flags
+ *
+ *
+ * Typical metric keys include:
+ *
+ * - {@code recall} - search accuracy (0.0 to 1.0)
+ * - {@code qps} - queries per second
+ * - {@code latency} - average query latency in milliseconds
+ * - {@code buildTimeMs} - index construction time in milliseconds
+ * - {@code indexSizeBytes} - on-disk index size in bytes
+ *
+ *
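+ * For illustration, a result could be assembled directly; the keys below are examples taken
+ * from the lists above, not a fixed schema:
+ * {@code
+ * BenchResult r = new BenchResult("cap-1M",
+ *     Map.of("M", 32, "efConstruction", 100),
+ *     Map.of("recall", 0.95, "qps", 1200.0));
+ * }
+ *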
+ * @see ResultHandler
+ * @see BenchFrame
+ */
+public class BenchResult {
+ /**
+ * The name of the dataset this result is for.
+ */
+ public String dataset;
+
+ /**
+ * Map of configuration parameters used for this benchmark run.
+ * Keys are parameter names, values are parameter values (typically String, Integer, Boolean, etc.).
+ */
+ public Map<String, Object> parameters;
+
+ /**
+ * Map of performance metrics measured during this benchmark run.
+ * Keys are metric names, values are metric values (typically Double, Long, Integer, etc.).
+ */
+ public Map<String, Object> metrics;
+
+ /**
+ * Default constructor for deserialization.
+ */
+ public BenchResult() {}
+
+ /**
+ * Constructs a BenchResult with the specified dataset, parameters, and metrics.
+ *
+ * @param dataset the dataset name
+ * @param parameters map of configuration parameters
+ * @param metrics map of performance metrics
+ */
+ public BenchResult(String dataset, Map<String, Object> parameters, Map<String, Object> metrics) {
+ this.dataset = dataset;
+ this.parameters = parameters;
+ this.metrics = metrics;
+ }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java
new file mode 100644
index 000000000..f99f722da
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java
@@ -0,0 +1,177 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.benchframe;
+
+import io.github.jbellis.jvector.example.util.CheckpointManager;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Strategy interface for managing benchmark checkpointing. Implements the Strategy pattern
+ * to enable resumable benchmark execution after failures or interruptions.
+ *
+ * Checkpointing is particularly valuable for long-running benchmarks in CI/CD environments
+ * where resource limits or transient failures may interrupt execution. By tracking which
+ * datasets have been completed, benchmarks can resume from where they left off.
+ *
+ * Two implementations are provided:
+ *
+ * - {@link NoCheckpointing} - no-op implementation for simple scenarios
+ * - {@link FileCheckpointing} - persistent file-based checkpointing using JSON
+ *
+ *
+ * Usage Example
+ * {@code
+ * // No checkpointing (default)
+ * CheckpointStrategy strategy = CheckpointStrategy.none();
+ *
+ * // File-based checkpointing
+ * CheckpointStrategy strategy = CheckpointStrategy.fileBasedCheckpointing("results/checkpoint");
+ *
+ * // Custom implementation
+ * CheckpointStrategy strategy = new CheckpointStrategy() {
+ * @Override
+ * public boolean shouldSkipDataset(String datasetName) {
+ * // Check database or cache
+ * return completedDatasets.contains(datasetName);
+ * }
+ *
+ * @Override
+ * public void recordCompletion(String datasetName, List<BenchResult> results) {
+ * // Update database or cache
+ * completedDatasets.add(datasetName);
+ * }
+ *
+ * @Override
+ * public List<BenchResult> getPreviousResults() {
+ * // Load from database or cache
+ * return loadPreviousResults();
+ * }
+ * };
+ * }
+ *
+ * @see BenchFrame.Builder#withCheckpointStrategy(CheckpointStrategy)
+ * @see BenchResult
+ */
+public interface CheckpointStrategy {
+ /**
+ * Checks if a dataset should be skipped because it has already been completed.
+ * This is called before attempting to benchmark each dataset.
+ *
+ * @param datasetName the name of the dataset to check
+ * @return true if the dataset has already been completed and should be skipped, false otherwise
+ */
+ boolean shouldSkipDataset(String datasetName);
+
+ /**
+ * Records the completion of a dataset with its results. This is called after successfully
+ * benchmarking a dataset. Implementations should persist this information to enable resumption.
+ *
+ * @param datasetName the name of the completed dataset
+ * @param results the benchmark results for this dataset
+ */
+ void recordCompletion(String datasetName, List<BenchResult> results);
+
+ /**
+ * Retrieves any previously completed results from earlier runs. These results are included
+ * in the final output to provide a complete view across multiple executions.
+ *
+ * @return list of results from previous runs, or empty list if none exist
+ */
+ List<BenchResult> getPreviousResults();
+
+ /**
+ * Creates a no-op checkpoint strategy that does not track or resume progress.
+ * This is the default for simple benchmark scenarios.
+ *
+ * @return a checkpoint strategy that performs no checkpointing
+ */
+ static CheckpointStrategy none() {
+ return new NoCheckpointing();
+ }
+
+ /**
+ * Creates a file-based checkpoint strategy that persists progress to a JSON checkpoint file
+ * at {@code outputPath.checkpoint.json}, containing completed dataset names and their results.
+ *
+ * @param outputPath base path for checkpoint file (e.g., "results/benchmark")
+ * @return a checkpoint strategy using file-based persistence
+ * @see FileCheckpointing
+ */
+ static CheckpointStrategy fileBasedCheckpointing(String outputPath) {
+ return new FileCheckpointing(outputPath);
+ }
+
+ /**
+ * No-op implementation that performs no checkpointing. All datasets are processed
+ * on every run without tracking completion state.
+ */
+ class NoCheckpointing implements CheckpointStrategy {
+ @Override
+ public boolean shouldSkipDataset(String datasetName) {
+ return false;
+ }
+
+ @Override
+ public void recordCompletion(String datasetName, List<BenchResult> results) {
+ // Do nothing
+ }
+
+ @Override
+ public List<BenchResult> getPreviousResults() {
+ return Collections.emptyList();
+ }
+ }
+
+ /**
+ * File-based implementation that uses {@link CheckpointManager} for persistent checkpointing.
+ * Stores checkpoint state in a JSON file at {@code outputPath.checkpoint.json}.
+ *
+ * The checkpoint file contains:
+ *
+ * - List of completed dataset names
+ * - All benchmark results from completed datasets
+ * - Timestamp of last update
+ *
+ *
+ * On initialization, loads any existing checkpoint file to resume from previous runs.
+ */
+ class FileCheckpointing implements CheckpointStrategy {
+ private final CheckpointManager manager;
+
+ public FileCheckpointing(String outputPath) {
+ this.manager = new CheckpointManager(outputPath);
+ }
+
+ @Override
+ public boolean shouldSkipDataset(String datasetName) {
+ return manager.isDatasetCompleted(datasetName);
+ }
+
+ @Override
+ public void recordCompletion(String datasetName, List<BenchResult> results) {
+ manager.markDatasetCompleted(datasetName, results);
+ }
+
+ @Override
+ public List<BenchResult> getPreviousResults() {
+ return manager.getCompletedResults();
+ }
+ }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java
new file mode 100644
index 000000000..050681763
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java
@@ -0,0 +1,201 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.benchframe;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import io.github.jbellis.jvector.example.util.BenchmarkSummarizer;
+import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Strategy interface for handling benchmark results after execution completes.
+ * Implements the Strategy pattern to decouple result handling from benchmark execution.
+ *
+ * This functional interface supports various output modes including:
+ *
+ * - Console-only output (Grid handles printing)
+ * - File-based output (CSV summary and JSON details)
+ * - Combined output to multiple destinations
+ * - Custom implementations for specialized scenarios
+ *
+ *
+ * Usage Examples
+ * {@code
+ * // Console only (default)
+ * ResultHandler handler = ResultHandler.consoleOnly();
+ *
+ * // Write to files
+ * ResultHandler handler = ResultHandler.toFiles("results/benchmark");
+ *
+ * // Combine multiple handlers
+ * ResultHandler handler = ResultHandler.combining(
+ * ResultHandler.consoleOnly(),
+ * ResultHandler.toFiles("results/benchmark")
+ * );
+ *
+ * // Custom implementation
+ * ResultHandler handler = results -> {
+ * // Send to monitoring system
+ * monitoringService.recordBenchmarks(results);
+ * // Upload to cloud storage
+ * cloudStorage.upload("benchmarks", results);
+ * };
+ * }
+ *
+ * @see BenchResult
+ * @see BenchFrame.Builder#withResultHandler(ResultHandler)
+ */
+@FunctionalInterface
+public interface ResultHandler {
+ /**
+ * Handles the benchmark results after execution completes. Implementations may write
+ * to files, send to external systems, or perform other processing.
+ *
+ * @param results list of benchmark results to handle
+ * @throws IOException if output or I/O operations fail
+ */
+ void handleResults(List<BenchResult> results) throws IOException;
+
+ /**
+ * Creates a no-op result handler that does nothing with results. Console output
+ * is already handled by Grid during benchmark execution.
+ * This matches the behavior of the original Bench.java and BenchYAML.java.
+ *
+ * @return a result handler that performs no additional output
+ */
+ static ResultHandler consoleOnly() {
+ return results -> {
+ // Grid already printed results to console, nothing to do
+ };
+ }
+
+ /**
+ * Creates a result handler that writes results to CSV summary and JSON detail files.
+ * This matches the behavior of AutoBenchYAML.java.
+ *
+ * Files created:
+ *
+ * - {@code outputBasePath.csv} - CSV summary with aggregate statistics per dataset
+ * - {@code outputBasePath.json} - JSON file with complete detailed results
+ *
+ *
+ * The CSV file contains columns: dataset, QPS, QPS StdDev, Mean Latency, Recall@10,
+ * Index Construction Time.
+ *
+ * @param outputBasePath base path for output files (without extension)
+ * @return a result handler that writes to CSV and JSON files
+ * @see FileOutputHandler
+ */
+ static ResultHandler toFiles(String outputBasePath) {
+ return new FileOutputHandler(outputBasePath);
+ }
+
+ /**
+ * Implementation that writes benchmark results to CSV summary and JSON details files.
+ * Uses {@link BenchmarkSummarizer} to calculate aggregate statistics across multiple
+ * benchmark runs.
+ */
+ class FileOutputHandler implements ResultHandler {
+ private static final Logger logger = LoggerFactory.getLogger(FileOutputHandler.class);
+ private final String outputBasePath;
+
+ public FileOutputHandler(String outputBasePath) {
+ this.outputBasePath = outputBasePath;
+ }
+
+ @Override
+ public void handleResults(List<BenchResult> results) throws IOException {
+ if (results.isEmpty()) {
+ logger.warn("No results to write");
+ return;
+ }
+
+ // Calculate summary statistics
+ SummaryStats stats = BenchmarkSummarizer.summarize(results);
+ logger.info("Benchmark summary: {}", stats.toString());
+
+ // Write detailed results to JSON
+ File detailsFile = new File(outputBasePath + ".json");
+ ObjectMapper mapper = new ObjectMapper();
+ mapper.writerWithDefaultPrettyPrinter().writeValue(detailsFile, results);
+ logger.info("Detailed results written to {}", detailsFile.getAbsolutePath());
+
+ // Write summary to CSV
+ File csvFile = new File(outputBasePath + ".csv");
+ writeCsvSummary(results, csvFile);
+ logger.info("Summary results written to {}", csvFile.getAbsolutePath());
+
+ // Verify files were created
+ if (csvFile.exists()) {
+ logger.info("CSV file size: {} bytes", csvFile.length());
+ } else {
+ logger.error("Failed to create CSV file at {}", csvFile.getAbsolutePath());
+ }
+
+ if (detailsFile.exists()) {
+ logger.info("JSON file size: {} bytes", detailsFile.length());
+ } else {
+ logger.error("Failed to create JSON file at {}", detailsFile.getAbsolutePath());
+ }
+ }
+
+ private void writeCsvSummary(List<BenchResult> results, File outputFile) throws IOException {
+ // Get summary statistics by dataset
+ Map<String, SummaryStats> statsByDataset = BenchmarkSummarizer.summarizeByDataset(results);
+
+ try (FileWriter writer = new FileWriter(outputFile)) {
+ // Write CSV header
+ writer.write("dataset,QPS,QPS StdDev,Mean Latency,Recall@10,Index Construction Time\n");
+
+ // Write one row per dataset with average metrics
+ for (Map.Entry<String, SummaryStats> entry : statsByDataset.entrySet()) {
+ String dataset = entry.getKey();
+ SummaryStats datasetStats = entry.getValue();
+
+ writer.write(dataset + ",");
+ writer.write(datasetStats.getAvgQps() + ",");
+ writer.write(datasetStats.getQpsStdDev() + ",");
+ writer.write(datasetStats.getAvgLatency() + ",");
+ writer.write(datasetStats.getAvgRecall() + ",");
+ writer.write(datasetStats.getIndexConstruction() + "\n");
+ }
+ }
+ }
+ }
+
+ /**
+ * Creates a result handler that delegates to multiple handlers in sequence.
+ * If any handler throws an exception, subsequent handlers are not called.
+ *
+ * @param handlers the handlers to combine
+ * @return a result handler that invokes all provided handlers
+ */
+ static ResultHandler combining(ResultHandler... handlers) {
+ return results -> {
+ for (ResultHandler handler : handlers) {
+ handler.handleResults(results);
+ }
+ };
+ }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java
new file mode 100644
index 000000000..117fe9eda
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java
@@ -0,0 +1,90 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.benchframe;
+
+import io.github.jbellis.jvector.example.util.DataSet;
+import io.github.jbellis.jvector.example.util.FloatVectorsWrapper;
+import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
+import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
+import io.github.jbellis.jvector.vector.VectorizationProvider;
+import io.github.jbellis.jvector.vector.types.VectorFloat;
+import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
+import io.nosqlbench.vectordata.discovery.TestDataView;
+import io.nosqlbench.vectordata.spec.datasets.types.NeighborIndices;
+import io.nosqlbench.vectordata.spec.datasets.types.QueryVectors;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
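+/**
+ * Adapts an nbvectors {@link TestDataView} to the example {@link DataSet} abstraction so that
+ * externally catalogued datasets can be run through the benchmark harness. Query vectors are
+ * materialized as JVector {@link VectorFloat} instances, base vectors are exposed through
+ * {@link #getBaseRavv()}, and {@link #getBaseVectors()} deliberately throws because callers
+ * are expected to use the random-access view instead.
+ */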
+public class TestDataViewWrapper implements DataSet {
+ public final TestDataView view;
+ private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport();
+
+ public TestDataViewWrapper(TestDataView view) {
+ this.view = view;
+ }
+
+ @Override
+ public String getName() {
+ return view.getName();
+ }
+
+ @Override
+ public VectorSimilarityFunction getSimilarityFunction() {
+ var df = view.getDistanceFunction();
+ switch (df) {
+ case EUCLIDEAN: return VectorSimilarityFunction.EUCLIDEAN;
+ case COSINE: return VectorSimilarityFunction.COSINE;
+ case DOT_PRODUCT: return VectorSimilarityFunction.DOT_PRODUCT;
+ default: throw new IllegalArgumentException("Unknown distance function " + df);
+ }
+ }
+
+ @Override
+ public List<VectorFloat<?>> getBaseVectors() {
+ throw new RuntimeException("This method should not be called. Use getBaseRavv() instead.");
+ }
+
+ @Override
+ public List<VectorFloat<?>> getQueryVectors() {
+ QueryVectors queryVectors = view.getQueryVectors().orElseThrow(() -> new RuntimeException("unable to load query vectors"));
+ ArrayList<VectorFloat<?>> vectorFloats = new ArrayList<>(queryVectors.getCount());
+ for (float[] qv : queryVectors) {
+ vectorFloats.add(vts.createFloatVector(qv));
+ }
+ return vectorFloats;
+ }
+
+ @Override
+ public List<? extends List<Integer>> getGroundTruth() {
+ // Ground truth is not wired through yet; callers currently receive an empty list.
+ Optional<NeighborIndices> gt = view.getNeighborIndices();
+ return List.of();
+ }
+
+ @Override
+ public int getDimension() {
+ return view.getBaseVectors().get().getVectorDimensions();
+ }
+
+ @Override
+ public RandomAccessVectorValues getBaseRavv() {
+ return view.getBaseVectors().map(FloatVectorsWrapper::new).orElseThrow(() -> new RuntimeException("unable to load float vectors"));
+ }
+}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java
new file mode 100644
index 000000000..d73b951e5
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java
@@ -0,0 +1,230 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Unified benchmark framework for JVector graph indexes. This package consolidates the functionality
+ * from the legacy benchmark classes (Bench, BenchYAML, AutoBenchYAML) into a modular,
+ * composable architecture using closures and strategy interfaces.
+ *
+ * Quick Start
+ *
+ * Command-Line Interface
+ * The recommended way to run benchmarks is via the CLI:
+ *
+ * # Run with hardcoded parameters (original Bench.java)
+ * benchframe bench dataset-name
+ *
+ * # Run with YAML configuration (original BenchYAML.java)
+ * benchframe benchyaml dataset-name
+ *
+ * # Run CI/CD mode with checkpointing (original AutoBenchYAML.java)
+ * benchframe autobenchyaml -o results/output dataset-name
+ *
+ * # List available datasets
+ * benchframe datasets
+ *
+ * # Access full nbvectors functionality
+ * benchframe nbvectors --help
+ *
+ *
+ * Programmatic Usage
+ * For library usage, use the convenience factory methods:
+ * {@code
+ * // Hardcoded defaults (Bench-style)
+ * BenchFrame.likeBench().execute(args);
+ *
+ * // YAML configuration (BenchYAML-style)
+ * BenchFrame.likeBenchYAML().execute(args);
+ *
+ * // CI/CD with checkpointing (AutoBenchYAML-style)
+ * BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(args);
+ * }
+ *
+ *
+ * Developer Documentation
+ *
+ * The sections below provide detailed information for developers working on the BenchFrame itself.
+ *
+ * Package Overview
+ * The benchframe package provides a flexible framework for benchmarking JVector's approximate
+ * nearest neighbor search implementations. It supports multiple execution modes from simple
+ * interactive testing to complex CI/CD scenarios with checkpointing and automated result collection.
+ *
+ * Core Components
+ *
+ * Main Orchestrator
+ *
+ * - {@link io.github.jbellis.jvector.benchframe.BenchFrame} - Main orchestrator that coordinates
+ * benchmark execution using pluggable strategies
+ * - {@link io.github.jbellis.jvector.benchframe.BenchFrameCLI} - Command-line interface providing
+ * subcommands for different benchmark modes
+ *
+ *
+ * Configuration
+ *
+ * - {@link io.github.jbellis.jvector.benchframe.BenchFrameConfig} - Immutable configuration class
+ * encapsulating all benchmark parameters. Can be used as a single shared config or via a
+ * Function for per-dataset configuration (e.g., YAML)
+ *
+ *
+ * Result Handling
+ *
+ * - {@link io.github.jbellis.jvector.benchframe.BenchResult} - Result model encapsulating dataset,
+ * parameters, and metrics
+ * - {@link io.github.jbellis.jvector.benchframe.ResultHandler} - Strategy interface for handling
+ * results (console, files, etc.)
+ *
+ *
+ * Checkpointing
+ *
+ * - {@link io.github.jbellis.jvector.benchframe.CheckpointStrategy} - Strategy interface for
+ * managing resumable benchmark execution
+ *
+ *
+ * Usage Patterns
+ *
+ * Available CLI Subcommands
+ *
+ * - bench - Run with hardcoded grid parameters (original Bench.java)
+ * - benchyaml - Run with YAML-based configuration (original BenchYAML.java)
+ * - autobenchyaml - CI/CD mode with checkpointing and file output (original AutoBenchYAML.java)
+ * - datasets - List and manage vector datasets (delegates to nbvectors)
+ * - nbvectors - Access full nbvectors CLI functionality
+ *
+ *
+ * CLI Examples
+ *
+ * # Run with hardcoded parameters
+ * benchframe bench dataset-name
+ *
+ * # Run with YAML configuration
+ * benchframe benchyaml dataset-name
+ *
+ * # Run CI/CD mode with checkpointing (--output required)
+ * benchframe autobenchyaml -o results/output dataset-name
+ * benchframe autobenchyaml -o results/output -d 2 cap-1M
+ *
+ * # List available datasets
+ * benchframe datasets
+ * benchframe datasets search cohere
+ *
+ * # Access nbvectors functionality
+ * benchframe nbvectors --help
+ * benchframe nbvectors catalogs list
+ *
+ *
+ * Environment Variables
+ *
+ * - VECTORDATA_CATALOGS - Comma-separated list of additional catalog YAML files
+ * to load (e.g., "~/.config/custom/catalogs.yaml,~/work/catalogs.yaml")
+ *
+ *
+ * Programmatic Usage - Factory Methods
+ * Factory methods provide pre-configured instances matching legacy behavior:
+ * {@code
+ * // Bench-style: hardcoded defaults
+ * BenchFrame frame = BenchFrame.likeBench();
+ * frame.execute(new String[]{"dataset-name"});
+ *
+ * // BenchYAML-style: YAML configuration
+ * BenchFrame frame = BenchFrame.likeBenchYAML();
+ * frame.execute(new String[]{"dataset-name"});
+ *
+ * // AutoBenchYAML-style: CI/CD with checkpointing
+ * BenchFrame frame = BenchFrame.likeAutoBenchYAML("results/benchmark", 2);
+ * frame.execute(new String[]{"dataset-name"});
+ * }
+ *
+ * Programmatic Usage - Custom Configuration
+ * The Builder API provides fine-grained control over all aspects:
+ * {@code
+ * // With a single shared config
+ * BenchFrame frame = new BenchFrame.Builder()
+ * .withDatasetNames(List.of("dataset1", "dataset2"))
+ * .withConfig(BenchFrameConfig.createBenchDefaults())
+ * .withDataSetSource(DataSetSource.DEFAULT)
+ * .withResultHandler(ResultHandler.toFiles("results/benchmark"))
+ * .withCheckpointStrategy(CheckpointStrategy.fileBasedCheckpointing("results/checkpoint"))
+ * .withDiagnosticLevel(2)
+ * .build();
+ *
+ * // With per-dataset config function (like YAML)
+ * BenchFrame frame = new BenchFrame.Builder()
+ * .withDatasetNames(List.of("dataset1", "dataset2"))
+ * .withConfigFunction(name -> loadCustomConfig(name))
+ * .build();
+ *
+ * frame.execute(new String[]{".*"});
+ * }
+ *
+ * Extension Points
+ * The framework is designed for extension through closures and strategy interfaces:
+ *
+ * Custom Configuration Function
+ * {@code
+ * Function<String, BenchFrameConfig> customConfigFn = datasetName -> {
+ * // Load from database, REST API, etc.
+ * return new BenchFrameConfig.Builder()
+ * .withDatasetName(datasetName)
+ * .withMGrid(List.of(16, 32, 64))
+ * .build();
+ * };
+ * }
+ *
+ * Custom Result Handler
+ * {@code
+ * ResultHandler customHandler = results -> {
+ * // Send to monitoring system
+ * monitoringSystem.record(results);
+ * // Upload to cloud storage
+ * cloudStorage.upload("benchmarks", results);
+ * };
+ * }
+ *
+ * Custom Checkpoint Strategy
+ * {@code
+ * CheckpointStrategy customStrategy = new CheckpointStrategy() {
+ * public boolean shouldSkipDataset(String name) {
+ * return database.isCompleted(name);
+ * }
+ * public void recordCompletion(String name, List<BenchResult> results) {
+ * database.markCompleted(name, results);
+ * }
+ * public List<BenchResult> getPreviousResults() {
+ * return database.loadPreviousResults();
+ * }
+ * };
+ * }
+ *
+ * Architecture Benefits
+ *
+ * - Modularity: Clean separation of concerns through strategy interfaces
+ * - Composability: Mix and match strategies for different scenarios
+ * - Testability: Easy to test components in isolation with mock strategies
+ * - Extensibility: Add new strategies without modifying existing code
+ * - Backward Compatibility: Factory methods preserve legacy behavior
+ *
+ *
+ * Thread Safety
+ * The framework components are generally not thread-safe and are designed for single-threaded
+ * benchmark execution. {@link io.github.jbellis.jvector.benchframe.BenchFrameConfig} instances
+ * are immutable and thread-safe once constructed.
+ *
+ * @see io.github.jbellis.jvector.benchframe.BenchFrame
+ * @see io.github.jbellis.jvector.benchframe.BenchFrameCLI
+ * @see io.github.jbellis.jvector.example.Grid
+ */
+package io.github.jbellis.jvector.benchframe;
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java
index f8aa81575..47582e227 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java
@@ -16,209 +16,90 @@
package io.github.jbellis.jvector.example;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import io.github.jbellis.jvector.example.util.BenchmarkSummarizer;
-import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats;
-import io.github.jbellis.jvector.example.util.CheckpointManager;
-import io.github.jbellis.jvector.example.util.DataSet;
-import io.github.jbellis.jvector.example.util.DataSetLoader;
-import io.github.jbellis.jvector.example.yaml.ConstructionParameters;
-import io.github.jbellis.jvector.example.yaml.MultiConfig;
-import io.github.jbellis.jvector.example.yaml.SearchParameters;
-import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
-
+import io.github.jbellis.jvector.benchframe.BenchFrame;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.File;
-import java.io.FileWriter;
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
/**
* Automated benchmark runner for GitHub Actions workflow.
* This class is specifically designed to handle the --output argument
* for regression testing in the run-bench.yml workflow.
- *
+ *
* The benchmark runner supports checkpointing to allow resuming from failures.
* It creates a checkpoint file (outputPath + ".checkpoint.json") that records
* which datasets have been fully processed. If the benchmark is restarted,
* it will skip datasets that have already been processed, allowing it to
* continue from where it left off rather than starting over from the beginning.
+ *
+ * This class has been refactored to use BenchFrame for modularity and DRY principles.
+ * All shared functionality is now in reusable modules.
*/
+@Deprecated
public class AutoBenchYAML {
private static final Logger logger = LoggerFactory.getLogger(AutoBenchYAML.class);
- /**
- * Returns a list of all dataset names.
- * This replaces the need to load datasets.yml which may not be available in all environments.
- */
- private static List<String> getAllDatasetNames() {
- List<String> allDatasets = new ArrayList<>();
- allDatasets.add("cap-1M");
- allDatasets.add("cap-6M");
- allDatasets.add("cohere-english-v3-1M");
- allDatasets.add("cohere-english-v3-10M");
- allDatasets.add("dpr-1M");
- allDatasets.add("dpr-10M");
-
- return allDatasets;
- }
-
public static void main(String[] args) throws IOException {
- // Check for --output argument (required for this class)
- String outputPath = null;
- for (int i = 0; i < args.length - 1; i++) {
- if (args[i].equals("--output")) outputPath = args[i+1];
- }
-
+ // Parse command-line arguments
+ String outputPath = extractArgument(args, "--output");
if (outputPath == null) {
logger.error("Error: --output argument is required for AutoBenchYAML");
System.exit(1);
}
- logger.info("Heap space available is {}", Runtime.getRuntime().maxMemory());
-
- // Initialize checkpoint manager
- CheckpointManager checkpointManager = new CheckpointManager(outputPath);
- logger.info("Initialized checkpoint manager. Already completed datasets: {}", checkpointManager.getCompletedDatasets());
+ int diagnosticLevel = extractIntArgument(args, "--diag", 0);
+ String[] filteredArgs = filterArguments(args, "--output", outputPath, "--diag", String.valueOf(diagnosticLevel));
- // Filter out --output, --config and their arguments from the args
- String finalOutputPath = outputPath;
- String configPath = null;
- int diagnostic_level = 0;
- for (int i = 0; i < args.length - 1; i++) {
- if (args[i].equals("--config")) configPath = args[i+1];
- if (args[i].equals("--diag")) diagnostic_level = Integer.parseInt(args[i+1]);
- }
- if (diagnostic_level > 0) {
- Grid.setDiagnosticLevel(diagnostic_level);
- }
- String finalConfigPath = configPath;
- String[] filteredArgs = Arrays.stream(args)
- .filter(arg -> !arg.equals("--output") && !arg.equals(finalOutputPath) &&
- !arg.equals("--config") && !arg.equals(finalConfigPath))
- .toArray(String[]::new);
-
- // Log the filtered arguments for debugging
+ logger.info("Heap space available is {}", Runtime.getRuntime().maxMemory());
logger.info("Filtered arguments: {}", Arrays.toString(filteredArgs));
- // generate a regex that matches any regex in filteredArgs, or if filteredArgs is empty/null, match everything
- var regex = filteredArgs.length == 0 ? ".*" : Arrays.stream(filteredArgs).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|"));
- logger.info("Generated regex pattern: {}", regex);
-
- // compile regex and do substring matching using find
- var pattern = Pattern.compile(regex);
-
- var datasetNames = getAllDatasetNames().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
-
- logger.info("Executing the following datasets: {}", datasetNames);
- List<BenchResult> results = new ArrayList<>();
- // Add results from checkpoint if present
- results.addAll(checkpointManager.getCompletedResults());
-
- // Process datasets from regex patterns
- if (!datasetNames.isEmpty()) {
- for (var datasetName : datasetNames) {
- // Skip already completed datasets
- if (checkpointManager.isDatasetCompleted(datasetName)) {
- logger.info("Skipping already completed dataset: {}", datasetName);
- continue;
- }
-
- logger.info("Loading dataset: {}", datasetName);
- try {
- DataSet ds = DataSetLoader.loadDataSet(datasetName);
- logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.baseVectors.size());
-
- String normalizedDatasetName = datasetName;
- if (normalizedDatasetName.endsWith(".hdf5")) {
- normalizedDatasetName = normalizedDatasetName.substring(0, normalizedDatasetName.length() - ".hdf5".length());
- }
-
- MultiConfig config;
- if (finalConfigPath != null) {
- config = MultiConfig.getConfig(finalConfigPath);
- // Override dataset name if not specified in custom config
- if (config.dataset == null || config.dataset.isEmpty()) {
- config.dataset = normalizedDatasetName;
- }
- } else {
- config = MultiConfig.getDefaultConfig("autoDefault");
- config.dataset = normalizedDatasetName;
- }
- logger.info("Using configuration: {}", config);
-
- List<BenchResult> datasetResults = Grid.runAllAndCollectResults(ds,
- config.construction.outDegree,
- config.construction.efConstruction,
- config.construction.neighborOverflow,
- config.construction.addHierarchy,
- config.construction.getFeatureSets(),
- config.construction.getCompressorParameters(),
- config.search.getCompressorParameters(),
- config.search.topKOverquery,
- config.search.useSearchPruning);
- results.addAll(datasetResults);
+ // Execute benchmark using convenience method
+ BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(filteredArgs);
+ }
- logger.info("Benchmark completed for dataset: {}", datasetName);
- // Mark dataset as completed and update checkpoint, passing results
- checkpointManager.markDatasetCompleted(datasetName, datasetResults);
- } catch (Exception e) {
- logger.error("Exception while processing dataset {}", datasetName, e);
- }
+ /**
+ * Extract a string argument value from command-line args
+ */
+ private static String extractArgument(String[] args, String flag) {
+ for (int i = 0; i < args.length - 1; i++) {
+ if (args[i].equals(flag)) {
+ return args[i + 1];
}
}
+ return null;
+ }
- // Calculate summary statistics
+ /**
+ * Extract an integer argument value from command-line args
+ */
+ private static int extractIntArgument(String[] args, String flag, int defaultValue) {
+ String value = extractArgument(args, flag);
+ if (value == null) {
+ return defaultValue;
+ }
try {
- SummaryStats stats = BenchmarkSummarizer.summarize(results);
- logger.info("Benchmark summary: {}", stats.toString());
-
- // Write results to csv file and details to json
- File detailsFile = new File(outputPath + ".json");
- ObjectMapper mapper = new ObjectMapper();
- mapper.writerWithDefaultPrettyPrinter().writeValue(detailsFile, results);
-
- File outputFile = new File(outputPath + ".csv");
-
- // Get summary statistics by dataset
- Map<String, SummaryStats> statsByDataset = BenchmarkSummarizer.summarizeByDataset(results);
-
- // Write CSV data
- try (FileWriter writer = new FileWriter(outputFile)) {
- // Write CSV header
- writer.write("dataset,QPS,QPS StdDev,Mean Latency,Recall@10,Index Construction Time\n");
-
- // Write one row per dataset with average metrics
- for (Map.Entry<String, SummaryStats> entry : statsByDataset.entrySet()) {
- String dataset = entry.getKey();
- SummaryStats datasetStats = entry.getValue();
-
- writer.write(dataset + ",");
- writer.write(datasetStats.getAvgQps() + ",");
- writer.write(datasetStats.getQpsStdDev() + ",");
- writer.write(datasetStats.getAvgLatency() + ",");
- writer.write(datasetStats.getAvgRecall() + ",");
- writer.write(datasetStats.getIndexConstruction() + "\n");
- }
- }
-
- logger.info("Benchmark results written to {} (file exists: {})", outputPath, outputFile.exists());
- // Double check that the file was created and log its size
- if (outputFile.exists()) {
- logger.info("Output file size: {} bytes", outputFile.length());
- } else {
- logger.error("Failed to create output file at {}", outputPath);
- }
- } catch (Exception e) {
- logger.error("Exception during final processing", e);
+ return Integer.parseInt(value);
+ } catch (NumberFormatException e) {
+ logger.warn("Invalid integer value for {}: {}", flag, value);
+ return defaultValue;
}
}
+ /**
+ * Filter out specific arguments and their values from the args array
+ */
+ private static String[] filterArguments(String[] args, String... toFilter) {
+ return Arrays.stream(args)
+ .filter(arg -> {
+ for (String filter : toFilter) {
+ if (arg.equals(filter)) {
+ return false;
+ }
+ }
+ return true;
+ })
+ .toArray(String[]::new);
+ }
}
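For orientation, here is a minimal sketch of what the refactored entry point now amounts to end to end. The class name, output path, and dataset pattern are illustrative; it assumes only the BenchFrame calls and helper semantics introduced above.

    import io.github.jbellis.jvector.benchframe.BenchFrame;

    import java.io.IOException;

    public class AutoBenchSmoke {
        public static void main(String[] args) throws IOException {
            // Equivalent of: AutoBenchYAML --output /tmp/autobench --diag 1 "cap-.*"
            //   extractArgument(args, "--output")     -> "/tmp/autobench"
            //   extractIntArgument(args, "--diag", 0) -> 1
            //   filterArguments(...)                  -> {"cap-.*"}
            BenchFrame.likeAutoBenchYAML("/tmp/autobench", 1).execute(new String[] {"cap-.*"});
        }
    }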
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java
index 4623cbe9d..6b675acab 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java
@@ -16,82 +16,22 @@
package io.github.jbellis.jvector.example;
-import io.github.jbellis.jvector.example.util.CompressorParameters;
-import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
-import io.github.jbellis.jvector.example.util.DataSet;
-import io.github.jbellis.jvector.example.util.DataSetLoader;
-import io.github.jbellis.jvector.example.yaml.DatasetCollection;
-import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
-import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
+import io.github.jbellis.jvector.benchframe.BenchFrame;
import java.io.IOException;
-import java.util.Arrays;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Map;
-import java.util.function.Function;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED;
/**
- * Tests GraphIndexes against vectors from various datasets
+ * Tests GraphIndexes against vectors from various datasets using hardcoded grid parameters.
+ *
+ * This class has been refactored to delegate to {@link BenchFrame}; the shared
+ * benchmark logic now lives in reusable modules.
+ *
+ * @deprecated Use {@link BenchFrame#likeBench()} directly instead. This class will be removed in a future release.
*/
+@Deprecated(forRemoval = true)
public class Bench {
public static void main(String[] args) throws IOException {
System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
-
- var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128);
- var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800);
- var topKGrid = Map.of(
- 10, // topK
- List.of(1.0, 2.0, 5.0, 10.0), // oq
- 100, // topK
- List.of(1.0, 2.0) // oq
- ); // rerankK = oq * topK
- var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f);
- var addHierarchyGrid = List.of(true); // List.of(false, true);
- var refineFinalGraphGrid = List.of(true); // List.of(false, true);
- var usePruningGrid = List.of(true); // List.of(false, true);
- List<Function<DataSet, CompressorParameters>> buildCompression = Arrays.asList(
- ds -> new PQParameters(ds.getDimension() / 8,
- 256,
- ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN,
- UNWEIGHTED),
- __ -> CompressorParameters.NONE
- );
- List<Function<DataSet, CompressorParameters>> searchCompression = Arrays.asList(
- __ -> CompressorParameters.NONE,
- // ds -> new CompressorParameters.BQParameters(),
- ds -> new PQParameters(ds.getDimension() / 8,
- 256,
- ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN,
- UNWEIGHTED)
- );
- List<EnumSet<FeatureId>> featureSets = Arrays.asList(
- EnumSet.of(FeatureId.NVQ_VECTORS),
-// EnumSet.of(FeatureId.NVQ_VECTORS, FeatureId.FUSED_ADC),
- EnumSet.of(FeatureId.INLINE_VECTORS)
- );
-
- // args is list of regexes, possibly needing to be split by whitespace.
- // generate a regex that matches any regex in args, or if args is empty/null, match everything
- var regex = args.length == 0 ? ".*" : Arrays.stream(args).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|"));
- // compile regex and do substring matching using find
- var pattern = Pattern.compile(regex);
-
- execute(pattern, buildCompression, featureSets, searchCompression, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, topKGrid, usePruningGrid);
- }
-
- private static void execute(Pattern pattern, List<Function<DataSet, CompressorParameters>> buildCompression, List<EnumSet<FeatureId>> featureSets, List<Function<DataSet, CompressorParameters>> compressionGrid, List<Integer> mGrid, List<Integer> efConstructionGrid, List<Float> neighborOverflowGrid, List<Boolean> addHierarchyGrid, List<Boolean> refineFinalGraphGrid, Map<Integer, List<Double>> topKGrid, List<Boolean> usePruningGrid) throws IOException {
- var datasetCollection = DatasetCollection.load();
- var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
- System.out.println("Executing the following datasets: " + datasetNames);
-
- for (var datasetName : datasetNames) {
- DataSet ds = DataSetLoader.loadDataSet(datasetName);
- Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid);
- }
+ BenchFrame.likeBench().execute(args);
}
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java
deleted file mode 100644
index dc639f5ea..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package io.github.jbellis.jvector.example;
-
-import io.github.jbellis.jvector.example.util.*;
-import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters;
-import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Map;
-import java.util.function.Function;
-
-import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED;
-
-/**
- * Tests GraphIndexes against vectors from a 2D dataset
- */
-public class Bench2D {
- public static void main(String[] args) throws IOException {
- System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
-
- var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128);
- var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800);
- var topKGrid = Map.of(
- 10, // topK
- List.of(1.0, 2.0, 5.0, 10.0, 20.0) // oq
- ); // rerankK = oq * topK
- var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f);
- var addHierarchyGrid = List.of(true); // List.of(false, true);
- var refineFinalGraphGrid = List.of(true); // List.of(false, true);
- var usePruningGrid = List.of(false); // List.of(false, true);
- List<Function<DataSet, CompressorParameters>> buildCompression = Arrays.asList(__ -> CompressorParameters.NONE);
- List<Function<DataSet, CompressorParameters>> searchCompression = Arrays.asList(
- __ -> CompressorParameters.NONE,
- ds -> new PQParameters(ds.getDimension(), 256, true, UNWEIGHTED)
- );
- List<EnumSet<FeatureId>> featureSets = Arrays.asList(
- EnumSet.of(FeatureId.NVQ_VECTORS),
- EnumSet.of(FeatureId.INLINE_VECTORS)
- );
-
- // 2D grid, built and calculated at runtime
- var grid2d = DataSetCreator.create2DGrid(4_000_000, 10_000, 100);
-
- Grid.runAll(grid2d, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid,
- featureSets, buildCompression, searchCompression, topKGrid, usePruningGrid);
- }
-}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java
deleted file mode 100644
index 5eeeff736..000000000
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright DataStax, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package io.github.jbellis.jvector.example;
-
-import java.util.Map;
-
-public class BenchResult {
- public String dataset;
- public Map parameters;
- public Map metrics;
-
- public BenchResult() {}
- public BenchResult(String dataset, Map parameters, Map metrics) {
- this.dataset = dataset;
- this.parameters = parameters;
- this.metrics = metrics;
- }
-}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java
index e81a84863..ab2c5991b 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java
@@ -16,72 +16,22 @@
package io.github.jbellis.jvector.example;
-import io.github.jbellis.jvector.example.util.DataSet;
-import io.github.jbellis.jvector.example.util.DataSetLoader;
-import io.github.jbellis.jvector.example.yaml.DatasetCollection;
-import io.github.jbellis.jvector.example.yaml.MultiConfig;
+import io.github.jbellis.jvector.benchframe.BenchFrame;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
/**
- * Tests GraphIndexes against vectors from various datasets
+ * Tests GraphIndexes against vectors from various datasets using YAML-based configuration.
+ *
+ * This class has been refactored to delegate to {@link BenchFrame}; the shared
+ * benchmark logic now lives in reusable modules.
+ *
+ * @deprecated Use {@link BenchFrame#likeBenchYAML()} directly instead. This class will be removed in a future release.
*/
+@Deprecated(forRemoval = true)
public class BenchYAML {
public static void main(String[] args) throws IOException {
- // args is one of:
- // - a list of regexes, possibly needing to be split by whitespace.
- // - a list of YAML files
-
System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory());
-
- // generate a regex that matches any regex in args, or if args is empty/null, match everything
- var regex = args.length == 0 ? ".*" : Arrays.stream(args).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|"));
- // compile regex and do substring matching using find
- var pattern = Pattern.compile(regex);
-
- var datasetCollection = DatasetCollection.load();
- var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList());
-
- List<MultiConfig> allConfigs = new ArrayList<>();
-
- if (!datasetNames.isEmpty()) {
- System.out.println("Executing the following datasets: " + datasetNames);
-
- for (var datasetName : datasetNames) {
- DataSet ds = DataSetLoader.loadDataSet(datasetName);
-
- if (datasetName.endsWith(".hdf5")) {
- datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length());
- }
- MultiConfig config = MultiConfig.getDefaultConfig(datasetName);
- allConfigs.add(config);
- }
- }
-
- // get the list of YAML files from args
- List<String> configNames = Arrays.stream(args).filter(s -> s.endsWith(".yml")).collect(Collectors.toList());
-
- if (!configNames.isEmpty()) {
- for (var configName : configNames) {
- MultiConfig config = MultiConfig.getDefaultConfig(configName);
- allConfigs.add(config);
- }
- }
-
- for (var config : allConfigs) {
- String datasetName = config.dataset;
-
- DataSet ds = DataSetLoader.loadDataSet(datasetName);
-
- Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction,
- config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph,
- config.construction.getFeatureSets(), config.construction.getCompressorParameters(),
- config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning, config.search.benchmarks);
- }
+ BenchFrame.likeBenchYAML().execute(args);
}
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
index a4d62645f..c3e756cb2 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java
@@ -17,6 +17,7 @@
package io.github.jbellis.jvector.example;
import io.github.jbellis.jvector.disk.ReaderSupplierFactory;
+import io.github.jbellis.jvector.benchframe.BenchResult;
import io.github.jbellis.jvector.example.benchmarks.AccuracyBenchmark;
import io.github.jbellis.jvector.example.benchmarks.BenchmarkTablePrinter;
import io.github.jbellis.jvector.example.benchmarks.CountBenchmark;
@@ -87,7 +88,7 @@ public class Grid {
private static int diagnostic_level;
- static void runAll(DataSet ds,
+ public static void runAll(DataSet ds,
List<Integer> mGrid,
List<Integer> efConstructionGrid,
List<Float> neighborOverflowGrid,
@@ -175,7 +176,7 @@ static void runOneGraph(List<? extends Set<FeatureId>> featureSets,
} else {
long start = System.nanoTime();
cv = compressor.encodeAll(ds.getBaseRavv());
- System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.baseVectors.size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0);
+ System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.getBaseVectors().size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0);
}
indexes.forEach((features, index) -> {
@@ -211,7 +212,7 @@ private static Map<Set<FeatureId>, ImmutableGraphIndex> buildOnDisk(List<? exten
var floatVectors = ds.getBaseRavv();
var pq = (PQVectors) buildCompressor.encodeAll(floatVectors);
- var bsp = BuildScoreProvider.pqBuildScoreProvider(ds.similarityFunction, pq);
+ var bsp = BuildScoreProvider.pqBuildScoreProvider(ds.getSimilarityFunction(), pq);
GraphIndexBuilder builder = new GraphIndexBuilder(bsp, floatVectors.dimension(), M, efConstruction, neighborOverflow, 1.2f, addHierarchy, refineFinalGraph);
// use the inline vectors index as the score provider for graph construction
@@ -277,7 +278,7 @@ private static Map<Set<FeatureId>, ImmutableGraphIndex> buildOnDisk(List<? exten
builder.close();
double totalTime = (System.nanoTime() - startTime) / 1_000_000_000.0;
System.out.format("Build and write %s in %ss%n", featureSets, totalTime);
- indexBuildTimes.put(ds.name, totalTime);
+ indexBuildTimes.put(ds.getName(), totalTime);
// open indexes
Map<Set<FeatureId>, ImmutableGraphIndex> indexes = new HashMap<>();
@@ -369,7 +370,7 @@ private static Map<Set<FeatureId>, ImmutableGraphIndex> buildInMemory(List<? ext
var floatVectors = ds.getBaseRavv();
Map<Set<FeatureId>, ImmutableGraphIndex> indexes = new HashMap<>();
long start;
- var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.similarityFunction);
+ var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.getSimilarityFunction());
GraphIndexBuilder builder = new GraphIndexBuilder(bsp,
floatVectors.dimension(),
M,
@@ -590,9 +591,10 @@ public static List<BenchResult> runAllAndCollectResults(
);
for (Metric metric : metricsList) {
Map metrics = java.util.Map.of(metric.getHeader(), metric.getValue());
- results.add(new BenchResult(ds.name, params, metrics));
+ results.add(new BenchResult(ds.getName(), params, metrics));
}
- results.add(new BenchResult(ds.name, params, Map.of("Index Build Time", indexBuildTimes.get(ds.name))));
+ results.add(new BenchResult(ds.getName(), params, Map.of("Index " +
+ "Build Time", indexBuildTimes.get(ds.getName()))));
}
}
}
@@ -675,17 +677,17 @@ public static class ConfiguredSystem implements AutoCloseable {
public SearchScoreProvider scoreProviderFor(VectorFloat<?> queryVector, ImmutableGraphIndex.View view) {
// if we're not compressing then just use the exact score function
if (cv == null) {
- return DefaultSearchScoreProvider.exact(queryVector, ds.similarityFunction, ds.getBaseRavv());
+ return DefaultSearchScoreProvider.exact(queryVector, ds.getSimilarityFunction(), ds.getBaseRavv());
}
var scoringView = (ImmutableGraphIndex.ScoringView) view;
ScoreFunction.ApproximateScoreFunction asf;
if (features.contains(FeatureId.FUSED_ADC)) {
- asf = scoringView.approximateScoreFunctionFor(queryVector, ds.similarityFunction);
+ asf = scoringView.approximateScoreFunctionFor(queryVector, ds.getSimilarityFunction());
} else {
- asf = cv.precomputedScoreFunctionFor(queryVector, ds.similarityFunction);
+ asf = cv.precomputedScoreFunctionFor(queryVector, ds.getSimilarityFunction());
}
- var rr = scoringView.rerankerFor(queryVector, ds.similarityFunction);
+ var rr = scoringView.rerankerFor(queryVector, ds.getSimilarityFunction());
return new DefaultSearchScoreProvider(asf, rr);
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
index a09d1a0e7..a53e682dd 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java
@@ -31,7 +31,8 @@ public static void main(String[] args) throws IOException {
String datasetName = "ada002-100k";
- var mfd = DownloadHelper.maybeDownloadFvecs(datasetName);
+ var mfd = DownloadHelper.maybeDownloadFvecs(datasetName)
+ .orElseThrow(() -> new IllegalArgumentException("Unknown dataset: " + datasetName));
DataSet ds = mfd.load();
MultiConfig config = MultiConfig.getConfig(datasetName);
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java
index a99aca6f8..4cb72d1a5 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java
@@ -88,7 +88,7 @@ public List runBenchmark(
throw new RuntimeException("At least one metric must be displayed");
}
- int totalQueries = cs.getDataSet().queryVectors.size();
+ int totalQueries = cs.getDataSet().getQueryVectors().size();
// execute all queries in parallel and collect results
List<SearchResult> results = IntStream.range(0, totalQueries)
@@ -101,14 +101,14 @@ public List runBenchmark(
if (computeRecall) {
// compute recall for this run
double recall = AccuracyMetrics.recallFromSearchResults(
- cs.getDataSet().groundTruth, results, topK, topK
+ cs.getDataSet().getGroundTruth(), results, topK, topK
);
list.add(Metric.of("Recall@" + topK, formatRecall, recall));
}
if (computeMAP) {
// compute recall for this run
double map = AccuracyMetrics.meanAveragePrecisionAtK(
- cs.getDataSet().groundTruth, results, topK
+ cs.getDataSet().getGroundTruth(), results, topK
);
list.add(Metric.of("MAP@" + topK, formatMAP, map));
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java
index d4fe68456..cd5d228c2 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java
@@ -105,7 +105,7 @@ public List runBenchmark(
LongAdder nodesVisited = new LongAdder();
LongAdder nodesExpanded = new LongAdder();
LongAdder nodesExpandedBaseLayer = new LongAdder();
- int totalQueries = cs.getDataSet().queryVectors.size();
+ int totalQueries = cs.getDataSet().getQueryVectors().size();
for (int run = 0; run < queryRuns; run++) {
IntStream.range(0, totalQueries)
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java
index 449a8409f..9872142d5 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java
@@ -56,7 +56,7 @@ public List runBenchmark(
boolean usePruning,
int queryRuns) {
- int totalQueries = cs.getDataSet().queryVectors.size();
+ int totalQueries = cs.getDataSet().getQueryVectors().size();
double totalRuntime = 0;
for (int run = 0; run < queryRuns; run++) {
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java
index 861a8d2be..eefc5ee5c 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java
@@ -104,7 +104,7 @@ public List runBenchmark(
throw new IllegalArgumentException("At least one parameter must be set to true");
}
- int totalQueries = cs.getDataSet().queryVectors.size();
+ int totalQueries = cs.getDataSet().getQueryVectors().size();
double mean = 0.0;
double m2 = 0.0;
int count = 0;
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java
index 9ec728808..3c202c28b 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java
@@ -33,7 +33,7 @@ public class QueryExecutor {
* @return the SearchResult for query i.
*/
public static SearchResult executeQuery(ConfiguredSystem cs, int topK, int rerankK, boolean usePruning, int i) {
- var queryVector = cs.getDataSet().queryVectors.get(i);
+ var queryVector = cs.getDataSet().getQueryVectors().get(i);
var searcher = cs.getSearcher();
searcher.usePruning(usePruning);
var sf = cs.scoreProviderFor(queryVector, searcher.getView());
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java
index 27b99fa71..c00893fa5 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java
@@ -137,7 +137,7 @@ public List runBenchmark(
throw new RuntimeException("At least one metric must be displayed");
}
- int totalQueries = cs.getDataSet().queryVectors.size();
+ int totalQueries = cs.getDataSet().getQueryVectors().size();
int dim = cs.getDataSet().getDimension();
// Warmup Phase with diagnostics
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java
index dba6064ab..88e406551 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java
@@ -15,7 +15,7 @@
*/
package io.github.jbellis.jvector.example.util;
-import io.github.jbellis.jvector.example.BenchResult;
+import io.github.jbellis.jvector.benchframe.BenchResult;
import java.util.List;
import java.util.Map;
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java
index 4145100b2..d09347c5b 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java
@@ -16,6 +16,7 @@
package io.github.jbellis.jvector.example.util;
import com.fasterxml.jackson.databind.ObjectMapper;
+import io.github.jbellis.jvector.benchframe.BenchResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -35,7 +36,7 @@ public class CheckpointManager {
private final String checkpointPath;
private final ObjectMapper mapper;
private final Set completedDatasets;
- private final List completedResults;
+ private final List completedResults;
/**
* Creates a new CheckpointManager for the given output path.
@@ -88,7 +89,7 @@ public boolean isDatasetCompleted(String datasetName) {
* @param datasetName The name of the dataset
* @param resultsForDataset The results for the dataset
*/
- public void markDatasetCompleted(String datasetName, List resultsForDataset) {
+ public void markDatasetCompleted(String datasetName, List resultsForDataset) {
completedDatasets.add(datasetName);
if (resultsForDataset != null) {
completedResults.addAll(resultsForDataset);
@@ -123,7 +124,7 @@ public Set getCompletedDatasets() {
/**
* Returns the list of completed BenchResults.
*/
- public List getCompletedResults() {
+ public List getCompletedResults() {
return new ArrayList<>(completedResults);
}
@@ -132,13 +133,13 @@ public List getCompletedResults()
*/
private static class CheckpointData {
private List completedDatasets;
- private List completedResults;
+ private List completedResults;
public CheckpointData() {
// Default constructor for Jackson
}
- public CheckpointData(List completedDatasets, List completedResults) {
+ public CheckpointData(List completedDatasets, List completedResults) {
this.completedDatasets = completedDatasets;
this.completedResults = completedResults;
}
@@ -151,11 +152,11 @@ public void setCompletedDatasets(List completedDatasets) {
this.completedDatasets = completedDatasets;
}
- public List getCompletedResults() {
+ public List getCompletedResults() {
return completedResults;
}
- public void setCompletedResults(List completedResults) {
+ public void setCompletedResults(List completedResults) {
this.completedResults = completedResults;
}
}
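A minimal sketch of how CheckpointManager is expected to be driven with the relocated BenchResult type, mirroring the loop the old AutoBenchYAML main() performed inline; the output path and dataset names are illustrative, and the benchmark invocation itself is elided.

    import io.github.jbellis.jvector.benchframe.BenchResult;
    import io.github.jbellis.jvector.example.util.CheckpointManager;

    import java.util.ArrayList;
    import java.util.List;

    public class CheckpointSketch {
        public static void main(String[] args) throws Exception {
            // Resume-aware loop: restore completed results, skip finished datasets.
            CheckpointManager checkpoints = new CheckpointManager("/tmp/autobench");
            List<BenchResult> results = new ArrayList<>(checkpoints.getCompletedResults());
            for (String dataset : List.of("cap-1M", "dpr-1M")) {
                if (checkpoints.isDatasetCompleted(dataset)) {
                    continue; // finished in a previous run, results already restored above
                }
                List<BenchResult> datasetResults = List.of(); // ... run the benchmark here
                results.addAll(datasetResults);
                checkpoints.markDatasetCompleted(dataset, datasetResults);
            }
        }
    }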
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java
index e1ffebb9b..f84b69938 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java
@@ -55,7 +55,7 @@ public VectorCompressor> computeCompressor(DataSet ds) {
@Override
public String idStringFor(DataSet ds) {
- return String.format("PQ_%s_%d_%d_%s_%s", ds.name, m, k, isCentered, anisotropicThreshold);
+ return String.format("PQ_%s_%d_%d_%s_%s", ds.getName(), m, k, isCentered, anisotropicThreshold);
}
@Override
@@ -85,7 +85,7 @@ public VectorCompressor> computeCompressor(DataSet ds) {
@Override
public String idStringFor(DataSet ds) {
- return String.format("NVQ_%s_%d_%s", ds.name, nSubVectors);
+ return String.format("NVQ_%s_%d_%s", ds.getName(), nSubVectors);
}
@Override
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java
index e193cd6ad..4b39ad23b 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java
@@ -16,151 +16,19 @@
package io.github.jbellis.jvector.example.util;
-import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
-import io.github.jbellis.jvector.vector.VectorUtil;
import io.github.jbellis.jvector.vector.types.VectorFloat;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-public class DataSet {
- public final String name;
- public final VectorSimilarityFunction similarityFunction;
- public final List<VectorFloat<?>> baseVectors;
- public final List<VectorFloat<?>> queryVectors;
- public final List<? extends List<Integer>> groundTruth;
- private RandomAccessVectorValues baseRavv;
+public interface DataSet {
+ String getName();
+ VectorSimilarityFunction getSimilarityFunction();
+ List<VectorFloat<?>> getBaseVectors();
+ List<VectorFloat<?>> getQueryVectors();
+ List<? extends List<Integer>> getGroundTruth();
+ int getDimension();
+ RandomAccessVectorValues getBaseRavv();
- public DataSet(String name,
- VectorSimilarityFunction similarityFunction,
- List> baseVectors,
- List> queryVectors,
- List extends List> groundTruth)
- {
- if (baseVectors.isEmpty()) {
- throw new IllegalArgumentException("Base vectors must not be empty");
- }
- if (queryVectors.isEmpty()) {
- throw new IllegalArgumentException("Query vectors must not be empty");
- }
- if (groundTruth.isEmpty()) {
- throw new IllegalArgumentException("Ground truth vectors must not be empty");
- }
-
- if (baseVectors.get(0).length() != queryVectors.get(0).length()) {
- throw new IllegalArgumentException("Base and query vectors must have the same dimensionality");
- }
- if (queryVectors.size() != groundTruth.size()) {
- throw new IllegalArgumentException("Query and ground truth lists must be the same size");
- }
-
- this.name = name;
- this.similarityFunction = similarityFunction;
- this.baseVectors = baseVectors;
- this.queryVectors = queryVectors;
- this.groundTruth = groundTruth;
-
- System.out.format("%n%s: %d base and %d query vectors created, dimensions %d%n",
- name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length());
- }
-
- /**
- * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length.
- * Note: This only scrubs and normalizes for dot product similarity.
- */
- public static DataSet getScrubbedDataSet(String pathStr,
- VectorSimilarityFunction vsf,
- List> baseVectors,
- List> queryVectors,
- List> groundTruth)
- {
- // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers
- List> scrubbedBaseVectors;
- List> scrubbedQueryVectors;
- List> gtSet;
- scrubbedBaseVectors = new ArrayList<>(baseVectors.size());
- scrubbedQueryVectors = new ArrayList<>(queryVectors.size());
- gtSet = new ArrayList<>(groundTruth.size());
- var uniqueVectors = new TreeSet>((a, b) -> {
- assert a.length() == b.length();
- for (int i = 0; i < a.length(); i++) {
- if (a.get(i) < b.get(i)) {
- return -1;
- }
- if (a.get(i) > b.get(i)) {
- return 1;
- }
- }
- return 0;
- });
- Map rawToScrubbed = new HashMap<>();
- {
- int j = 0;
- for (int i = 0; i < baseVectors.size(); i++) {
- VectorFloat> v = baseVectors.get(i);
- var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5;
- if (valid && uniqueVectors.add(v)) {
- scrubbedBaseVectors.add(v);
- rawToScrubbed.put(i, j++);
- }
- }
- }
- // also remove zero query vectors and query vectors that are present in the base set
- for (int i = 0; i < queryVectors.size(); i++) {
- VectorFloat> v = queryVectors.get(i);
- var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5;
- var dupe = uniqueVectors.contains(v);
- if (valid && !dupe) {
- scrubbedQueryVectors.add(v);
- var gt = new ArrayList();
- for (int j : groundTruth.get(i)) {
- gt.add(rawToScrubbed.get(j));
- }
- gtSet.add(gt);
- }
- }
-
- // now that the zero vectors are removed, we can normalize if it looks like they aren't already
- if (vsf == VectorSimilarityFunction.DOT_PRODUCT) {
- if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) {
- normalizeAll(scrubbedBaseVectors);
- normalizeAll(scrubbedQueryVectors);
- }
- }
-
- assert scrubbedQueryVectors.size() == gtSet.size();
- return new DataSet(pathStr, vsf, scrubbedBaseVectors, scrubbedQueryVectors, gtSet);
- }
-
- private static void normalizeAll(Iterable> vectors) {
- for (VectorFloat> v : vectors) {
- VectorUtil.l2normalize(v);
- }
- }
-
- private static float normOf(VectorFloat> baseVector) {
- float norm = 0;
- for (int i = 0; i < baseVector.length(); i++) {
- norm += baseVector.get(i) * baseVector.get(i);
- }
- return (float) Math.sqrt(norm);
- }
-
- public int getDimension() {
- return baseVectors.get(0).length();
- }
-
- public RandomAccessVectorValues getBaseRavv() {
- if (baseRavv == null) {
- baseRavv = new ListRandomAccessVectorValues(baseVectors, getDimension());
- }
- return baseRavv;
- }
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java
index 1cd532160..40a709f6a 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java
@@ -73,6 +73,6 @@ public static DataSet create2DGrid(int nPoints, int nQueries, int topK) {
var groundTruth = queries.stream().map(Map.Entry::getValue).collect(Collectors.toList());
String name = "2D" + gridWidth;
- return new DataSet(name, VectorSimilarityFunction.EUCLIDEAN, baseVectors, queryVectors, groundTruth);
+ return new SimpleDataSet(name, VectorSimilarityFunction.EUCLIDEAN, baseVectors, queryVectors, groundTruth);
}
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java
index e90a6f275..75b764bbb 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java
@@ -16,18 +16,168 @@
package io.github.jbellis.jvector.example.util;
+import io.github.jbellis.jvector.benchframe.TestDataViewWrapper;
+import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator;
+import io.nosqlbench.vectordata.discovery.TestDataSources;
+import io.nosqlbench.vectordata.discovery.TestDataView;
+import io.nosqlbench.vectordata.downloader.Catalog;
+import io.nosqlbench.vectordata.downloader.DatasetEntry;
+
import java.io.IOException;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+
+public class DataSetLoader implements DataSetSource {
+
+ private final DataSetSource[] loaders;
+
+ public DataSetLoader(DataSetSource... loaders) {
+ this.loaders = loaders;
+ }
+
+ @Override
+ public Optional<DataSet> apply(String name) {
+ for (DataSetSource loader : loaders) {
+ Optional<DataSet> result = loader.apply(name);
+ if (result.isPresent()) {
+ return result;
+ }
+ }
+ return Optional.empty();
+ }
+
+ @Override
+ public String toString() {
+ return "DataSetLoader{loaders=" + loaders.length + "}";
+ }
+
+ public final static DataSetSource FVecsDownloader = new DataSetSource() {
+ @Override
+ public Optional<DataSet> apply(String name) {
+ Optional<MultiFileDatasource> mfdOpt = DownloadHelper.maybeDownloadFvecs(name);
+ if (mfdOpt.isEmpty()) {
+ return Optional.empty();
+ }
+
+ try {
+ var ds = mfdOpt.get().load();
+ return Optional.of(ds);
+ } catch (IOException e) {
+ System.err.println("error while trying to load dataset: " + e + ", this error handling "
+ + "path needs to be updated");
+ return Optional.empty();
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "FVecsDownloader";
+ }
+ };
+
+ public final static DataSetSource HDF5Loader = new DataSetSource() {
+
+ @Override
+ public Optional<DataSet> apply(String name) {
+ if (name.endsWith(".hdf5")) {
+ DownloadHelper.maybeDownloadHdf5(name);
+ return Optional.of(Hdf5Loader.load(name));
+ }
+ return Optional.empty();
+ }
+
+ @Override
+ public String toString() {
+ return "HDF5Loader";
+ }
+ };
+
+ /**
+ * VectorData downloader that loads datasets from the vectordata catalog system.
+ * Supports optional additional catalogs via the VECTORDATA_CATALOGS environment variable.
+ *
+ * Environment variable format:
+ * VECTORDATA_CATALOGS=~/.config/custom1/catalogs.yaml,~/.config/custom2/catalogs.yaml
+ */
+ public static final DataSetSource vectorDataDownloader = new DataSetSource() {
+ private final Catalog catalog = initializeCatalog();
-public class DataSetLoader {
- public static DataSet loadDataSet(String fileName) throws IOException {
- DataSet ds;
- if (fileName.endsWith(".hdf5")) {
- DownloadHelper.maybeDownloadHdf5(fileName);
- ds = Hdf5Loader.load(fileName);
- } else {
- var mfd = DownloadHelper.maybeDownloadFvecs(fileName);
- ds = mfd.load();
+ private Catalog initializeCatalog() {
+ TestDataSources sources = new TestDataSources().configure();
+
+ // Add additional catalogs from environment variable
+ String envCatalogs = System.getenv("VECTORDATA_CATALOGS");
+ if (envCatalogs != null && !envCatalogs.trim().isEmpty()) {
+ String[] catalogPaths = envCatalogs.split(",");
+ for (String catalogPath : catalogPaths) {
+ String trimmedPath = catalogPath.trim();
+ if (!trimmedPath.isEmpty()) {
+ System.out.println("Adding optional catalog from VECTORDATA_CATALOGS: " + trimmedPath);
+ sources.addOptionalCatalogs(trimmedPath);
+ }
}
- return ds;
+ }
+
+ return sources.catalog();
+ }
+
+ @Override
+ public Optional<DataSet> apply(String name) {
+ name = name.contains(":") ? name : name + ":default";
+
+ TestDataView tdv = catalog.profile(name);
+ System.out.println("prebuffering dataset '" + name + "' (assumed performance oriented testing)");
+
+ CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer();
+ if (statusFuture instanceof ProgressIndicator<?>) {
+ ((ProgressIndicator<?>) statusFuture).monitorProgress(1000);
+ }
+
+ TestDataViewWrapper tdw = new TestDataViewWrapper(tdv);
+ System.out.println("Loaded " + tdw.getName() + " from streaming source");
+ return Optional.of(tdw);
+ }
+
+ @Override
+ public String toString() {
+ String envCatalogs = System.getenv("VECTORDATA_CATALOGS");
+ return "VectorDataDownloader{defaultCatalog=~/.config/vectordata/catalogs.yaml" +
+ (envCatalogs != null ? ", additionalCatalogs=" + envCatalogs : "") + "}";
}
+ };
+
+ /**
+ * Creates a VectorDataDownloader with a specific catalog path.
+ * Use this when you need a custom catalog location programmatically.
+ * For most use cases, prefer using the VECTORDATA_CATALOGS environment variable instead.
+ *
+ * @param catalogPath path to the catalog YAML file (e.g., "~/.config/vectordata/catalogs.yaml")
+ * @return a DataSetSource that can load from the specified catalog
+ */
+ public static DataSetSource createVectorDataDownloader(String catalogPath) {
+ Catalog catalog = new TestDataSources()
+ .configure()
+ .addOptionalCatalogs(catalogPath)
+ .catalog();
+
+ return name -> {
+ Optional<DatasetEntry> dsentryOption = catalog.matchOne(name);
+ if (dsentryOption.isEmpty()) {
+ return Optional.empty();
+ }
+
+ DatasetEntry dsentry = dsentryOption.get();
+ TestDataView tdv = dsentry.select().profile(name);
+
+ System.out.println("prebuffering dataset (assumed performance oriented testing)");
+ CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer();
+ if (statusFuture instanceof ProgressIndicator<?>) {
+ ((ProgressIndicator<?>) statusFuture).monitorProgress(1000);
+ }
+
+ TestDataViewWrapper tdw = new TestDataViewWrapper(tdv);
+ System.out.println("Loaded " + tdw.getName() + " from streaming source");
+ return Optional.of(tdw);
+ };
+ }
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java
new file mode 100644
index 000000000..b8e2042c8
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.example.util;
+
+import java.util.Optional;
+import java.util.function.Function;
+
+public interface DataSetSource extends Function<String, Optional<DataSet>> {
+ public DataSetSource DEFAULT = new DataSetLoader(DataSetLoader.HDF5Loader, DataSetLoader.FVecsDownloader,
+ DataSetLoader.vectorDataDownloader);
+
+ public default DataSetSource and(DataSetSource... loaders) {
+ DataSetSource first = this;
+ return name -> {
+ // try this source first, then the additional loaders in order
+ Optional<DataSet> ds = first.apply(name);
+ if (ds.isPresent()) {
+ return ds;
+ }
+ for (var loader : loaders) {
+ ds = loader.apply(name);
+ if (ds.isPresent()) {
+ return ds;
+ }
+ }
+ return Optional.empty();
+ };
+ }
+}
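A minimal usage sketch of the new source-chaining API, assuming only what this file and DataSetLoader declare; the custom catalog path and dataset name are illustrative.

    import io.github.jbellis.jvector.example.util.DataSet;
    import io.github.jbellis.jvector.example.util.DataSetLoader;
    import io.github.jbellis.jvector.example.util.DataSetSource;

    import java.util.Optional;

    public class DataSetSourceSketch {
        public static void main(String[] args) {
            // DEFAULT tries HDF5 files, then fvec downloads, then the vectordata catalog;
            // and() appends a source backed by a custom catalog (path is illustrative).
            DataSetSource source = DataSetSource.DEFAULT
                    .and(DataSetLoader.createVectorDataDownloader("~/.config/custom/catalogs.yaml"));

            Optional<DataSet> ds = source.apply("cohere-english-v3-1M");
            ds.ifPresent(d -> System.out.format("Loaded %s: %d base vectors, dimension %d%n",
                    d.getName(), d.getBaseVectors().size(), d.getDimension()));
        }
    }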
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java
new file mode 100644
index 000000000..0a4a62421
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java
@@ -0,0 +1,112 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.example.util;
+
+import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
+import io.github.jbellis.jvector.vector.VectorUtil;
+import io.github.jbellis.jvector.vector.types.VectorFloat;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeSet;
+
+public class DataSetUtils {
+
+ /**
+ * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length.
+ * Note: This only scrubs and normalizes for dot product similarity.
+ */
+ public static DataSet getScrubbedDataSet(String pathStr,
+ VectorSimilarityFunction vsf,
+ List<VectorFloat<?>> baseVectors,
+ List<VectorFloat<?>> queryVectors,
+ List<List<Integer>> groundTruth)
+ {
+ // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers
+ List<VectorFloat<?>> scrubbedBaseVectors;
+ List<VectorFloat<?>> scrubbedQueryVectors;
+ List<List<Integer>> gtSet;
+ scrubbedBaseVectors = new ArrayList<>(baseVectors.size());
+ scrubbedQueryVectors = new ArrayList<>(queryVectors.size());
+ gtSet = new ArrayList<>(groundTruth.size());
+ var uniqueVectors = new TreeSet<VectorFloat<?>>((a, b) -> {
+ assert a.length() == b.length();
+ for (int i = 0; i < a.length(); i++) {
+ if (a.get(i) < b.get(i)) {
+ return -1;
+ }
+ if (a.get(i) > b.get(i)) {
+ return 1;
+ }
+ }
+ return 0;
+ });
+ Map<Integer, Integer> rawToScrubbed = new HashMap<>();
+ {
+ int j = 0;
+ for (int i = 0; i < baseVectors.size(); i++) {
+ VectorFloat<?> v = baseVectors.get(i);
+ var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5;
+ if (valid && uniqueVectors.add(v)) {
+ scrubbedBaseVectors.add(v);
+ rawToScrubbed.put(i, j++);
+ }
+ }
+ }
+ // also remove zero query vectors and query vectors that are present in the base set
+ for (int i = 0; i < queryVectors.size(); i++) {
+ VectorFloat<?> v = queryVectors.get(i);
+ var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5;
+ var dupe = uniqueVectors.contains(v);
+ if (valid && !dupe) {
+ scrubbedQueryVectors.add(v);
+ var gt = new ArrayList<Integer>();
+ for (int j : groundTruth.get(i)) {
+ gt.add(rawToScrubbed.get(j));
+ }
+ gtSet.add(gt);
+ }
+ }
+
+ // now that the zero vectors are removed, we can normalize if it looks like they aren't already
+ if (vsf == VectorSimilarityFunction.DOT_PRODUCT) {
+ if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) {
+ normalizeAll(scrubbedBaseVectors);
+ normalizeAll(scrubbedQueryVectors);
+ }
+ }
+
+ assert scrubbedQueryVectors.size() == gtSet.size();
+ return new SimpleDataSet(pathStr, vsf, scrubbedBaseVectors, scrubbedQueryVectors, gtSet);
+ }
+
+ private static void normalizeAll(Iterable<VectorFloat<?>> vectors) {
+ for (VectorFloat<?> v : vectors) {
+ VectorUtil.l2normalize(v);
+ }
+ }
+
+ private static float normOf(VectorFloat<?> baseVector) {
+ float norm = 0;
+ for (int i = 0; i < baseVector.length(); i++) {
+ norm += baseVector.get(i) * baseVector.get(i);
+ }
+ return (float) Math.sqrt(norm);
+ }
+}
\ No newline at end of file
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
index 8725a6f65..052388d3d 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java
@@ -36,6 +36,7 @@
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.List;
+import java.util.Optional;
import java.util.Set;
public class DownloadHelper {
@@ -55,11 +56,11 @@ private static S3AsyncClientBuilder s3AsyncClientBuilder() {
.credentialsProvider(AnonymousCredentialsProvider.create());
}
- public static MultiFileDatasource maybeDownloadFvecs(String name) {
+ public static Optional<MultiFileDatasource> maybeDownloadFvecs(String name) {
String bucket = infraDatasets.contains(name) ? infraBucketName : bucketName;
var mfd = MultiFileDatasource.byName.get(name);
if (mfd == null) {
- throw new IllegalArgumentException("Unknown dataset: " + name);
+ return Optional.empty();
}
// TODO how to detect and recover from incomplete downloads?
@@ -68,6 +69,7 @@ public static MultiFileDatasource maybeDownloadFvecs(String name) {
Files.createDirectories(Paths.get(fvecDir).resolve(mfd.directory()));
} catch (IOException e) {
System.err.println("Failed to create directory: " + e.getMessage());
+ return Optional.empty();
}
try (S3AsyncClient s3Client = s3AsyncClientBuilder().build()) {
@@ -104,11 +106,11 @@ public static MultiFileDatasource maybeDownloadFvecs(String name) {
}
tm.close();
} catch (Exception e) {
- System.out.println("Error downloading data from S3: " + e.getMessage());
- System.exit(1);
+ System.err.println("Error downloading data from S3: " + e.getMessage());
+ return Optional.empty();
}
- return mfd;
+ return Optional.of(mfd);
}
public static void maybeDownloadHdf5(String datasetName) {
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java
new file mode 100644
index 000000000..3b0447d44
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.example.util;
+
+import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
+import io.github.jbellis.jvector.vector.ArrayVectorFloat;
+import io.github.jbellis.jvector.vector.VectorizationProvider;
+import io.github.jbellis.jvector.vector.types.VectorFloat;
+import io.github.jbellis.jvector.vector.types.VectorTypeSupport;
+import io.nosqlbench.vectordata.spec.datasets.types.FloatVectors;
+
+import java.util.function.Supplier;
+
+/// Wrapper that adapts a nosqlbench FloatVectors instance to implement RandomAccessVectorValues
+public class FloatVectorsWrapper implements RandomAccessVectorValues {
+ private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport();
+
+ private final FloatVectors floatVectors;
+ private final int dimension;
+
+ public FloatVectorsWrapper(FloatVectors floatVectors) {
+ this.floatVectors = floatVectors;
+ this.dimension = floatVectors.getVectorDimensions();
+ }
+
+ @Override
+ public int size() {
+ return floatVectors.getCount();
+ }
+
+ @Override
+ public int dimension() {
+ return floatVectors.getVectorDimensions();
+ }
+
+ @Override
+ public VectorFloat<?> getVector(int nodeId) {
+ return vts.createFloatVector(floatVectors.get(nodeId));
+ }
+
+ @Override
+ public boolean isValueShared() {
+ return true;
+ }
+
+ @Override
+ public RandomAccessVectorValues copy() {
+ return new FloatVectorsWrapper(floatVectors);
+ }
+
+ @Override
+ public Supplier<RandomAccessVectorValues> threadLocalSupplier() {
+ return () -> new FloatVectorsWrapper(floatVectors);
+ }
+}
\ No newline at end of file
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java
index 7dfdccc07..baca10f5f 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java
@@ -82,6 +82,6 @@ else if (filename.contains("-euclidean")) {
}
}
- return DataSet.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
+ return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets);
}
}
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java
index 6f875e23c..8bba2bd88 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java
@@ -51,7 +51,7 @@ public DataSet load() throws IOException {
var baseVectors = SiftLoader.readFvecs("fvec/" + basePath);
var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath);
var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath);
- return DataSet.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors);
+ return DataSetUtils.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors);
}
public static Map<String, MultiFileDatasource> byName = new HashMap<>() {{
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java
new file mode 100644
index 000000000..0d3c752dd
--- /dev/null
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java
@@ -0,0 +1,104 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.github.jbellis.jvector.example.util;
+
+import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
+import io.github.jbellis.jvector.graph.RandomAccessVectorValues;
+import io.github.jbellis.jvector.vector.VectorSimilarityFunction;
+import io.github.jbellis.jvector.vector.types.VectorFloat;
+
+import java.util.List;
+
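+/**
+ * Minimal in-memory {@link DataSet} implementation backed by pre-loaded vector lists.
+ * The constructor validates that all inputs are non-empty, that base and query vectors
+ * share a dimensionality, and that every query has a matching ground-truth entry.
+ */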
+public class SimpleDataSet implements DataSet {
+ private final String name;
+ private final VectorSimilarityFunction similarityFunction;
+ private final List<VectorFloat<?>> baseVectors;
+ private final List<VectorFloat<?>> queryVectors;
+ private final List<? extends List<Integer>> groundTruth;
+ private RandomAccessVectorValues baseRavv;
+
+ public SimpleDataSet(String name,
+ VectorSimilarityFunction similarityFunction,
+ List<VectorFloat<?>> baseVectors,
+ List<VectorFloat<?>> queryVectors,
+ List<? extends List<Integer>> groundTruth)
+ {
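+ // Fail fast on inconsistent inputs before any benchmark work starts.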
+ if (baseVectors.isEmpty()) {
+ throw new IllegalArgumentException("Base vectors must not be empty");
+ }
+ if (queryVectors.isEmpty()) {
+ throw new IllegalArgumentException("Query vectors must not be empty");
+ }
+ if (groundTruth.isEmpty()) {
+ throw new IllegalArgumentException("Ground truth vectors must not be empty");
+ }
+
+ if (baseVectors.get(0).length() != queryVectors.get(0).length()) {
+ throw new IllegalArgumentException("Base and query vectors must have the same dimensionality");
+ }
+ if (queryVectors.size() != groundTruth.size()) {
+ throw new IllegalArgumentException("Query and ground truth lists must be the same size");
+ }
+
+ this.name = name;
+ this.similarityFunction = similarityFunction;
+ this.baseVectors = baseVectors;
+ this.queryVectors = queryVectors;
+ this.groundTruth = groundTruth;
+
+ System.out.format("%n%s: %d base and %d query vectors created, dimensions %d%n",
+ name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length());
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ @Override
+ public VectorSimilarityFunction getSimilarityFunction() {
+ return similarityFunction;
+ }
+
+ @Override
+ public List<VectorFloat<?>> getBaseVectors() {
+ return baseVectors;
+ }
+
+ @Override
+ public List<VectorFloat<?>> getQueryVectors() {
+ return queryVectors;
+ }
+
+ @Override
+ public List<? extends List<Integer>> getGroundTruth() {
+ return groundTruth;
+ }
+
+ @Override
+ public int getDimension() {
+ return baseVectors.get(0).length();
+ }
+
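+ /**
+ * Lazily wraps the base vectors in a {@link ListRandomAccessVectorValues} and caches the
+ * result. Not synchronized, so concurrent first calls may briefly build duplicate
+ * wrappers over the same backing list.
+ */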
+ @Override
+ public RandomAccessVectorValues getBaseRavv() {
+ if (baseRavv == null) {
+ baseRavv = new ListRandomAccessVectorValues(baseVectors, getDimension());
+ }
+ return baseRavv;
+ }
+}
\ No newline at end of file
diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java
index fe7d4d82d..ef1013e80 100644
--- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java
+++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java
@@ -42,7 +42,7 @@ public Function<DataSet, CompressorParameters> getCompressorParameters() {
return ds -> {
boolean centerData;
if (strCenterData == null) {
- centerData = ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN;
+ centerData = ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN;
} else {
centerData = strCenterData.equals("Yes");;
}
diff --git a/jvector-examples/src/main/resources/logback.xml b/jvector-examples/src/main/resources/logback.xml
deleted file mode 100644
index 0a7d8846a..000000000
--- a/jvector-examples/src/main/resources/logback.xml
+++ /dev/null
@@ -1,19 +0,0 @@
-
-
-
-
-
- true
-
- %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
-
-
-
-
-
-
-
-
-
-
-
diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java
index 6168d5dca..f71a2c64f 100644
--- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java
+++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java
@@ -15,9 +15,9 @@
*/
package io.github.jbellis.jvector.example.util;
-import io.github.jbellis.jvector.example.BenchResult;
+import io.github.jbellis.jvector.benchframe.BenchResult;
import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats;
-import org.junit.Test;
+import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.HashMap;
diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java
index 3dbf7f403..163840193 100644
--- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java
+++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java
@@ -15,7 +15,7 @@
*/
package io.github.jbellis.jvector.example.util;
-import io.github.jbellis.jvector.example.BenchResult;
+import io.github.jbellis.jvector.benchframe.BenchResult;
import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats;
import java.util.ArrayList;
diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml
index 3fd67217e..8146196cf 100644
--- a/jvector-examples/yaml-configs/datasets.yml
+++ b/jvector-examples/yaml-configs/datasets.yml
@@ -1,3 +1,7 @@
+streamable:
+ - cohere_msmarco
+
+
neighborhood-watch-100k:
- ada002-100k
- cohere-english-v3-100k
diff --git a/jvector-native/pom.xml b/jvector-native/pom.xml
index daf84fe6a..130e19d48 100644
--- a/jvector-native/pom.xml
+++ b/jvector-native/pom.xml
@@ -49,6 +49,17 @@