diff --git a/.github/workflows/run-bench.yml b/.github/workflows/run-bench.yml index 19762aaa5..6be8866ab 100644 --- a/.github/workflows/run-bench.yml +++ b/.github/workflows/run-bench.yml @@ -140,7 +140,7 @@ jobs: env: DATASET_HASH: ${{ secrets.DATASETS_KEYPATH }} run: | - # Check if jvector-examples directory and AutoBenchYAML class exist + # Check if jvector-examples directory exists if [ ! -d "jvector-examples" ]; then echo "Warning: jvector-examples directory not found in branch ${{ matrix.branch }}. Skipping benchmark." exit 0 @@ -201,12 +201,12 @@ jobs: java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \ ${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \ -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heap_dump/ -Xmx${HALF_MEM_GB}g \ - -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results ${CONFIG_ARG} dpr-1M + -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.benchframe.BenchFrameCLI autobenchyaml --output ${SAFE_BRANCH}-bench-results dpr-1M else java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \ ${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \ -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heap_dump/ -Xmx${HALF_MEM_GB}g \ - -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results ${CONFIG_ARG}${BENCH_SUFFIX:+ }${BENCH_ARG} + -cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.benchframe.BenchFrameCLI autobenchyaml --output ${SAFE_BRANCH}-bench-results${BENCH_SUFFIX:+ }${BENCH_ARG} fi # Move the results to the benchmark_results directory diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 498afa031..c8a1fc452 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -59,13 +59,13 @@ jobs: - name: Test Panama Support (JDK ${{ matrix.jdk }}) if: matrix.jdk == '20' run: >- - mvn -B -Pjdk20 -pl jvector-tests test -am test + mvn -X -B -Pjdk20 -pl jvector-tests test -am test -DTest_RequireSpecificVectorizationProvider=PanamaVectorizationProvider - name: Verify native-access vector support (JDK ${{ matrix.jdk }}) if: matrix.jdk == '24' run: >- - mvn -B -Punix-amd64-profile -pl jvector-tests -am test + mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test -DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider -Dsurefire.failIfNoSpecifiedTests=false -Dtest=TestVectorizationProvider @@ -73,7 +73,7 @@ jobs: - name: Compile, run tests and package (JDK ${{ matrix.jdk }}) if: matrix.jdk == '24' run: >- - mvn -B -Punix-amd64-profile -pl jvector-tests -am test + mvn -X -B -Punix-amd64-profile -pl jvector-tests -am test -DTest_RequireSpecificVectorizationProvider=NativeVectorizationProvider - name: Test Summary for (ISA:${{ matrix.isa}},JDK${{ matrix.jdk }}) diff --git a/.gitignore b/.gitignore index 9fc38bae4..b651cd511 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,16 @@ hdf5/ ### aider .aider* +### claude +.claude/** + +### junie +.junie/** + # JMH generated files dependency-reduced-pom.xml results.csv + +# Local testing files +local/** + diff --git a/benchmarks-jmh/pom.xml 
b/benchmarks-jmh/pom.xml index c82ee2707..78654edcc 100644 --- a/benchmarks-jmh/pom.xml +++ b/benchmarks-jmh/pom.xml @@ -94,6 +94,21 @@ + + org.apache.maven.plugins + maven-javadoc-plugin + + + --add-modules=jdk.incubator.vector + + 22 + false + true + + io.github.jbellis:* + + + \ No newline at end of file diff --git a/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java b/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java index 59342e41a..f0e07c623 100644 --- a/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java +++ b/benchmarks-jmh/src/main/java/io/github/jbellis/jvector/bench/PQDistanceCalculationBenchmark.java @@ -23,15 +23,13 @@ import io.github.jbellis.jvector.quantization.PQVectors; import io.github.jbellis.jvector.quantization.ProductQuantization; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; -import io.github.jbellis.jvector.vector.VectorUtil; import io.github.jbellis.jvector.vector.VectorizationProvider; -import io.github.jbellis.jvector.vector.types.ByteSequence; import io.github.jbellis.jvector.vector.types.VectorFloat; import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.openjdk.jmh.annotations.*; import org.openjdk.jmh.infra.Blackhole; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; import java.util.ArrayList; @@ -49,7 +47,7 @@ @Measurement(iterations = 3) @Threads(1) public class PQDistanceCalculationBenchmark { - private static final Logger log = LoggerFactory.getLogger(PQDistanceCalculationBenchmark.class); + private static final Logger log = LogManager.getLogger(PQDistanceCalculationBenchmark.class); private static final VectorTypeSupport VECTOR_TYPE_SUPPORT = VectorizationProvider.getInstance().getVectorTypeSupport(); private final VectorSimilarityFunction vsf = VectorSimilarityFunction.EUCLIDEAN; diff --git a/jvector-examples/pom.xml b/jvector-examples/pom.xml index bace97046..7c6fee70e 100644 --- a/jvector-examples/pom.xml +++ b/jvector-examples/pom.xml @@ -13,7 +13,9 @@ JVector Examples 2.21.10 + 0.1.10 + @@ -27,27 +29,45 @@ org.apache.maven.plugins maven-assembly-plugin - 3.6.0 - - - jar-with-dependencies - - - - io.github.jbellis.jvector.example.AutoBenchYAML - - - + 3.7.1 - make-assembly package single + + + + true + io.github.jbellis.jvector.benchframe.BenchFrameCLI + + + + + jar-with-dependencies + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + --add-modules=jdk.incubator.vector + + 22 + false + true + + io.github.jbellis:* + + + + @@ -72,17 +92,6 @@ - - - org.slf4j - slf4j-api - 2.0.9 - - - ch.qos.logback - logback-classic - 1.4.11 - software.amazon.awssdk s3-transfer-manager @@ -115,38 +124,40 @@ - com.kohlschutter.junixsocket - junixsocket-core - 2.8.1 - pom + io.nosqlbench + datatools-vectordata + ${datatools.version} - com.fasterxml.jackson.core - jackson-databind - 2.17.1 + io.nosqlbench + datatools-nbvectors + ${datatools.version} - junit - junit - 4.13.1 - test + info.picocli + picocli + 4.7.6 + - org.testng - testng - 7.3.0 - test + com.kohlschutter.junixsocket + junixsocket-core + 2.8.1 + pom org.junit.jupiter junit-jupiter-api - 5.11.4 test jdk11 + + 11 + 11 + @@ -273,6 +284,9 @@ true + + 22 + io.github.jbellis diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java new file mode 100644 index 000000000..693816acd --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrame.java @@ -0,0 +1,601 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import io.github.jbellis.jvector.example.*; +import io.github.jbellis.jvector.example.util.DataSet; +import io.github.jbellis.jvector.example.util.DataSetSource; +import io.github.jbellis.jvector.example.yaml.DatasetCollection; +import io.github.jbellis.jvector.example.yaml.MultiConfig; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import picocli.CommandLine; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Unified benchmark framework that consolidates functionality from Bench, BenchYAML, and AutoBenchYAML. + * Provides a modular, composable architecture using the strategy pattern for different components including + * configuration sources, result handlers, and checkpointing strategies. + *

+ * <p>
+ * This class serves as the main orchestrator for JVector graph index benchmarks, supporting multiple execution
+ * modes from simple hardcoded configurations to complex CI/CD scenarios with checkpointing and automated result
+ * collection.
+ *
+ * <h2>Environment Variables</h2>
+ * <ul>
+ *   <li>{@code VECTORDATA_CATALOGS} - Comma-separated list of additional catalog YAML files to load
+ *       (e.g., "~/.config/custom1/catalogs.yaml,~/.config/custom2/catalogs.yaml")</li>
+ * </ul>
+ *
+ * <h2>Usage Examples</h2>
+ *
+ * <h3>Command-Line Interface (Recommended)</h3>
+ * <pre>{@code
+ * // Run with hardcoded parameters (Bench-style)
+ * BenchFrame.main(new String[]{"bench", "dataset-name"});
+ *
+ * // Run with YAML configuration (BenchYAML-style)
+ * BenchFrame.main(new String[]{"benchyaml", "dataset-name"});
+ *
+ * // Run in CI/CD mode with checkpointing (AutoBenchYAML-style)
+ * BenchFrame.main(new String[]{"autobenchyaml", "-o", "output", "dataset-name"});
+ *
+ * // List available datasets
+ * BenchFrame.main(new String[]{"datasets"});
+ *
+ * // Access nbvectors CLI
+ * BenchFrame.main(new String[]{"nbvectors", "--help"});
+ * }</pre>
+ *
+ * <h3>Programmatic Usage - Convenience Methods</h3>
+ * <pre>{@code
+ * // Use hardcoded defaults
+ * BenchFrame.likeBench().execute(args);
+ *
+ * // Use YAML configuration
+ * BenchFrame.likeBenchYAML().execute(args);
+ *
+ * // Use CI/CD mode with checkpointing
+ * BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(args);
+ * }</pre>
+ *
+ * <h3>Advanced - Custom Configuration with Builder</h3>
+ * <pre>{@code
+ * // Use a single config for all datasets
+ * new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("my-dataset", "another-dataset"))
+ *     .withConfig(BenchFrameConfig.createBenchDefaults())
+ *     .withDataSetSource(DataSetSource.DEFAULT)
+ *     .withResultHandler(ResultHandler.consoleOnly())
+ *     .build()
+ *     .execute(args);
+ *
+ * // Or use a function for per-dataset config (like YAML)
+ * new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("dataset1", "dataset2"))
+ *     .withConfigFunction(name -> loadYamlConfig(name))
+ *     .build()
+ *     .execute(args);
+ * }</pre>
+ *
+ * <h3>For Synthetic 2D Datasets (Programmatic Only)</h3>
+ * <pre>{@code
+ * // Create and benchmark a 2D grid programmatically
+ * var grid2d = DataSetCreator.create2DGrid(4_000_000, 10_000, 100);
+ * BenchFrame.likeBench().execute(grid2d);
+ * }</pre>
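+ *
+ * <h3>Writing Results to Files (Illustrative Sketch)</h3>
+ * A minimal sketch, not taken from the original examples, combining the builder with the file-based
+ * result handler and checkpointing; the dataset name and output path are placeholders.
+ * <pre>{@code
+ * new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("dpr-1M"))                                  // placeholder dataset
+ *     .withConfig(BenchFrameConfig.createBenchDefaults())
+ *     .withResultHandler(ResultHandler.toFiles("results/benchmark"))        // writes CSV summary + JSON details
+ *     .withCheckpointStrategy(CheckpointStrategy.fileBasedCheckpointing("results/benchmark"))
+ *     .collectResults(true)                                                 // required for file output
+ *     .build()
+ *     .execute(new String[]{"dpr-1M"});
+ * }</pre>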
+ * + * @see BenchFrameConfig + * @see ResultHandler + * @see CheckpointStrategy + * @see BenchFrameCLI + */ +public class BenchFrame { + private static final Logger logger = LoggerFactory.getLogger(BenchFrame.class); + + private final List datasetNames; + private final BenchFrameConfig config; + private final Function configFunction; + private final DataSetSource dataSetSource; + private final ResultHandler resultHandler; + private final CheckpointStrategy checkpointStrategy; + private final boolean collectResults; + private final int diagnosticLevel; + + private BenchFrame(Builder builder) { + this.datasetNames = builder.datasetNames; + this.config = builder.config; + this.configFunction = builder.configFunction; + this.dataSetSource = builder.dataSetSource; + this.resultHandler = builder.resultHandler; + this.checkpointStrategy = builder.checkpointStrategy; + this.collectResults = builder.collectResults; + this.diagnosticLevel = builder.diagnosticLevel; + } + + /** + * Executes the benchmark with a pre-created dataset. This method is primarily used by the Bench2D workflow + * for synthetic 2D datasets but can be used programmatically with any {@link DataSet} instance. + *

+ * The execution includes: + *

    + *
+ * <ul>
+ *   <li>Setting diagnostic level if configured</li>
+ *   <li>Loading configuration for the dataset name</li>
+ *   <li>Running the benchmark grid with configured parameters</li>
+ *   <li>Handling results through the configured {@link ResultHandler}</li>
+ * </ul>
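+ * <p>
+ * For example, mirroring the synthetic 2D usage shown in the class documentation:
+ * <pre>{@code
+ * var grid2d = DataSetCreator.create2DGrid(4_000_000, 10_000, 100);
+ * BenchFrame.likeBench().execute(grid2d);
+ * }</pre>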
+ * + * @param dataset the pre-created dataset to benchmark + * @throws IOException if benchmark execution fails or result writing encounters I/O errors + * @throws RuntimeException if the dataset configuration cannot be loaded + */ + public void execute(DataSet dataset) throws IOException { + if (diagnosticLevel > 0) { + Grid.setDiagnosticLevel(diagnosticLevel); + } + + logger.info("Executing benchmark for pre-created dataset: {}", dataset.getName()); + + try { + BenchFrameConfig datasetConfig = getConfigForDataset(dataset.getName()); + List results = executeBenchmark(dataset, datasetConfig); + + resultHandler.handleResults(results); + logger.info("Benchmark execution complete"); + } catch (Exception e) { + logger.error("Failed to process dataset: {}", dataset.getName(), e); + throw new RuntimeException("Benchmark failed for dataset: " + dataset.getName(), e); + } + } + + /** + * Executes the benchmark with the given command-line arguments. This is the primary entry point for + * benchmarking one or more datasets by name pattern. + *

+ * The execution flow includes: + *

    + *
+ * <ol>
+ *   <li>Setting diagnostic level if configured</li>
+ *   <li>Building a regex pattern from the provided arguments</li>
+ *   <li>Filtering datasets by the pattern</li>
+ *   <li>Loading previous results from checkpoint if checkpoint strategy is enabled</li>
+ *   <li>For each matched dataset:
+ *     <ul>
+ *       <li>Checking if dataset should be skipped (already completed in checkpoint)</li>
+ *       <li>Loading the dataset from the configured {@link DataSetSource}</li>
+ *       <li>Loading configuration (either shared config or per-dataset function)</li>
+ *       <li>Executing the benchmark</li>
+ *       <li>Recording completion in checkpoint if enabled</li>
+ *     </ul>
+ *   </li>
+ *   <li>Handling all results through the configured {@link ResultHandler}</li>
+ * </ol>
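+ * <p>
+ * For example (a sketch; the dataset patterns are placeholders drawn from the CI/CD dataset list):
+ * <pre>{@code
+ * // Benchmarks every configured dataset whose name matches "dpr-1M" or "cap-.*"
+ * BenchFrame.likeBenchYAML().execute(new String[]{"dpr-1M", "cap-.*"});
+ * }</pre>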
+ * + * @param args command-line arguments, typically dataset name patterns. Multiple patterns are OR'd together. + * If empty, matches all datasets. Patterns support standard Java regex syntax. + * @throws IOException if dataset loading, benchmark execution, or result writing encounters I/O errors + * @throws RuntimeException if a dataset cannot be loaded or configuration cannot be retrieved + */ + public void execute(String[] args) throws IOException { + if (diagnosticLevel > 0) { + Grid.setDiagnosticLevel(diagnosticLevel); + } + + Pattern pattern = buildPattern(args); + List matchedDatasets = filterDatasets(datasetNames, pattern); + + if (matchedDatasets.isEmpty()) { + logger.warn("No datasets matched pattern: {}", pattern.pattern()); + return; + } + + logger.info("Executing benchmarks for datasets: {}", matchedDatasets); + + List allResults = new ArrayList<>(checkpointStrategy.getPreviousResults()); + + for (String datasetName : matchedDatasets) { + if (checkpointStrategy.shouldSkipDataset(datasetName)) { + logger.info("Skipping already completed dataset: {}", datasetName); + continue; + } + + logger.info("Loading dataset: {}", datasetName); + try { + DataSet dataset = dataSetSource.apply(datasetName) + .orElseThrow(() -> new RuntimeException("Unknown dataset: " + datasetName)); + + BenchFrameConfig datasetConfig = getConfigForDataset(datasetName); + List datasetResults = executeBenchmark(dataset, datasetConfig); + + allResults.addAll(datasetResults); + checkpointStrategy.recordCompletion(datasetName, datasetResults); + + logger.info("Completed benchmark for dataset: {}", datasetName); + } catch (Exception e) { + logger.error("Failed to process dataset: {}", datasetName, e); + throw new RuntimeException("Benchmark failed for dataset: " + datasetName, e); + } + } + + resultHandler.handleResults(allResults); + logger.info("Benchmark execution complete"); + } + + /** + * Gets the configuration for a specific dataset. Uses configFunction if provided (for per-dataset config), + * otherwise uses the single shared config. + * + * @param datasetName the dataset name + * @return configuration for the dataset + */ + private BenchFrameConfig getConfigForDataset(String datasetName) { + if (configFunction != null) { + return configFunction.apply(datasetName); + } else { + // Use shared config, but set the dataset name + return config.toBuilder() + .withDatasetName(datasetName) + .build(); + } + } + + /** + * Executes the benchmark for a single dataset with the provided configuration. This method delegates + * to {@link Grid} for the actual benchmark execution. 
+ * + * @param dataset the dataset to benchmark + * @param config the configuration specifying grid parameters and benchmark settings + * @return list of {@link BenchResult} objects if result collection is enabled, empty list otherwise + * @throws IOException if benchmark execution encounters I/O errors + */ + private List executeBenchmark(DataSet dataset, BenchFrameConfig config) throws IOException { + if (collectResults) { + return Grid.runAllAndCollectResults( + dataset, + config.getMGrid(), + config.getEfConstructionGrid(), + config.getNeighborOverflowGrid(), + config.getAddHierarchyGrid(), + config.getFeatureSets(), + config.getBuildCompressors(), + config.getSearchCompressors(), + config.getTopKOverqueryGrid(), + config.getUsePruningGrid() + ); + } else { + Grid.runAll( + dataset, + config.getMGrid(), + config.getEfConstructionGrid(), + config.getNeighborOverflowGrid(), + config.getAddHierarchyGrid(), + config.getRefineFinalGraphGrid(), + config.getFeatureSets(), + config.getBuildCompressors(), + config.getSearchCompressors(), + config.getTopKOverqueryGrid(), + config.getUsePruningGrid(), + config.getBenchmarkSpec() + ); + return List.of(); + } + } + + /** + * Builds a regex pattern from command-line arguments. Multiple patterns are OR'd together. + * Arguments can contain space-separated patterns that are split and combined. + *

+ * Examples: + *

    + *
+ * <ul>
+ *   <li>Empty args: matches everything (".*")</li>
+ *   <li>{"dataset1"}: matches "dataset1"</li>
+ *   <li>{"dataset1", "dataset2"}: matches "dataset1" OR "dataset2"</li>
+ *   <li>{"dataset1 dataset2"}: matches "dataset1" OR "dataset2" (space-split)</li>
+ * </ul>
+ * + * @param args command-line arguments containing dataset name patterns + * @return compiled regex pattern for dataset filtering + */ + private static Pattern buildPattern(String[] args) { + var regex = args.length == 0 ? ".*" + : Arrays.stream(args) + .flatMap(s -> Arrays.stream(s.split("\\s"))) + .map(s -> "(?:" + s + ")") + .collect(Collectors.joining("|")); + return Pattern.compile(regex); + } + + /** + * Filters dataset names by regex pattern using partial matching (find, not full match). + * + * @param datasets the list of dataset names to filter + * @param pattern the regex pattern to match against + * @return list of dataset names where the pattern was found + */ + private static List filterDatasets(List datasets, Pattern pattern) { + return datasets.stream() + .filter(name -> pattern.matcher(name).find()) + .collect(Collectors.toList()); + } + + /** + * Creates a BenchFrame configured like the original Bench.java with hardcoded grid parameters. + * This factory method provides compatibility with the legacy Bench class behavior. + *

+ * Configuration includes: + *

    + *
+ * <ul>
+ *   <li>Datasets loaded from {@link DatasetCollection}</li>
+ *   <li>Hardcoded default grid parameters (M=32, efConstruction=100, etc.)</li>
+ *   <li>Console-only output (no file writing)</li>
+ *   <li>No checkpointing</li>
+ * </ul>
+ * + * @return a BenchFrame instance configured with hardcoded defaults + * @throws UncheckedIOException if the dataset collection cannot be loaded + */ + public static BenchFrame likeBench() { + try { + return new Builder() + .withDatasetNames(DatasetCollection.load().getAll()) + .withConfig(BenchFrameConfig.createBenchDefaults()) + .withDataSetSource(DataSetSource.DEFAULT) + .withResultHandler(ResultHandler.consoleOnly()) + .build(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to load dataset collection", e); + } + } + + /** + * Creates a BenchFrame configured like the original BenchYAML.java with YAML-based configuration. + * This factory method provides compatibility with the legacy BenchYAML class behavior. + *

+ * Configuration includes: + *

    + *
+ * <ul>
+ *   <li>Datasets loaded from {@link DatasetCollection}</li>
+ *   <li>Parameters loaded from YAML configuration files per dataset</li>
+ *   <li>Console-only output (no file writing)</li>
+ *   <li>No checkpointing</li>
+ * </ul>
+ * + * @return a BenchFrame instance configured to load parameters from YAML + * @throws UncheckedIOException if the dataset collection cannot be loaded + */ + public static BenchFrame likeBenchYAML() { + try { + return new Builder() + .withDatasetNames(DatasetCollection.load().getAll()) + .withConfigFunction(datasetName -> { + try { + MultiConfig multiConfig = MultiConfig.getDefaultConfig(datasetName); + return BenchFrameConfig.fromMultiConfig(multiConfig); + } catch (FileNotFoundException e) { + throw new RuntimeException("Failed to load YAML config for dataset: " + datasetName, e); + } + }) + .withDataSetSource(DataSetSource.DEFAULT) + .withResultHandler(ResultHandler.consoleOnly()) + .build(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to load dataset collection", e); + } + } + + /** + * Creates a BenchFrame configured like the original AutoBenchYAML.java for CI/CD scenarios. + * This factory method provides compatibility with the legacy AutoBenchYAML class behavior + * with additional support for checkpointing and file-based output. + *

+ * Configuration includes: + *

    + *
+ * <ul>
+ *   <li>Hardcoded dataset list for CI/CD: cap-1M, cap-6M, cohere-english-v3-1M,
+ *       cohere-english-v3-10M, dpr-1M, dpr-10M</li>
+ *   <li>Parameters loaded from autoDefault YAML configuration</li>
+ *   <li>File-based output: CSV summary and JSON details</li>
+ *   <li>File-based checkpointing to support resumption after failures</li>
+ *   <li>Result collection enabled</li>
+ *   <li>Configurable diagnostic level</li>
+ * </ul>
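+ * <p>
+ * For example (the output path is a placeholder; an interrupted run resumes from
+ * {@code results/benchmark.checkpoint.json} if present):
+ * <pre>{@code
+ * BenchFrame.likeAutoBenchYAML("results/benchmark", 1).execute(new String[]{"dpr-1M"});
+ * }</pre>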
+ * + * @param outputPath base path for output files (.csv, .json, .checkpoint.json) + * @param diagnosticLevel diagnostic level controlling Grid output verbosity + * (0=none, 1=basic, 2=detailed, 3=verbose) + * @return a BenchFrame instance configured for CI/CD with checkpointing + * @see ResultHandler#toFiles(String) + * @see CheckpointStrategy#fileBasedCheckpointing(String) + */ + public static BenchFrame likeAutoBenchYAML(String outputPath, int diagnosticLevel) { + // Hardcoded list for CI/CD (matches original AutoBenchYAML) + List datasets = Arrays.asList( + "cap-1M", "cap-6M", + "cohere-english-v3-1M", "cohere-english-v3-10M", + "dpr-1M", "dpr-10M" + ); + + try { + MultiConfig multiConfig = MultiConfig.getDefaultConfig("autoDefault"); + BenchFrameConfig baseConfig = BenchFrameConfig.fromMultiConfig(multiConfig) + .toBuilder() + .collectResults(true) + .build(); + + return new Builder() + .withDatasetNames(datasets) + .withConfig(baseConfig) + .withDataSetSource(DataSetSource.DEFAULT) + .withResultHandler(ResultHandler.toFiles(outputPath)) + .withCheckpointStrategy(CheckpointStrategy.fileBasedCheckpointing(outputPath)) + .withDiagnosticLevel(diagnosticLevel) + .build(); + } catch (FileNotFoundException e) { + throw new RuntimeException("Failed to load autoDefault YAML config", e); + } + } + + /** + * Builder for constructing BenchFrame instances with fluent API. Provides fine-grained control + * over all aspects of benchmark configuration including datasets, configuration, + * result handling, and checkpointing. + *

+ * Default values: + *

    + *
+ * <ul>
+ *   <li>datasetNames: empty list</li>
+ *   <li>config: null (must be set via withConfig or withConfigFunction)</li>
+ *   <li>configFunction: null</li>
+ *   <li>dataSetSource: {@link DataSetSource#DEFAULT}</li>
+ *   <li>resultHandler: {@link ResultHandler#consoleOnly()}</li>
+ *   <li>checkpointStrategy: {@link CheckpointStrategy#none()}</li>
+ *   <li>collectResults: false</li>
+ *   <li>diagnosticLevel: 0</li>
+ * </ul>
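+ * <p>
+ * A minimal sketch of the smallest useful configuration (the dataset name is a placeholder);
+ * either withConfig or withConfigFunction must be supplied:
+ * <pre>{@code
+ * BenchFrame frame = new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("my-dataset"))
+ *     .withConfig(BenchFrameConfig.createBenchDefaults())
+ *     .build();
+ * }</pre>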
+ */ + public static class Builder { + private List datasetNames = List.of(); + private BenchFrameConfig config = null; + private Function configFunction = null; + private DataSetSource dataSetSource = DataSetSource.DEFAULT; + private ResultHandler resultHandler = ResultHandler.consoleOnly(); + private CheckpointStrategy checkpointStrategy = CheckpointStrategy.none(); + private boolean collectResults = false; + private int diagnosticLevel = 0; + + /** + * Sets the list of dataset names to benchmark. The provided list is copied to prevent external modification. + * + * @param datasetNames the list of dataset names to benchmark + * @return this builder for method chaining + */ + public Builder withDatasetNames(List datasetNames) { + this.datasetNames = new ArrayList<>(datasetNames); + return this; + } + + /** + * Sets a single configuration to use for all datasets. + * Mutually exclusive with {@link #withConfigFunction}. + * + * @param config the configuration to use for all datasets + * @return this builder for method chaining + */ + public Builder withConfig(BenchFrameConfig config) { + this.config = config; + this.configFunction = null; + return this; + } + + /** + * Sets a function to generate configuration per dataset (e.g., for YAML-based config). + * Mutually exclusive with {@link #withConfig}. + * + * @param configFunction function mapping dataset name to configuration + * @return this builder for method chaining + */ + public Builder withConfigFunction(Function configFunction) { + this.configFunction = configFunction; + this.config = null; + return this; + } + + /** + * Sets the DataSetSource for loading datasets by name. + * + * @param source the dataset source to use + * @return this builder for method chaining + */ + public Builder withDataSetSource(DataSetSource source) { + this.dataSetSource = source; + return this; + } + + /** + * Sets the result handler strategy for processing benchmark results. + * + * @param handler the result handler strategy to use + * @return this builder for method chaining + * @see ResultHandler#consoleOnly() + * @see ResultHandler#toFiles(String) + */ + public Builder withResultHandler(ResultHandler handler) { + this.resultHandler = handler; + return this; + } + + /** + * Sets the checkpoint strategy for tracking and resuming benchmark progress. + * + * @param strategy the checkpoint strategy to use + * @return this builder for method chaining + * @see CheckpointStrategy#none() + * @see CheckpointStrategy#fileBasedCheckpointing(String) + */ + public Builder withCheckpointStrategy(CheckpointStrategy strategy) { + this.checkpointStrategy = strategy; + return this; + } + + /** + * Enables or disables result collection. When enabled, benchmark results are collected and returned + * from the execution. This is required for file output and checkpointing functionality. + * + * @param collect true to collect results, false to discard them + * @return this builder for method chaining + */ + public Builder collectResults(boolean collect) { + this.collectResults = collect; + return this; + } + + /** + * Sets the diagnostic level for Grid execution output. + * + * @param level diagnostic level: 0=none, 1=basic, 2=detailed, 3=verbose + * @return this builder for method chaining + */ + public Builder withDiagnosticLevel(int level) { + this.diagnosticLevel = level; + return this; + } + + /** + * Builds and returns a configured BenchFrame instance. 
+ * + * @return a new BenchFrame instance with the configured settings + */ + public BenchFrame build() { + return new BenchFrame(this); + } + } + + /** + * Main entry point for command-line execution. Delegates to {@link BenchFrameCLI} for + * command-line parsing and subcommand handling. + * + * @param args command-line arguments + * @see BenchFrameCLI + */ + public static void main(String[] args) { + int exitCode = new CommandLine(new BenchFrameCLI()).execute(args); + System.exit(exitCode); + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java new file mode 100644 index 000000000..f1c620139 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameCLI.java @@ -0,0 +1,253 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import picocli.CommandLine; + +import java.io.IOException; +import java.util.concurrent.Callable; + +/** + * Command-line interface for BenchFrame using PicoCLI. Provides subcommands for all + * benchmark modes from the original benchmark classes (Bench, BenchYAML, AutoBenchYAML) + * plus dataset management via integration with nbvectors CLI. + *

+ * This CLI serves as the primary entry point for command-line benchmark execution and + * delegates to {@link BenchFrame} for actual benchmark orchestration. + * + *

+ * <h2>Available Subcommands</h2>
+ * <ul>
+ *   <li>{@code bench} - Run with hardcoded grid parameters (Bench.java style)</li>
+ *   <li>{@code benchyaml} - Run with YAML-based configuration (BenchYAML.java style)</li>
+ *   <li>{@code autobenchyaml} - Run in CI/CD mode with checkpointing (AutoBenchYAML.java style)</li>
+ *   <li>{@code datasets} - List and manage vector datasets (delegates to nbvectors)</li>
+ *   <li>{@code nbvectors} - Access full nbvectors CLI functionality</li>
+ * </ul>
+ * + *

+ * <h2>Usage Examples</h2>
+ * <pre>
+ * # Show help
+ * java -jar benchframe.jar --help
+ *
+ * # Run Bench-style on specific datasets
+ * java -jar benchframe.jar bench "dataset1|dataset2"
+ *
+ * # Run YAML-style on all datasets
+ * java -jar benchframe.jar benchyaml
+ *
+ * # Run CI/CD mode with output files
+ * java -jar benchframe.jar autobenchyaml -o results/benchmark
+ *
+ * # List available datasets
+ * java -jar benchframe.jar datasets
+ *
+ * # Access nbvectors CLI
+ * java -jar benchframe.jar nbvectors --help
+ * </pre>
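+ *
+ * <p>
+ * The same commands can be invoked programmatically (a sketch mirroring {@code main} without the
+ * {@code System.exit} call):
+ * <pre>{@code
+ * int exitCode = new CommandLine(new BenchFrameCLI())
+ *         .execute("autobenchyaml", "-o", "results/benchmark", "dpr-1M");
+ * }</pre>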
+ * + * @see BenchFrame + * @see BenchCommand + * @see BenchYAMLCommand + * @see AutoBenchYAMLCommand + * @see DatasetsCommand + * @see NBVectorsCommand + */ +@CommandLine.Command( + name = "benchframe", + mixinStandardHelpOptions = true, + version = "1.0", + description = "Unified benchmark framework for JVector graph indexes", + subcommands = { + BenchFrameCLI.BenchCommand.class, + BenchFrameCLI.BenchYAMLCommand.class, + BenchFrameCLI.AutoBenchYAMLCommand.class, + BenchFrameCLI.DatasetsCommand.class, + BenchFrameCLI.NBVectorsCommand.class + } +) +public class BenchFrameCLI implements Callable { + + /** + * Called when no subcommand is specified. Displays help information. + * + * @return exit code 0 + */ + @Override + public Integer call() { + // If no subcommand, show help + CommandLine.usage(this, System.out); + return 0; + } + + /** + * Subcommand for running Bench-style benchmarks with hardcoded grid parameters. + * Provides compatibility with the original Bench.java behavior. + *

+ * Uses fixed default parameters (M=32, efConstruction=100, etc.) and loads + * datasets from the DatasetCollection. + */ + @CommandLine.Command( + name = "bench", + description = "Run benchmarks with hardcoded grid parameters (original Bench.java style)" + ) + static class BenchCommand implements Callable { + @CommandLine.Parameters( + arity = "0..*", + description = "Dataset name patterns (regex). If not specified, matches all datasets." + ) + private String[] datasets = new String[0]; + + @Override + public Integer call() throws IOException { + System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); + BenchFrame.likeBench().execute(datasets); + return 0; + } + } + + /** + * Subcommand for running BenchYAML-style benchmarks with YAML-based configuration. + * Provides compatibility with the original BenchYAML.java behavior. + *

+ * Loads benchmark parameters from YAML files per dataset, allowing different + * configurations for different datasets. + */ + @CommandLine.Command( + name = "benchyaml", + description = "Run benchmarks with YAML-based configuration (original BenchYAML.java style)" + ) + static class BenchYAMLCommand implements Callable { + @CommandLine.Parameters( + arity = "0..*", + description = "Dataset name patterns (regex) or YAML config files. If not specified, matches all datasets." + ) + private String[] datasets = new String[0]; + + @Override + public Integer call() throws IOException { + System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); + BenchFrame.likeBenchYAML().execute(datasets); + return 0; + } + } + + /** + * Subcommand for running AutoBench-style benchmarks in CI/CD mode with checkpointing. + * Provides compatibility with the original AutoBenchYAML.java behavior. + *

+ * Features: + *

    + *
+ * <ul>
+ *   <li>File-based checkpointing for resumption after failures</li>
+ *   <li>CSV summary and JSON detail output</li>
+ *   <li>Hardcoded dataset list for consistent CI/CD runs</li>
+ *   <li>Configurable diagnostic output level</li>
+ * </ul>
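+ * <p>
+ * Example invocation (the output path is a placeholder):
+ * <pre>
+ * java -jar benchframe.jar autobenchyaml -o results/benchmark -d 1 dpr-1M
+ * </pre>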
+ */ + @CommandLine.Command( + name = "autobenchyaml", + description = "Run benchmarks for CI/CD with checkpointing and file output (original AutoBenchYAML.java style)" + ) + static class AutoBenchYAMLCommand implements Callable { + @CommandLine.Parameters( + arity = "0..*", + description = "Dataset name patterns (regex). If not specified, matches all datasets." + ) + private String[] datasets = new String[0]; + + @CommandLine.Option( + names = {"-o", "--output"}, + required = true, + description = "Base path for output files (.csv, .json, .checkpoint.json)" + ) + private String outputPath; + + @CommandLine.Option( + names = {"-d", "--diag"}, + description = "Diagnostic level: 0=none, 1=basic, 2=detailed, 3=verbose (default: ${DEFAULT-VALUE})", + defaultValue = "0" + ) + private int diagnosticLevel; + + @Override + public Integer call() throws IOException { + System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); + BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(datasets); + return 0; + } + } + + /** + * Subcommand that delegates to the datatools-nbvectors datasets command. + * Provides access to dataset listing and management functionality. + */ + @CommandLine.Command( + name = "datasets", + description = "List and manage vector datasets (delegates to nbvectors datasets command)" + ) + static class DatasetsCommand implements Callable { + @CommandLine.Parameters( + arity = "0..*", + description = "Arguments to pass to the nbvectors datasets command" + ) + private String[] args = new String[0]; + + @Override + public Integer call() throws Exception { + // Delegate to CommandBundler with datasets subcommand + String[] nbvectorArgs = new String[args.length + 1]; + nbvectorArgs[0] = "datasets"; + System.arraycopy(args, 0, nbvectorArgs, 1, args.length); + + io.nosqlbench.commands.CommandBundler.main(nbvectorArgs); + return 0; + } + } + + /** + * Subcommand that delegates to the datatools-nbvectors main CLI. + * Provides access to the full nbvectors command-line functionality. + */ + @CommandLine.Command( + name = "nbvectors", + description = "Access full nbvectors CLI functionality (delegates to CommandBundler)" + ) + static class NBVectorsCommand implements Callable { + @CommandLine.Parameters( + arity = "0..*", + description = "Arguments to pass to the nbvectors CLI" + ) + private String[] args = new String[0]; + + @Override + public Integer call() throws Exception { + // Delegate to CommandBundler + io.nosqlbench.commands.CommandBundler.main(args); + return 0; + } + } + + /** + * Main entry point for command-line execution. + * + * @param args command-line arguments + */ + public static void main(String[] args) { + int exitCode = new CommandLine(new BenchFrameCLI()).execute(args); + System.exit(exitCode); + } + +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java new file mode 100644 index 000000000..edfc556d6 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchFrameConfig.java @@ -0,0 +1,490 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import io.github.jbellis.jvector.example.util.CompressorParameters; +import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; +import io.github.jbellis.jvector.example.util.DataSet; +import io.github.jbellis.jvector.example.yaml.MultiConfig; +import io.github.jbellis.jvector.graph.disk.feature.FeatureId; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; + +import java.util.*; +import java.util.function.Function; + +import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; + +/** + * Typesafe configuration class for benchmark execution. Provides a unified, immutable configuration + * model that encapsulates all parameters needed to run a benchmark including graph construction + * parameters, search parameters, and feature sets. + *

+ * This class can be used programmatically through its {@link Builder} or constructed from + * YAML-based {@link MultiConfig} using {@link #fromMultiConfig(MultiConfig)}. + *

+ * All collections returned by getter methods are unmodifiable to maintain immutability. + * + *

+ * <h2>Usage Examples</h2>
+ * <pre>{@code
+ * // Create from YAML MultiConfig
+ * MultiConfig yaml = MultiConfig.getDefaultConfig("dataset-name");
+ * BenchFrameConfig config = BenchFrameConfig.fromMultiConfig(yaml);
+ *
+ * // Create with Builder
+ * BenchFrameConfig config = new BenchFrameConfig.Builder()
+ *     .withDatasetName("my-dataset")
+ *     .withMGrid(List.of(16, 32, 64))
+ *     .withEfConstructionGrid(List.of(100, 200))
+ *     .build();
+ *
+ * // Use default Bench-style configuration
+ * BenchFrameConfig defaults = BenchFrameConfig.createBenchDefaults();
+ * }</pre>
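+ *
+ * <p>
+ * Deriving a modified copy of an existing configuration, a sketch using {@code toBuilder()}:
+ * <pre>{@code
+ * BenchFrameConfig tuned = BenchFrameConfig.createBenchDefaults().toBuilder()
+ *     .withMGrid(List.of(16, 32))
+ *     .withUsePruningGrid(List.of(true, false))
+ *     .build();
+ * }</pre>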
+ * + * @see MultiConfig + * @see BenchFrame + */ +public class BenchFrameConfig { + // Dataset identification + private final String datasetName; + + // Graph construction parameters + private final List mGrid; + private final List efConstructionGrid; + private final List neighborOverflowGrid; + private final List addHierarchyGrid; + private final List refineFinalGraphGrid; + private final List> featureSets; + private final List> buildCompressors; + + // Search parameters + private final List> searchCompressors; + private final Map> topKOverqueryGrid; + private final List usePruningGrid; + + // Benchmark selection + private final Map> benchmarkSpec; + + // Result collection mode + private final boolean collectResults; + + private BenchFrameConfig(Builder builder) { + this.datasetName = builder.datasetName; + this.mGrid = Collections.unmodifiableList(builder.mGrid); + this.efConstructionGrid = Collections.unmodifiableList(builder.efConstructionGrid); + this.neighborOverflowGrid = Collections.unmodifiableList(builder.neighborOverflowGrid); + this.addHierarchyGrid = Collections.unmodifiableList(builder.addHierarchyGrid); + this.refineFinalGraphGrid = Collections.unmodifiableList(builder.refineFinalGraphGrid); + this.featureSets = Collections.unmodifiableList(builder.featureSets); + this.buildCompressors = Collections.unmodifiableList(builder.buildCompressors); + this.searchCompressors = Collections.unmodifiableList(builder.searchCompressors); + this.topKOverqueryGrid = Collections.unmodifiableMap(builder.topKOverqueryGrid); + this.usePruningGrid = Collections.unmodifiableList(builder.usePruningGrid); + this.benchmarkSpec = builder.benchmarkSpec == null ? null : Collections.unmodifiableMap(builder.benchmarkSpec); + this.collectResults = builder.collectResults; + } + + /** + * Returns the dataset name associated with this configuration. + * + * @return the dataset name, may be null if not specified + */ + public String getDatasetName() { return datasetName; } + + /** + * Returns the grid of M (max connections per node) values to test. + * + * @return unmodifiable list of M values + */ + public List getMGrid() { return mGrid; } + + /** + * Returns the grid of efConstruction values to test during graph construction. + * + * @return unmodifiable list of efConstruction values + */ + public List getEfConstructionGrid() { return efConstructionGrid; } + + /** + * Returns the grid of neighbor overflow multipliers to test. This controls how many + * candidate neighbors are considered relative to M during graph construction. + * + * @return unmodifiable list of neighbor overflow multipliers + */ + public List getNeighborOverflowGrid() { return neighborOverflowGrid; } + + /** + * Returns the grid of add hierarchy boolean values indicating whether to use hierarchical + * graph construction. + * + * @return unmodifiable list of boolean values + */ + public List getAddHierarchyGrid() { return addHierarchyGrid; } + + /** + * Returns the grid of refine final graph boolean values indicating whether to perform + * final graph refinement after construction. + * + * @return unmodifiable list of boolean values + */ + public List getRefineFinalGraphGrid() { return refineFinalGraphGrid; } + + /** + * Returns the feature sets to test. Each set contains {@link FeatureId}s that enable + * specific features like inline vectors or NVQ vectors. 
+ * + * @return unmodifiable list of feature sets + */ + public List> getFeatureSets() { return featureSets; } + + /** + * Returns the compressor functions to use during graph construction. Each function takes + * a {@link DataSet} and returns appropriate {@link CompressorParameters}. + * + * @return unmodifiable list of compressor parameter functions + */ + public List> getBuildCompressors() { return buildCompressors; } + + /** + * Returns the compressor functions to use during search. Each function takes + * a {@link DataSet} and returns appropriate {@link CompressorParameters}. + * + * @return unmodifiable list of compressor parameter functions + */ + public List> getSearchCompressors() { return searchCompressors; } + + /** + * Returns the grid of topK overquery multipliers mapped by K value. For example, + * a map entry of (10, [1.0, 2.0, 5.0]) means for top-10 queries, test overquery + * factors of 1.0x, 2.0x, and 5.0x. + * + * @return unmodifiable map of K values to overquery multipliers + */ + public Map> getTopKOverqueryGrid() { return topKOverqueryGrid; } + + /** + * Returns the grid of boolean values indicating whether to use search pruning. + * + * @return unmodifiable list of boolean values + */ + public List getUsePruningGrid() { return usePruningGrid; } + + /** + * Returns the benchmark specification mapping benchmark types to their configurations. + * A null value indicates all default benchmarks should be run. + * + * @return unmodifiable map of benchmark specifications, or null for default benchmarks + */ + public Map> getBenchmarkSpec() { return benchmarkSpec; } + + /** + * Returns whether results should be collected and returned from benchmark execution. + * + * @return true if results should be collected, false otherwise + */ + public boolean shouldCollectResults() { return collectResults; } + + /** + * Creates a new {@link Builder} initialized with this configuration's values. + * This is useful for creating modified copies of existing configurations. + * + * @return a new Builder with this configuration's values + */ + public Builder toBuilder() { + return new Builder() + .withDatasetName(datasetName) + .withMGrid(mGrid) + .withEfConstructionGrid(efConstructionGrid) + .withNeighborOverflowGrid(neighborOverflowGrid) + .withAddHierarchyGrid(addHierarchyGrid) + .withRefineFinalGraphGrid(refineFinalGraphGrid) + .withFeatureSets(featureSets) + .withBuildCompressors(buildCompressors) + .withSearchCompressors(searchCompressors) + .withTopKOverqueryGrid(topKOverqueryGrid) + .withUsePruningGrid(usePruningGrid) + .withBenchmarkSpec(benchmarkSpec) + .collectResults(collectResults); + } + + /** + * Creates a BenchFrameConfig from a YAML-based {@link MultiConfig}. This factory method + * provides compatibility with the existing YAML configuration system. 
+ * + * @param config the MultiConfig to convert + * @return a new BenchFrameConfig with values from the MultiConfig + */ + public static BenchFrameConfig fromMultiConfig(MultiConfig config) { + return new Builder() + .withDatasetName(config.dataset) + .withMGrid(config.construction.outDegree) + .withEfConstructionGrid(config.construction.efConstruction) + .withNeighborOverflowGrid(config.construction.neighborOverflow) + .withAddHierarchyGrid(config.construction.addHierarchy) + .withRefineFinalGraphGrid(config.construction.refineFinalGraph) + .withFeatureSets(config.construction.getFeatureSets()) + .withBuildCompressors(config.construction.getCompressorParameters()) + .withSearchCompressors(config.search.getCompressorParameters()) + .withTopKOverqueryGrid(config.search.topKOverquery) + .withUsePruningGrid(config.search.useSearchPruning) + .withBenchmarkSpec(config.search.benchmarks) + .build(); + } + + /** + * Creates a default configuration matching the original Bench.java's hardcoded parameters. + * This provides a baseline configuration suitable for most benchmark scenarios. + *

+ * Default values include: + *

    + *
+ * <ul>
+ *   <li>M: 32</li>
+ *   <li>efConstruction: 100</li>
+ *   <li>neighborOverflow: 1.2</li>
+ *   <li>addHierarchy: true</li>
+ *   <li>refineFinalGraph: true</li>
+ *   <li>usePruning: true</li>
+ *   <li>topK overquery: 10 -> [1.0, 2.0, 5.0, 10.0], 100 -> [1.0, 2.0]</li>
+ *   <li>Feature sets: NVQ_VECTORS and INLINE_VECTORS</li>
+ *   <li>Compressors: PQ for build, both none and PQ for search</li>
+ * </ul>
+ * + * @return a new BenchFrameConfig with default Bench.java values + */ + public static BenchFrameConfig createBenchDefaults() { + return new Builder() + .withMGrid(List.of(32)) + .withEfConstructionGrid(List.of(100)) + .withNeighborOverflowGrid(List.of(1.2f)) + .withAddHierarchyGrid(List.of(true)) + .withRefineFinalGraphGrid(List.of(true)) + .withUsePruningGrid(List.of(true)) + .withTopKOverqueryGrid(Map.of( + 10, List.of(1.0, 2.0, 5.0, 10.0), + 100, List.of(1.0, 2.0) + )) + .withFeatureSets(Arrays.asList( + EnumSet.of(FeatureId.NVQ_VECTORS), + EnumSet.of(FeatureId.INLINE_VECTORS) + )) + .withBuildCompressors(Arrays.asList( + ds -> new PQParameters(ds.getDimension() / 8, + 256, + ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN, + UNWEIGHTED), + __ -> CompressorParameters.NONE + )) + .withSearchCompressors(Arrays.asList( + __ -> CompressorParameters.NONE, + ds -> new PQParameters(ds.getDimension() / 8, + 256, + ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN, + UNWEIGHTED) + )) + .build(); + } + + /** + * Builder for fluent BenchFrameConfig construction. All builder methods return the builder + * instance for method chaining. Collections provided to builder methods are defensively + * copied to prevent external modification. + *

+ * Default values provide sensible single-value grids: + *

    + *
+ * <ul>
+ *   <li>mGrid: [32]</li>
+ *   <li>efConstructionGrid: [100]</li>
+ *   <li>neighborOverflowGrid: [1.2]</li>
+ *   <li>addHierarchyGrid: [true]</li>
+ *   <li>refineFinalGraphGrid: [true]</li>
+ *   <li>featureSets: [INLINE_VECTORS]</li>
+ *   <li>buildCompressors: [NONE]</li>
+ *   <li>searchCompressors: [NONE]</li>
+ *   <li>topKOverqueryGrid: {10: [1.0]}</li>
+ *   <li>usePruningGrid: [true]</li>
+ *   <li>benchmarkSpec: null (use default benchmarks)</li>
+ *   <li>collectResults: false</li>
+ * </ul>
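+ * <p>
+ * For example, keeping the defaults above and overriding only the search grid (a sketch; the
+ * dataset name is a placeholder):
+ * <pre>{@code
+ * BenchFrameConfig config = new BenchFrameConfig.Builder()
+ *     .withDatasetName("my-dataset")
+ *     .withTopKOverqueryGrid(Map.of(10, List.of(1.0, 2.0)))
+ *     .build();
+ * }</pre>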
+ */ + public static class Builder { + private String datasetName; + private List mGrid = List.of(32); + private List efConstructionGrid = List.of(100); + private List neighborOverflowGrid = List.of(1.2f); + private List addHierarchyGrid = List.of(true); + private List refineFinalGraphGrid = List.of(true); + private List> featureSets = List.of(EnumSet.of(FeatureId.INLINE_VECTORS)); + private List> buildCompressors = + List.of(__ -> CompressorParameters.NONE); + private List> searchCompressors = + List.of(__ -> CompressorParameters.NONE); + private Map> topKOverqueryGrid = Map.of(10, List.of(1.0)); + private List usePruningGrid = List.of(true); + private Map> benchmarkSpec = null; // null means use default benchmarks + private boolean collectResults = false; + + /** + * Sets the dataset name. + * + * @param datasetName the dataset name to associate with this configuration + * @return this builder for method chaining + */ + public Builder withDatasetName(String datasetName) { + this.datasetName = datasetName; + return this; + } + + /** + * Sets the grid of M (max connections per node) values to test. + * + * @param mGrid list of M values, defensively copied + * @return this builder for method chaining + */ + public Builder withMGrid(List mGrid) { + this.mGrid = new ArrayList<>(mGrid); + return this; + } + + /** + * Sets the grid of efConstruction values to test during graph construction. + * + * @param efConstructionGrid list of efConstruction values, defensively copied + * @return this builder for method chaining + */ + public Builder withEfConstructionGrid(List efConstructionGrid) { + this.efConstructionGrid = new ArrayList<>(efConstructionGrid); + return this; + } + + /** + * Sets the grid of neighbor overflow multipliers to test. + * + * @param neighborOverflowGrid list of overflow multipliers, defensively copied + * @return this builder for method chaining + */ + public Builder withNeighborOverflowGrid(List neighborOverflowGrid) { + this.neighborOverflowGrid = new ArrayList<>(neighborOverflowGrid); + return this; + } + + /** + * Sets the grid of add hierarchy boolean values. + * + * @param addHierarchyGrid list of boolean values, defensively copied + * @return this builder for method chaining + */ + public Builder withAddHierarchyGrid(List addHierarchyGrid) { + this.addHierarchyGrid = new ArrayList<>(addHierarchyGrid); + return this; + } + + /** + * Sets the grid of refine final graph boolean values. + * + * @param refineFinalGraphGrid list of boolean values, defensively copied + * @return this builder for method chaining + */ + public Builder withRefineFinalGraphGrid(List refineFinalGraphGrid) { + this.refineFinalGraphGrid = new ArrayList<>(refineFinalGraphGrid); + return this; + } + + /** + * Sets the feature sets to test. + * + * @param featureSets list of feature sets, defensively copied + * @return this builder for method chaining + */ + public Builder withFeatureSets(List> featureSets) { + this.featureSets = new ArrayList<>(featureSets); + return this; + } + + /** + * Sets the compressor functions to use during graph construction. + * + * @param buildCompressors list of compressor parameter functions, defensively copied + * @return this builder for method chaining + */ + public Builder withBuildCompressors(List> buildCompressors) { + this.buildCompressors = new ArrayList<>(buildCompressors); + return this; + } + + /** + * Sets the compressor functions to use during search. 
+ * + * @param searchCompressors list of compressor parameter functions, defensively copied + * @return this builder for method chaining + */ + public Builder withSearchCompressors(List> searchCompressors) { + this.searchCompressors = new ArrayList<>(searchCompressors); + return this; + } + + /** + * Sets the grid of topK overquery multipliers. + * + * @param topKOverqueryGrid map of K values to overquery multipliers, defensively copied + * @return this builder for method chaining + */ + public Builder withTopKOverqueryGrid(Map> topKOverqueryGrid) { + this.topKOverqueryGrid = new HashMap<>(topKOverqueryGrid); + return this; + } + + /** + * Sets the grid of use pruning boolean values. + * + * @param usePruningGrid list of boolean values, defensively copied + * @return this builder for method chaining + */ + public Builder withUsePruningGrid(List usePruningGrid) { + this.usePruningGrid = new ArrayList<>(usePruningGrid); + return this; + } + + /** + * Sets the benchmark specification. A null value indicates default benchmarks should be used. + * + * @param benchmarkSpec map of benchmark specifications, defensively copied if not null + * @return this builder for method chaining + */ + public Builder withBenchmarkSpec(Map> benchmarkSpec) { + this.benchmarkSpec = benchmarkSpec == null ? null : new HashMap<>(benchmarkSpec); + return this; + } + + /** + * Sets whether to collect results. + * + * @param collectResults true to collect results, false otherwise + * @return this builder for method chaining + */ + public Builder collectResults(boolean collectResults) { + this.collectResults = collectResults; + return this; + } + + /** + * Builds and returns a configured BenchFrameConfig instance with immutable collections. + * + * @return a new BenchFrameConfig with the configured values + */ + public BenchFrameConfig build() { + return new BenchFrameConfig(this); + } + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java new file mode 100644 index 000000000..6abc0352f --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/BenchResult.java @@ -0,0 +1,84 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.github.jbellis.jvector.benchframe; + +import java.util.Map; + +/** + * Result model for a single benchmark execution. Encapsulates the dataset identifier, + * configuration parameters, and performance metrics from a benchmark run. + *

+ * This class is designed for serialization to JSON and CSV formats through {@link ResultHandler} + * implementations. All fields are public for compatibility with Jackson and other serialization + * libraries. + *

+ * Typical parameter keys include: + *

    + *
+ * <ul>
+ *   <li>{@code M} - max connections per node</li>
+ *   <li>{@code efConstruction} - construction-time search depth</li>
+ *   <li>{@code buildCompressor} - compression used during construction</li>
+ *   <li>{@code searchCompressor} - compression used during search</li>
+ *   <li>{@code featureSet} - enabled feature flags</li>
+ * </ul>
+ *

+ * Typical metric keys include: + *

    + *
+ * <ul>
+ *   <li>{@code recall} - search accuracy (0.0 to 1.0)</li>
+ *   <li>{@code qps} - queries per second</li>
+ *   <li>{@code latency} - average query latency in milliseconds</li>
+ *   <li>{@code buildTimeMs} - index construction time in milliseconds</li>
+ *   <li>{@code indexSizeBytes} - on-disk index size in bytes</li>
+ * </ul>
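+ * <p>
+ * Constructing a result by hand (illustrative values only):
+ * <pre>{@code
+ * BenchResult result = new BenchResult(
+ *         "dpr-1M",
+ *         Map.of("M", 32, "efConstruction", 100),
+ *         Map.of("recall", 0.95, "qps", 12000.0));
+ * }</pre>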
+ * + * @see ResultHandler + * @see BenchFrame + */ +public class BenchResult { + /** + * The name of the dataset this result is for. + */ + public String dataset; + + /** + * Map of configuration parameters used for this benchmark run. + * Keys are parameter names, values are parameter values (typically String, Integer, Boolean, etc.). + */ + public Map parameters; + + /** + * Map of performance metrics measured during this benchmark run. + * Keys are metric names, values are metric values (typically Double, Long, Integer, etc.). + */ + public Map metrics; + + /** + * Default constructor for deserialization. + */ + public BenchResult() {} + + /** + * Constructs a BenchResult with the specified dataset, parameters, and metrics. + * + * @param dataset the dataset name + * @param parameters map of configuration parameters + * @param metrics map of performance metrics + */ + public BenchResult(String dataset, Map parameters, Map metrics) { + this.dataset = dataset; + this.parameters = parameters; + this.metrics = metrics; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java new file mode 100644 index 000000000..f99f722da --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/CheckpointStrategy.java @@ -0,0 +1,177 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import io.github.jbellis.jvector.example.util.CheckpointManager; + +import java.util.Collections; +import java.util.List; + +/** + * Strategy interface for managing benchmark checkpointing. Implements the Strategy pattern + * to enable resumable benchmark execution after failures or interruptions. + *

+ * Checkpointing is particularly valuable for long-running benchmarks in CI/CD environments
+ * where resource limits or transient failures may interrupt execution. By tracking which
+ * datasets have been completed, benchmarks can resume from where they left off.
+ * <p>
+ * Two implementations are provided:
+ * <ul>
+ *   <li>{@link NoCheckpointing} - no-op implementation for simple scenarios</li>
+ *   <li>{@link FileCheckpointing} - persistent file-based checkpointing using JSON</li>
+ * </ul>
+ *
+ * <h2>Usage Example</h2>
+ * <pre>{@code
+ * // No checkpointing (default)
+ * CheckpointStrategy strategy = CheckpointStrategy.none();
+ *
+ * // File-based checkpointing
+ * CheckpointStrategy strategy = CheckpointStrategy.fileBasedCheckpointing("results/checkpoint");
+ *
+ * // Custom implementation
+ * CheckpointStrategy strategy = new CheckpointStrategy() {
+ *     @Override
+ *     public boolean shouldSkipDataset(String datasetName) {
+ *         // Check database or cache
+ *         return completedDatasets.contains(datasetName);
+ *     }
+ *
+ *     @Override
+ *     public void recordCompletion(String datasetName, List<BenchResult> results) {
+ *         // Update database or cache
+ *         completedDatasets.add(datasetName);
+ *     }
+ *
+ *     @Override
+ *     public List<BenchResult> getPreviousResults() {
+ *         // Load from database or cache
+ *         return loadPreviousResults();
+ *     }
+ * };
+ * }</pre>
+ * + * @see BenchFrame.Builder#withCheckpointStrategy(CheckpointStrategy) + * @see BenchResult + */ +public interface CheckpointStrategy { + /** + * Checks if a dataset should be skipped because it has already been completed. + * This is called before attempting to benchmark each dataset. + * + * @param datasetName the name of the dataset to check + * @return true if the dataset has already been completed and should be skipped, false otherwise + */ + boolean shouldSkipDataset(String datasetName); + + /** + * Records the completion of a dataset with its results. This is called after successfully + * benchmarking a dataset. Implementations should persist this information to enable resumption. + * + * @param datasetName the name of the completed dataset + * @param results the benchmark results for this dataset + */ + void recordCompletion(String datasetName, List results); + + /** + * Retrieves any previously completed results from earlier runs. These results are included + * in the final output to provide a complete view across multiple executions. + * + * @return list of results from previous runs, or empty list if none exist + */ + List getPreviousResults(); + + /** + * Creates a no-op checkpoint strategy that does not track or resume progress. + * This is the default for simple benchmark scenarios. + * + * @return a checkpoint strategy that performs no checkpointing + */ + static CheckpointStrategy none() { + return new NoCheckpointing(); + } + + /** + * Creates a file-based checkpoint strategy that persists progress to JSON files. + * Creates files at {@code outputPath.checkpoint.json} containing completed dataset + * names and their results. + * + * @param outputPath base path for checkpoint file (e.g., "results/benchmark") + * @return a checkpoint strategy using file-based persistence + * @see FileCheckpointing + */ + static CheckpointStrategy fileBasedCheckpointing(String outputPath) { + return new FileCheckpointing(outputPath); + } + + /** + * No-op implementation that performs no checkpointing. All datasets are processed + * on every run without tracking completion state. + */ + class NoCheckpointing implements CheckpointStrategy { + @Override + public boolean shouldSkipDataset(String datasetName) { + return false; + } + + @Override + public void recordCompletion(String datasetName, List results) { + // Do nothing + } + + @Override + public List getPreviousResults() { + return Collections.emptyList(); + } + } + + /** + * File-based implementation that uses {@link CheckpointManager} for persistent checkpointing. + * Stores checkpoint state in a JSON file at {@code outputPath.checkpoint.json}. + *

+ * The checkpoint file contains:
+ * <ul>
+ *   <li>List of completed dataset names</li>
+ *   <li>All benchmark results from completed datasets</li>
+ *   <li>Timestamp of last update</li>
+ * </ul>
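+ * <p>
+ * A minimal usage sketch (the path is illustrative and {@code runDataset} is a hypothetical
+ * helper standing in for actual benchmark execution):
+ * <pre>{@code
+ * CheckpointStrategy checkpoint = CheckpointStrategy.fileBasedCheckpointing("results/benchmark");
+ * if (!checkpoint.shouldSkipDataset("dpr-1M")) {
+ *     List<BenchResult> datasetResults = runDataset("dpr-1M");
+ *     checkpoint.recordCompletion("dpr-1M", datasetResults);
+ * }
+ * }</pre>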

+ * On initialization, loads any existing checkpoint file to resume from previous runs. + */ + class FileCheckpointing implements CheckpointStrategy { + private final CheckpointManager manager; + + public FileCheckpointing(String outputPath) { + this.manager = new CheckpointManager(outputPath); + } + + @Override + public boolean shouldSkipDataset(String datasetName) { + return manager.isDatasetCompleted(datasetName); + } + + @Override + public void recordCompletion(String datasetName, List results) { + manager.markDatasetCompleted(datasetName, results); + } + + @Override + public List getPreviousResults() { + return manager.getCompletedResults(); + } + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java new file mode 100644 index 000000000..050681763 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/ResultHandler.java @@ -0,0 +1,201 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import com.fasterxml.jackson.databind.ObjectMapper; +import io.github.jbellis.jvector.example.util.BenchmarkSummarizer; +import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +/** + * Strategy interface for handling benchmark results after execution completes. + * Implements the Strategy pattern to decouple result handling from benchmark execution. + *

+ * This functional interface supports various output modes including:
+ * <ul>
+ *   <li>Console-only output (Grid handles printing)</li>
+ *   <li>File-based output (CSV summary and JSON details)</li>
+ *   <li>Combined output to multiple destinations</li>
+ *   <li>Custom implementations for specialized scenarios</li>
+ * </ul>
+ *
+ * <h2>Usage Examples</h2>
+ * <pre>{@code
+ * // Console only (default)
+ * ResultHandler handler = ResultHandler.consoleOnly();
+ *
+ * // Write to files
+ * ResultHandler handler = ResultHandler.toFiles("results/benchmark");
+ *
+ * // Combine multiple handlers
+ * ResultHandler handler = ResultHandler.combining(
+ *     ResultHandler.consoleOnly(),
+ *     ResultHandler.toFiles("results/benchmark")
+ * );
+ *
+ * // Custom implementation
+ * ResultHandler handler = results -> {
+ *     // Send to monitoring system
+ *     monitoringService.recordBenchmarks(results);
+ *     // Upload to cloud storage
+ *     cloudStorage.upload("benchmarks", results);
+ * };
+ * }</pre>
+ * + * @see BenchResult + * @see BenchFrame.Builder#withResultHandler(ResultHandler) + */ +@FunctionalInterface +public interface ResultHandler { + /** + * Handles the benchmark results after execution completes. Implementations may write + * to files, send to external systems, or perform other processing. + * + * @param results list of benchmark results to handle + * @throws IOException if output or I/O operations fail + */ + void handleResults(List results) throws IOException; + + /** + * Creates a no-op result handler that does nothing with results. Console output + * is already handled by Grid during benchmark execution. + * This matches the behavior of the original Bench.java and BenchYAML.java. + * + * @return a result handler that performs no additional output + */ + static ResultHandler consoleOnly() { + return results -> { + // Grid already printed results to console, nothing to do + }; + } + + /** + * Creates a result handler that writes results to CSV summary and JSON detail files. + * This matches the behavior of AutoBenchYAML.java. + *

+ * Files created:
+ * <ul>
+ *   <li>{@code outputBasePath.csv} - CSV summary with aggregate statistics per dataset</li>
+ *   <li>{@code outputBasePath.json} - JSON file with complete detailed results</li>
+ * </ul>
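+ * <p>
+ * For example (assuming {@code results} is a previously collected {@code List<BenchResult>}):
+ * <pre>{@code
+ * ResultHandler handler = ResultHandler.toFiles("results/benchmark");
+ * handler.handleResults(results); // writes results/benchmark.csv and results/benchmark.json
+ * }</pre>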

+ * The CSV file contains columns: dataset, QPS, QPS StdDev, Mean Latency, Recall@10, + * Index Construction Time. + * + * @param outputBasePath base path for output files (without extension) + * @return a result handler that writes to CSV and JSON files + * @see FileOutputHandler + */ + static ResultHandler toFiles(String outputBasePath) { + return new FileOutputHandler(outputBasePath); + } + + /** + * Implementation that writes benchmark results to CSV summary and JSON details files. + * Uses {@link BenchmarkSummarizer} to calculate aggregate statistics across multiple + * benchmark runs. + */ + class FileOutputHandler implements ResultHandler { + private static final Logger logger = LoggerFactory.getLogger(FileOutputHandler.class); + private final String outputBasePath; + + public FileOutputHandler(String outputBasePath) { + this.outputBasePath = outputBasePath; + } + + @Override + public void handleResults(List results) throws IOException { + if (results.isEmpty()) { + logger.warn("No results to write"); + return; + } + + // Calculate summary statistics + SummaryStats stats = BenchmarkSummarizer.summarize(results); + logger.info("Benchmark summary: {}", stats.toString()); + + // Write detailed results to JSON + File detailsFile = new File(outputBasePath + ".json"); + ObjectMapper mapper = new ObjectMapper(); + mapper.writerWithDefaultPrettyPrinter().writeValue(detailsFile, results); + logger.info("Detailed results written to {}", detailsFile.getAbsolutePath()); + + // Write summary to CSV + File csvFile = new File(outputBasePath + ".csv"); + writeCsvSummary(results, csvFile); + logger.info("Summary results written to {}", csvFile.getAbsolutePath()); + + // Verify files were created + if (csvFile.exists()) { + logger.info("CSV file size: {} bytes", csvFile.length()); + } else { + logger.error("Failed to create CSV file at {}", csvFile.getAbsolutePath()); + } + + if (detailsFile.exists()) { + logger.info("JSON file size: {} bytes", detailsFile.length()); + } else { + logger.error("Failed to create JSON file at {}", detailsFile.getAbsolutePath()); + } + } + + private void writeCsvSummary(List results, File outputFile) throws IOException { + // Get summary statistics by dataset + Map statsByDataset = BenchmarkSummarizer.summarizeByDataset(results); + + try (FileWriter writer = new FileWriter(outputFile)) { + // Write CSV header + writer.write("dataset,QPS,QPS StdDev,Mean Latency,Recall@10,Index Construction Time\n"); + + // Write one row per dataset with average metrics + for (Map.Entry entry : statsByDataset.entrySet()) { + String dataset = entry.getKey(); + SummaryStats datasetStats = entry.getValue(); + + writer.write(dataset + ","); + writer.write(datasetStats.getAvgQps() + ","); + writer.write(datasetStats.getQpsStdDev() + ","); + writer.write(datasetStats.getAvgLatency() + ","); + writer.write(datasetStats.getAvgRecall() + ","); + writer.write(datasetStats.getIndexConstruction() + "\n"); + } + } + } + } + + /** + * Creates a result handler that delegates to multiple handlers in sequence. + * If any handler throws an exception, subsequent handlers are not called. + * + * @param handlers the handlers to combine + * @return a result handler that invokes all provided handlers + */ + static ResultHandler combining(ResultHandler... 
handlers) { + return results -> { + for (ResultHandler handler : handlers) { + handler.handleResults(results); + } + }; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java new file mode 100644 index 000000000..117fe9eda --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/TestDataViewWrapper.java @@ -0,0 +1,90 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.benchframe; + +import io.github.jbellis.jvector.example.util.DataSet; +import io.github.jbellis.jvector.example.util.FloatVectorsWrapper; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import io.nosqlbench.vectordata.discovery.TestDataView; +import io.nosqlbench.vectordata.spec.datasets.types.NeighborIndices; +import io.nosqlbench.vectordata.spec.datasets.types.QueryVectors; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public class TestDataViewWrapper implements DataSet { + public final TestDataView view; + private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport(); + + public TestDataViewWrapper(TestDataView view) { + this.view = view; + } + + @Override + public String getName() { + return view.getName(); + } + + @Override + public VectorSimilarityFunction getSimilarityFunction() { + var df = view.getDistanceFunction(); + switch (df) { + case EUCLIDEAN: return VectorSimilarityFunction.EUCLIDEAN; + case COSINE: return VectorSimilarityFunction.COSINE; + case DOT_PRODUCT: return VectorSimilarityFunction.DOT_PRODUCT; + default: throw new IllegalArgumentException("Unknown distance function " + df); + } + } + + @Override + public List> getBaseVectors() { + throw new RuntimeException("This method should not be called. 
Use getBaseRavv() instead."); + } + + @Override + public List> getQueryVectors() { + QueryVectors queryVectors = view.getQueryVectors().orElseThrow(() -> new RuntimeException("unable to load query vectors")); + ArrayList> vectorFlaots = new ArrayList<>(queryVectors.getCount()); + for (float[] qv : queryVectors) { + vectorFlaots.add(vts.createFloatVector(qv)); + } + return vectorFlaots; + + } + + @Override + public List> getGroundTruth() { + Optional gt = view.getNeighborIndices(); + + return List.of(); + } + + @Override + public int getDimension() { + return view.getBaseVectors().get().getVectorDimensions(); + } + + @Override + public RandomAccessVectorValues getBaseRavv() { + return view.getBaseVectors().map(FloatVectorsWrapper::new).orElseThrow(() -> new RuntimeException("unable to load float vectors")); + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java new file mode 100644 index 000000000..d73b951e5 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/benchframe/package-info.java @@ -0,0 +1,230 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Unified benchmark framework for JVector graph indexes. This package consolidates the functionality + * from the legacy benchmark classes (Bench, BenchYAML, AutoBenchYAML) into a modular, + * composable architecture using closures and strategy interfaces. + * + *

+ * <h2>Quick Start</h2>
+ *
+ * <h3>Command-Line Interface</h3>
+ * The recommended way to run benchmarks is via the CLI:
+ * <pre>
+ * # Run with hardcoded parameters (original Bench.java)
+ * benchframe bench dataset-name
+ *
+ * # Run with YAML configuration (original BenchYAML.java)
+ * benchframe benchyaml dataset-name
+ *
+ * # Run CI/CD mode with checkpointing (original AutoBenchYAML.java)
+ * benchframe autobenchyaml -o results/output dataset-name
+ *
+ * # List available datasets
+ * benchframe datasets
+ *
+ * # Access full nbvectors functionality
+ * benchframe nbvectors --help
+ * </pre>
+ *
+ * <h3>Programmatic Usage</h3>
+ * For library usage, use the convenience factory methods:
+ * <pre>{@code
+ * // Hardcoded defaults (Bench-style)
+ * BenchFrame.likeBench().execute(args);
+ *
+ * // YAML configuration (BenchYAML-style)
+ * BenchFrame.likeBenchYAML().execute(args);
+ *
+ * // CI/CD with checkpointing (AutoBenchYAML-style)
+ * BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(args);
+ * }</pre>
+ *
+ *

+ * <h2>Developer Documentation</h2>
+ *
+ * The sections below provide detailed information for developers working on the BenchFrame itself.
+ *
+ * <h3>Package Overview</h3>
+ * The benchframe package provides a flexible framework for benchmarking JVector's approximate
+ * nearest neighbor search implementations. It supports multiple execution modes from simple
+ * interactive testing to complex CI/CD scenarios with checkpointing and automated result collection.

+ * <h3>Core Components</h3>
+ *
+ * <h4>Main Orchestrator</h4>
+ * <ul>
+ *   <li>{@link io.github.jbellis.jvector.benchframe.BenchFrame} - Main orchestrator that coordinates
+ *       benchmark execution using pluggable strategies</li>
+ *   <li>{@link io.github.jbellis.jvector.benchframe.BenchFrameCLI} - Command-line interface providing
+ *       subcommands for different benchmark modes</li>
+ * </ul>
+ *
+ * <h4>Configuration</h4>
+ * <ul>
+ *   <li>{@link io.github.jbellis.jvector.benchframe.BenchFrameConfig} - Immutable configuration class
+ *       encapsulating all benchmark parameters. Can be used as a single shared config or via a
+ *       Function for per-dataset configuration (e.g., YAML)</li>
+ * </ul>
+ *
+ * <h4>Result Handling</h4>
+ * <ul>
+ *   <li>{@link io.github.jbellis.jvector.benchframe.BenchResult} - Result model encapsulating dataset,
+ *       parameters, and metrics</li>
+ *   <li>{@link io.github.jbellis.jvector.benchframe.ResultHandler} - Strategy interface for handling
+ *       results (console, files, etc.)</li>
+ * </ul>
+ *
+ * <h4>Checkpointing</h4>
+ * <ul>
+ *   <li>{@link io.github.jbellis.jvector.benchframe.CheckpointStrategy} - Strategy interface for
+ *       managing resumable benchmark execution</li>
+ * </ul>
+ *

+ * <h3>Usage Patterns</h3>
+ *
+ * <h4>Available CLI Subcommands</h4>
+ * <ul>
+ *   <li>bench - Run with hardcoded grid parameters (original Bench.java)</li>
+ *   <li>benchyaml - Run with YAML-based configuration (original BenchYAML.java)</li>
+ *   <li>autobenchyaml - CI/CD mode with checkpointing and file output (original AutoBenchYAML.java)</li>
+ *   <li>datasets - List and manage vector datasets (delegates to nbvectors)</li>
+ *   <li>nbvectors - Access full nbvectors CLI functionality</li>
+ * </ul>
+ *
+ * <h4>CLI Examples</h4>
+ * <pre>
+ * # Run with hardcoded parameters
+ * benchframe bench dataset-name
+ *
+ * # Run with YAML configuration
+ * benchframe benchyaml dataset-name
+ *
+ * # Run CI/CD mode with checkpointing (--output required)
+ * benchframe autobenchyaml -o results/output dataset-name
+ * benchframe autobenchyaml -o results/output -d 2 cap-1M
+ *
+ * # List available datasets
+ * benchframe datasets
+ * benchframe datasets search cohere
+ *
+ * # Access nbvectors functionality
+ * benchframe nbvectors --help
+ * benchframe nbvectors catalogs list
+ * </pre>
+ *
+ * <h4>Environment Variables</h4>
+ * <ul>
+ *   <li>VECTORDATA_CATALOGS - Comma-separated list of additional catalog YAML files
+ *       to load (e.g., "~/.config/custom/catalogs.yaml,~/work/catalogs.yaml")</li>
+ * </ul>
+ *

+ * <h4>Programmatic Usage - Factory Methods</h4>
+ * Factory methods provide pre-configured instances matching legacy behavior:
+ * <pre>{@code
+ * // Bench-style: hardcoded defaults
+ * BenchFrame frame = BenchFrame.likeBench();
+ * frame.execute(new String[]{"dataset-name"});
+ *
+ * // BenchYAML-style: YAML configuration
+ * BenchFrame frame = BenchFrame.likeBenchYAML();
+ * frame.execute(new String[]{"dataset-name"});
+ *
+ * // AutoBenchYAML-style: CI/CD with checkpointing
+ * BenchFrame frame = BenchFrame.likeAutoBenchYAML("results/benchmark", 2);
+ * frame.execute(new String[]{"dataset-name"});
+ * }</pre>
+ *
+ * <h4>Programmatic Usage - Custom Configuration</h4>
+ * The Builder API provides fine-grained control over all aspects:
+ * <pre>{@code
+ * // With a single shared config
+ * BenchFrame frame = new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("dataset1", "dataset2"))
+ *     .withConfig(BenchFrameConfig.createBenchDefaults())
+ *     .withDataSetSource(DataSetSource.DEFAULT)
+ *     .withResultHandler(ResultHandler.toFiles("results/benchmark"))
+ *     .withCheckpointStrategy(CheckpointStrategy.fileBasedCheckpointing("results/checkpoint"))
+ *     .withDiagnosticLevel(2)
+ *     .build();
+ *
+ * // With per-dataset config function (like YAML)
+ * BenchFrame frame = new BenchFrame.Builder()
+ *     .withDatasetNames(List.of("dataset1", "dataset2"))
+ *     .withConfigFunction(name -> loadCustomConfig(name))
+ *     .build();
+ *
+ * frame.execute(new String[]{".*"});
+ * }</pre>
+ *

+ * <h3>Extension Points</h3>
+ * The framework is designed for extension through closures and strategy interfaces:
+ *
+ * <h4>Custom Configuration Function</h4>
+ * <pre>{@code
+ * Function<String, BenchFrameConfig> customConfigFn = datasetName -> {
+ *     // Load from database, REST API, etc.
+ *     return new BenchFrameConfig.Builder()
+ *         .withDatasetName(datasetName)
+ *         .withMGrid(List.of(16, 32, 64))
+ *         .build();
+ * };
+ * }</pre>
+ *
+ * <h4>Custom Result Handler</h4>
+ * <pre>{@code
+ * ResultHandler customHandler = results -> {
+ *     // Send to monitoring system
+ *     monitoringSystem.record(results);
+ *     // Upload to cloud storage
+ *     cloudStorage.upload("benchmarks", results);
+ * };
+ * }</pre>
+ *
+ * <h4>Custom Checkpoint Strategy</h4>
+ * <pre>{@code
+ * CheckpointStrategy customStrategy = new CheckpointStrategy() {
+ *     public boolean shouldSkipDataset(String name) {
+ *         return database.isCompleted(name);
+ *     }
+ *     public void recordCompletion(String name, List<BenchResult> results) {
+ *         database.markCompleted(name, results);
+ *     }
+ *     public List<BenchResult> getPreviousResults() {
+ *         return database.loadPreviousResults();
+ *     }
+ * };
+ * }</pre>
+ *

+ * <h3>Architecture Benefits</h3>
+ * <ul>
+ *   <li>Modularity: Clean separation of concerns through strategy interfaces</li>
+ *   <li>Composability: Mix and match strategies for different scenarios</li>
+ *   <li>Testability: Easy to test components in isolation with mock strategies</li>
+ *   <li>Extensibility: Add new strategies without modifying existing code</li>
+ *   <li>Backward Compatibility: Factory methods preserve legacy behavior</li>
+ * </ul>
+ *

+ * <h3>Thread Safety</h3>
+ * The framework components are generally not thread-safe and are designed for single-threaded + * benchmark execution. {@link io.github.jbellis.jvector.benchframe.BenchFrameConfig} instances + * are immutable and thread-safe once constructed. + * + * @see io.github.jbellis.jvector.benchframe.BenchFrame + * @see io.github.jbellis.jvector.benchframe.BenchFrameCLI + * @see io.github.jbellis.jvector.example.Grid + */ +package io.github.jbellis.jvector.benchframe; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java index f8aa81575..47582e227 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/AutoBenchYAML.java @@ -16,209 +16,90 @@ package io.github.jbellis.jvector.example; -import com.fasterxml.jackson.databind.ObjectMapper; -import io.github.jbellis.jvector.example.util.BenchmarkSummarizer; -import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; -import io.github.jbellis.jvector.example.util.CheckpointManager; -import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.DataSetLoader; -import io.github.jbellis.jvector.example.yaml.ConstructionParameters; -import io.github.jbellis.jvector.example.yaml.MultiConfig; -import io.github.jbellis.jvector.example.yaml.SearchParameters; -import io.github.jbellis.jvector.graph.disk.feature.FeatureId; - +import io.github.jbellis.jvector.benchframe.BenchFrame; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.File; -import java.io.FileWriter; import java.io.IOException; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.regex.Pattern; -import java.util.stream.Collectors; /** * Automated benchmark runner for GitHub Actions workflow. * This class is specifically designed to handle the --output argument * for regression testing in the run-bench.yml workflow. - * + * * The benchmark runner supports checkpointing to allow resuming from failures. * It creates a checkpoint file (outputPath + ".checkpoint.json") that records * which datasets have been fully processed. If the benchmark is restarted, * it will skip datasets that have already been processed, allowing it to * continue from where it left off rather than starting over from the beginning. + * + * This class has been refactored to use BenchFrame for modularity and DRY principles. + * All shared functionality is now in reusable modules. */ +@Deprecated public class AutoBenchYAML { private static final Logger logger = LoggerFactory.getLogger(AutoBenchYAML.class); - /** - * Returns a list of all dataset names. - * This replaces the need to load datasets.yml which may not be available in all environments. 
- */ - private static List getAllDatasetNames() { - List allDatasets = new ArrayList<>(); - allDatasets.add("cap-1M"); - allDatasets.add("cap-6M"); - allDatasets.add("cohere-english-v3-1M"); - allDatasets.add("cohere-english-v3-10M"); - allDatasets.add("dpr-1M"); - allDatasets.add("dpr-10M"); - - return allDatasets; - } - public static void main(String[] args) throws IOException { - // Check for --output argument (required for this class) - String outputPath = null; - for (int i = 0; i < args.length - 1; i++) { - if (args[i].equals("--output")) outputPath = args[i+1]; - } - + // Parse command-line arguments + String outputPath = extractArgument(args, "--output"); if (outputPath == null) { logger.error("Error: --output argument is required for AutoBenchYAML"); System.exit(1); } - logger.info("Heap space available is {}", Runtime.getRuntime().maxMemory()); - - // Initialize checkpoint manager - CheckpointManager checkpointManager = new CheckpointManager(outputPath); - logger.info("Initialized checkpoint manager. Already completed datasets: {}", checkpointManager.getCompletedDatasets()); + int diagnosticLevel = extractIntArgument(args, "--diag", 0); + String[] filteredArgs = filterArguments(args, "--output", outputPath, "--diag", String.valueOf(diagnosticLevel)); - // Filter out --output, --config and their arguments from the args - String finalOutputPath = outputPath; - String configPath = null; - int diagnostic_level = 0; - for (int i = 0; i < args.length - 1; i++) { - if (args[i].equals("--config")) configPath = args[i+1]; - if (args[i].equals("--diag")) diagnostic_level = Integer.parseInt(args[i+1]); - } - if (diagnostic_level > 0) { - Grid.setDiagnosticLevel(diagnostic_level); - } - String finalConfigPath = configPath; - String[] filteredArgs = Arrays.stream(args) - .filter(arg -> !arg.equals("--output") && !arg.equals(finalOutputPath) && - !arg.equals("--config") && !arg.equals(finalConfigPath)) - .toArray(String[]::new); - - // Log the filtered arguments for debugging + logger.info("Heap space available is {}", Runtime.getRuntime().maxMemory()); logger.info("Filtered arguments: {}", Arrays.toString(filteredArgs)); - // generate a regex that matches any regex in filteredArgs, or if filteredArgs is empty/null, match everything - var regex = filteredArgs.length == 0 ? 
".*" : Arrays.stream(filteredArgs).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|")); - logger.info("Generated regex pattern: {}", regex); - - // compile regex and do substring matching using find - var pattern = Pattern.compile(regex); - - var datasetNames = getAllDatasetNames().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); - - logger.info("Executing the following datasets: {}", datasetNames); - List results = new ArrayList<>(); - // Add results from checkpoint if present - results.addAll(checkpointManager.getCompletedResults()); - - // Process datasets from regex patterns - if (!datasetNames.isEmpty()) { - for (var datasetName : datasetNames) { - // Skip already completed datasets - if (checkpointManager.isDatasetCompleted(datasetName)) { - logger.info("Skipping already completed dataset: {}", datasetName); - continue; - } - - logger.info("Loading dataset: {}", datasetName); - try { - DataSet ds = DataSetLoader.loadDataSet(datasetName); - logger.info("Dataset loaded: {} with {} vectors", datasetName, ds.baseVectors.size()); - - String normalizedDatasetName = datasetName; - if (normalizedDatasetName.endsWith(".hdf5")) { - normalizedDatasetName = normalizedDatasetName.substring(0, normalizedDatasetName.length() - ".hdf5".length()); - } - - MultiConfig config; - if (finalConfigPath != null) { - config = MultiConfig.getConfig(finalConfigPath); - // Override dataset name if not specified in custom config - if (config.dataset == null || config.dataset.isEmpty()) { - config.dataset = normalizedDatasetName; - } - } else { - config = MultiConfig.getDefaultConfig("autoDefault"); - config.dataset = normalizedDatasetName; - } - logger.info("Using configuration: {}", config); - - List datasetResults = Grid.runAllAndCollectResults(ds, - config.construction.outDegree, - config.construction.efConstruction, - config.construction.neighborOverflow, - config.construction.addHierarchy, - config.construction.getFeatureSets(), - config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), - config.search.topKOverquery, - config.search.useSearchPruning); - results.addAll(datasetResults); + // Execute benchmark using convenience method + BenchFrame.likeAutoBenchYAML(outputPath, diagnosticLevel).execute(filteredArgs); + } - logger.info("Benchmark completed for dataset: {}", datasetName); - // Mark dataset as completed and update checkpoint, passing results - checkpointManager.markDatasetCompleted(datasetName, datasetResults); - } catch (Exception e) { - logger.error("Exception while processing dataset {}", datasetName, e); - } + /** + * Extract a string argument value from command-line args + */ + private static String extractArgument(String[] args, String flag) { + for (int i = 0; i < args.length - 1; i++) { + if (args[i].equals(flag)) { + return args[i + 1]; } } + return null; + } - // Calculate summary statistics + /** + * Extract an integer argument value from command-line args + */ + private static int extractIntArgument(String[] args, String flag, int defaultValue) { + String value = extractArgument(args, flag); + if (value == null) { + return defaultValue; + } try { - SummaryStats stats = BenchmarkSummarizer.summarize(results); - logger.info("Benchmark summary: {}", stats.toString()); - - // Write results to csv file and details to json - File detailsFile = new File(outputPath + ".json"); - ObjectMapper mapper = new ObjectMapper(); - 
mapper.writerWithDefaultPrettyPrinter().writeValue(detailsFile, results); - - File outputFile = new File(outputPath + ".csv"); - - // Get summary statistics by dataset - Map statsByDataset = BenchmarkSummarizer.summarizeByDataset(results); - - // Write CSV data - try (FileWriter writer = new FileWriter(outputFile)) { - // Write CSV header - writer.write("dataset,QPS,QPS StdDev,Mean Latency,Recall@10,Index Construction Time\n"); - - // Write one row per dataset with average metrics - for (Map.Entry entry : statsByDataset.entrySet()) { - String dataset = entry.getKey(); - SummaryStats datasetStats = entry.getValue(); - - writer.write(dataset + ","); - writer.write(datasetStats.getAvgQps() + ","); - writer.write(datasetStats.getQpsStdDev() + ","); - writer.write(datasetStats.getAvgLatency() + ","); - writer.write(datasetStats.getAvgRecall() + ","); - writer.write(datasetStats.getIndexConstruction() + "\n"); - } - } - - logger.info("Benchmark results written to {} (file exists: {})", outputPath, outputFile.exists()); - // Double check that the file was created and log its size - if (outputFile.exists()) { - logger.info("Output file size: {} bytes", outputFile.length()); - } else { - logger.error("Failed to create output file at {}", outputPath); - } - } catch (Exception e) { - logger.error("Exception during final processing", e); + return Integer.parseInt(value); + } catch (NumberFormatException e) { + logger.warn("Invalid integer value for {}: {}", flag, value); + return defaultValue; } } + /** + * Filter out specific arguments and their values from the args array + */ + private static String[] filterArguments(String[] args, String... toFilter) { + return Arrays.stream(args) + .filter(arg -> { + for (String filter : toFilter) { + if (arg.equals(filter)) { + return false; + } + } + return true; + }) + .toArray(String[]::new); + } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java index 4623cbe9d..6b675acab 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench.java @@ -16,82 +16,22 @@ package io.github.jbellis.jvector.example; -import io.github.jbellis.jvector.example.util.CompressorParameters; -import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; -import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.DataSetLoader; -import io.github.jbellis.jvector.example.yaml.DatasetCollection; -import io.github.jbellis.jvector.graph.disk.feature.FeatureId; -import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.benchframe.BenchFrame; import java.io.IOException; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; -import java.util.Map; -import java.util.function.Function; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; /** - * Tests GraphIndexes against vectors from various datasets + * Tests GraphIndexes against vectors from various datasets using hardcoded grid parameters. + * + * This class has been refactored to use BenchFrame for modularity and DRY principles. + * All shared functionality is now in reusable modules. + * + * @deprecated Use {@link BenchFrame#likeBench()} directly instead. 
This class will be removed in a future release. */ +@Deprecated(forRemoval = true) public class Bench { public static void main(String[] args) throws IOException { System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); - - var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128); - var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800); - var topKGrid = Map.of( - 10, // topK - List.of(1.0, 2.0, 5.0, 10.0), // oq - 100, // topK - List.of(1.0, 2.0) // oq - ); // rerankK = oq * topK - var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f); - var addHierarchyGrid = List.of(true); // List.of(false, true); - var refineFinalGraphGrid = List.of(true); // List.of(false, true); - var usePruningGrid = List.of(true); // List.of(false, true); - List> buildCompression = Arrays.asList( - ds -> new PQParameters(ds.getDimension() / 8, - 256, - ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN, - UNWEIGHTED), - __ -> CompressorParameters.NONE - ); - List> searchCompression = Arrays.asList( - __ -> CompressorParameters.NONE, - // ds -> new CompressorParameters.BQParameters(), - ds -> new PQParameters(ds.getDimension() / 8, - 256, - ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN, - UNWEIGHTED) - ); - List> featureSets = Arrays.asList( - EnumSet.of(FeatureId.NVQ_VECTORS), -// EnumSet.of(FeatureId.NVQ_VECTORS, FeatureId.FUSED_ADC), - EnumSet.of(FeatureId.INLINE_VECTORS) - ); - - // args is list of regexes, possibly needing to be split by whitespace. - // generate a regex that matches any regex in args, or if args is empty/null, match everything - var regex = args.length == 0 ? ".*" : Arrays.stream(args).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|")); - // compile regex and do substring matching using find - var pattern = Pattern.compile(regex); - - execute(pattern, buildCompression, featureSets, searchCompression, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, topKGrid, usePruningGrid); - } - - private static void execute(Pattern pattern, List> buildCompression, List> featureSets, List> compressionGrid, List mGrid, List efConstructionGrid, List neighborOverflowGrid, List addHierarchyGrid, List refineFinalGraphGrid, Map> topKGrid, List usePruningGrid) throws IOException { - var datasetCollection = DatasetCollection.load(); - var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); - System.out.println("Executing the following datasets: " + datasetNames); - - for (var datasetName : datasetNames) { - DataSet ds = DataSetLoader.loadDataSet(datasetName); - Grid.runAll(ds, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, featureSets, buildCompression, compressionGrid, topKGrid, usePruningGrid); - } + BenchFrame.likeBench().execute(args); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java deleted file mode 100644 index dc639f5ea..000000000 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Bench2D.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.github.jbellis.jvector.example; - -import io.github.jbellis.jvector.example.util.*; -import io.github.jbellis.jvector.example.util.CompressorParameters.PQParameters; -import io.github.jbellis.jvector.graph.disk.feature.FeatureId; - -import java.io.IOException; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; -import java.util.Map; -import java.util.function.Function; - -import static io.github.jbellis.jvector.quantization.KMeansPlusPlusClusterer.UNWEIGHTED; - -/** - * Tests GraphIndexes against vectors from a 2D dataset - */ -public class Bench2D { - public static void main(String[] args) throws IOException { - System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); - - var mGrid = List.of(32); // List.of(16, 24, 32, 48, 64, 96, 128); - var efConstructionGrid = List.of(100); // List.of(60, 80, 100, 120, 160, 200, 400, 600, 800); - var topKGrid = Map.of( - 10, // topK - List.of(1.0, 2.0, 5.0, 10.0, 20.0) // oq - ); // rerankK = oq * topK - var neighborOverflowGrid = List.of(1.2f); // List.of(1.2f, 2.0f); - var addHierarchyGrid = List.of(true); // List.of(false, true); - var refineFinalGraphGrid = List.of(true); // List.of(false, true); - var usePruningGrid = List.of(false); // List.of(false, true); - List> buildCompression = Arrays.asList(__ -> CompressorParameters.NONE); - List> searchCompression = Arrays.asList( - __ -> CompressorParameters.NONE, - ds -> new PQParameters(ds.getDimension(), 256, true, UNWEIGHTED) - ); - List> featureSets = Arrays.asList( - EnumSet.of(FeatureId.NVQ_VECTORS), - EnumSet.of(FeatureId.INLINE_VECTORS) - ); - - // 2D grid, built and calculated at runtime - var grid2d = DataSetCreator.create2DGrid(4_000_000, 10_000, 100); - - Grid.runAll(grid2d, mGrid, efConstructionGrid, neighborOverflowGrid, addHierarchyGrid, refineFinalGraphGrid, - featureSets, buildCompression, searchCompression, topKGrid, usePruningGrid); - } -} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java deleted file mode 100644 index 5eeeff736..000000000 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchResult.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright DataStax, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package io.github.jbellis.jvector.example; - -import java.util.Map; - -public class BenchResult { - public String dataset; - public Map parameters; - public Map metrics; - - public BenchResult() {} - public BenchResult(String dataset, Map parameters, Map metrics) { - this.dataset = dataset; - this.parameters = parameters; - this.metrics = metrics; - } -} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java index e81a84863..ab2c5991b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/BenchYAML.java @@ -16,72 +16,22 @@ package io.github.jbellis.jvector.example; -import io.github.jbellis.jvector.example.util.DataSet; -import io.github.jbellis.jvector.example.util.DataSetLoader; -import io.github.jbellis.jvector.example.yaml.DatasetCollection; -import io.github.jbellis.jvector.example.yaml.MultiConfig; +import io.github.jbellis.jvector.benchframe.BenchFrame; import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.regex.Pattern; -import java.util.stream.Collectors; /** - * Tests GraphIndexes against vectors from various datasets + * Tests GraphIndexes against vectors from various datasets using YAML-based configuration. + * + * This class has been refactored to use BenchFrame for modularity and DRY principles. + * All shared functionality is now in reusable modules. + * + * @deprecated Use {@link BenchFrame#likeBenchYAML()} directly instead. This class will be removed in a future release. */ +@Deprecated(forRemoval = true) public class BenchYAML { public static void main(String[] args) throws IOException { - // args is one of: - // - a list of regexes, possibly needing to be split by whitespace. - // - a list of YAML files - System.out.println("Heap space available is " + Runtime.getRuntime().maxMemory()); - - // generate a regex that matches any regex in args, or if args is empty/null, match everything - var regex = args.length == 0 ? 
".*" : Arrays.stream(args).flatMap(s -> Arrays.stream(s.split("\\s"))).map(s -> "(?:" + s + ")").collect(Collectors.joining("|")); - // compile regex and do substring matching using find - var pattern = Pattern.compile(regex); - - var datasetCollection = DatasetCollection.load(); - var datasetNames = datasetCollection.getAll().stream().filter(dn -> pattern.matcher(dn).find()).collect(Collectors.toList()); - - List allConfigs = new ArrayList<>(); - - if (!datasetNames.isEmpty()) { - System.out.println("Executing the following datasets: " + datasetNames); - - for (var datasetName : datasetNames) { - DataSet ds = DataSetLoader.loadDataSet(datasetName); - - if (datasetName.endsWith(".hdf5")) { - datasetName = datasetName.substring(0, datasetName.length() - ".hdf5".length()); - } - MultiConfig config = MultiConfig.getDefaultConfig(datasetName); - allConfigs.add(config); - } - } - - // get the list of YAML files from args - List configNames = Arrays.stream(args).filter(s -> s.endsWith(".yml")).collect(Collectors.toList()); - - if (!configNames.isEmpty()) { - for (var configName : configNames) { - MultiConfig config = MultiConfig.getDefaultConfig(configName); - allConfigs.add(config); - } - } - - for (var config : allConfigs) { - String datasetName = config.dataset; - - DataSet ds = DataSetLoader.loadDataSet(datasetName); - - Grid.runAll(ds, config.construction.outDegree, config.construction.efConstruction, - config.construction.neighborOverflow, config.construction.addHierarchy, config.construction.refineFinalGraph, - config.construction.getFeatureSets(), config.construction.getCompressorParameters(), - config.search.getCompressorParameters(), config.search.topKOverquery, config.search.useSearchPruning, config.search.benchmarks); - } + BenchFrame.likeBenchYAML().execute(args); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java index a4d62645f..c3e756cb2 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/Grid.java @@ -17,6 +17,7 @@ package io.github.jbellis.jvector.example; import io.github.jbellis.jvector.disk.ReaderSupplierFactory; +import io.github.jbellis.jvector.benchframe.BenchResult; import io.github.jbellis.jvector.example.benchmarks.AccuracyBenchmark; import io.github.jbellis.jvector.example.benchmarks.BenchmarkTablePrinter; import io.github.jbellis.jvector.example.benchmarks.CountBenchmark; @@ -87,7 +88,7 @@ public class Grid { private static int diagnostic_level; - static void runAll(DataSet ds, + public static void runAll(DataSet ds, List mGrid, List efConstructionGrid, List neighborOverflowGrid, @@ -175,7 +176,7 @@ static void runOneGraph(List> featureSets, } else { long start = System.nanoTime(); cv = compressor.encodeAll(ds.getBaseRavv()); - System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.baseVectors.size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0); + System.out.format("%s encoded %d vectors [%.2f MB] in %.2fs%n", compressor, ds.getBaseVectors().size(), (cv.ramBytesUsed() / 1024f / 1024f), (System.nanoTime() - start) / 1_000_000_000.0); } indexes.forEach((features, index) -> { @@ -211,7 +212,7 @@ private static Map, ImmutableGraphIndex> buildOnDisk(List, ImmutableGraphIndex> buildOnDisk(List, ImmutableGraphIndex> indexes = new HashMap<>(); @@ -369,7 +370,7 @@ private static Map, 
ImmutableGraphIndex> buildInMemory(List, ImmutableGraphIndex> indexes = new HashMap<>(); long start; - var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.similarityFunction); + var bsp = BuildScoreProvider.randomAccessScoreProvider(floatVectors, ds.getSimilarityFunction()); GraphIndexBuilder builder = new GraphIndexBuilder(bsp, floatVectors.dimension(), M, @@ -590,9 +591,10 @@ public static List runAllAndCollectResults( ); for (Metric metric : metricsList) { Map metrics = java.util.Map.of(metric.getHeader(), metric.getValue()); - results.add(new BenchResult(ds.name, params, metrics)); + results.add(new BenchResult(ds.getName(), params, metrics)); } - results.add(new BenchResult(ds.name, params, Map.of("Index Build Time", indexBuildTimes.get(ds.name)))); + results.add(new BenchResult(ds.getName(), params, Map.of("Index " + + "Build Time", indexBuildTimes.get(ds.getName())))); } } } @@ -675,17 +677,17 @@ public static class ConfiguredSystem implements AutoCloseable { public SearchScoreProvider scoreProviderFor(VectorFloat queryVector, ImmutableGraphIndex.View view) { // if we're not compressing then just use the exact score function if (cv == null) { - return DefaultSearchScoreProvider.exact(queryVector, ds.similarityFunction, ds.getBaseRavv()); + return DefaultSearchScoreProvider.exact(queryVector, ds.getSimilarityFunction(), ds.getBaseRavv()); } var scoringView = (ImmutableGraphIndex.ScoringView) view; ScoreFunction.ApproximateScoreFunction asf; if (features.contains(FeatureId.FUSED_ADC)) { - asf = scoringView.approximateScoreFunctionFor(queryVector, ds.similarityFunction); + asf = scoringView.approximateScoreFunctionFor(queryVector, ds.getSimilarityFunction()); } else { - asf = cv.precomputedScoreFunctionFor(queryVector, ds.similarityFunction); + asf = cv.precomputedScoreFunctionFor(queryVector, ds.getSimilarityFunction()); } - var rr = scoringView.rerankerFor(queryVector, ds.similarityFunction); + var rr = scoringView.rerankerFor(queryVector, ds.getSimilarityFunction()); return new DefaultSearchScoreProvider(asf, rr); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java index a09d1a0e7..a53e682dd 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/HelloVectorWorld.java @@ -31,7 +31,8 @@ public static void main(String[] args) throws IOException { String datasetName = "ada002-100k"; - var mfd = DownloadHelper.maybeDownloadFvecs(datasetName); + var mfd = DownloadHelper.maybeDownloadFvecs(datasetName) + .orElseThrow(() -> new IllegalArgumentException("Unknown dataset: " + datasetName)); DataSet ds = mfd.load(); MultiConfig config = MultiConfig.getConfig(datasetName); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java index a99aca6f8..4cb72d1a5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/AccuracyBenchmark.java @@ -88,7 +88,7 @@ public List runBenchmark( throw new RuntimeException("At least one metric must be displayed"); } - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = 
cs.getDataSet().getQueryVectors().size(); // execute all queries in parallel and collect results List results = IntStream.range(0, totalQueries) @@ -101,14 +101,14 @@ public List runBenchmark( if (computeRecall) { // compute recall for this run double recall = AccuracyMetrics.recallFromSearchResults( - cs.getDataSet().groundTruth, results, topK, topK + cs.getDataSet().getGroundTruth(), results, topK, topK ); list.add(Metric.of("Recall@" + topK, formatRecall, recall)); } if (computeMAP) { // compute recall for this run double map = AccuracyMetrics.meanAveragePrecisionAtK( - cs.getDataSet().groundTruth, results, topK + cs.getDataSet().getGroundTruth(), results, topK ); list.add(Metric.of("MAP@" + topK, formatMAP, map)); } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java index d4fe68456..cd5d228c2 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/CountBenchmark.java @@ -105,7 +105,7 @@ public List runBenchmark( LongAdder nodesVisited = new LongAdder(); LongAdder nodesExpanded = new LongAdder(); LongAdder nodesExpandedBaseLayer = new LongAdder(); - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); for (int run = 0; run < queryRuns; run++) { IntStream.range(0, totalQueries) diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java index 449a8409f..9872142d5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ExecutionTimeBenchmark.java @@ -56,7 +56,7 @@ public List runBenchmark( boolean usePruning, int queryRuns) { - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); double totalRuntime = 0; for (int run = 0; run < queryRuns; run++) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java index 861a8d2be..eefc5ee5c 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/LatencyBenchmark.java @@ -104,7 +104,7 @@ public List runBenchmark( throw new IllegalArgumentException("At least one parameter must be set to true"); } - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); double mean = 0.0; double m2 = 0.0; int count = 0; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java index 9ec728808..3c202c28b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/QueryExecutor.java @@ -33,7 +33,7 @@ public class QueryExecutor { * 
@return the SearchResult for query i. */ public static SearchResult executeQuery(ConfiguredSystem cs, int topK, int rerankK, boolean usePruning, int i) { - var queryVector = cs.getDataSet().queryVectors.get(i); + var queryVector = cs.getDataSet().getQueryVectors().get(i); var searcher = cs.getSearcher(); searcher.usePruning(usePruning); var sf = cs.scoreProviderFor(queryVector, searcher.getView()); diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java index 27b99fa71..c00893fa5 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/benchmarks/ThroughputBenchmark.java @@ -137,7 +137,7 @@ public List runBenchmark( throw new RuntimeException("At least one metric must be displayed"); } - int totalQueries = cs.getDataSet().queryVectors.size(); + int totalQueries = cs.getDataSet().getQueryVectors().size(); int dim = cs.getDataSet().getDimension(); // Warmup Phase with diagnostics diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java index dba6064ab..88e406551 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizer.java @@ -15,7 +15,7 @@ */ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.example.BenchResult; +import io.github.jbellis.jvector.benchframe.BenchResult; import java.util.List; import java.util.Map; diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java index 4145100b2..d09347c5b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CheckpointManager.java @@ -16,6 +16,7 @@ package io.github.jbellis.jvector.example.util; import com.fasterxml.jackson.databind.ObjectMapper; +import io.github.jbellis.jvector.benchframe.BenchResult; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,7 +36,7 @@ public class CheckpointManager { private final String checkpointPath; private final ObjectMapper mapper; private final Set completedDatasets; - private final List completedResults; + private final List completedResults; /** * Creates a new CheckpointManager for the given output path. @@ -88,7 +89,7 @@ public boolean isDatasetCompleted(String datasetName) { * @param datasetName The name of the dataset * @param resultsForDataset The results for the dataset */ - public void markDatasetCompleted(String datasetName, List resultsForDataset) { + public void markDatasetCompleted(String datasetName, List resultsForDataset) { completedDatasets.add(datasetName); if (resultsForDataset != null) { completedResults.addAll(resultsForDataset); @@ -123,7 +124,7 @@ public Set getCompletedDatasets() { /** * Returns the list of completed BenchResults. 
*/ - public List getCompletedResults() { + public List getCompletedResults() { return new ArrayList<>(completedResults); } @@ -132,13 +133,13 @@ public List getCompletedResults() */ private static class CheckpointData { private List completedDatasets; - private List completedResults; + private List completedResults; public CheckpointData() { // Default constructor for Jackson } - public CheckpointData(List completedDatasets, List completedResults) { + public CheckpointData(List completedDatasets, List completedResults) { this.completedDatasets = completedDatasets; this.completedResults = completedResults; } @@ -151,11 +152,11 @@ public void setCompletedDatasets(List completedDatasets) { this.completedDatasets = completedDatasets; } - public List getCompletedResults() { + public List getCompletedResults() { return completedResults; } - public void setCompletedResults(List completedResults) { + public void setCompletedResults(List completedResults) { this.completedResults = completedResults; } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java index e1ffebb9b..f84b69938 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/CompressorParameters.java @@ -55,7 +55,7 @@ public VectorCompressor computeCompressor(DataSet ds) { @Override public String idStringFor(DataSet ds) { - return String.format("PQ_%s_%d_%d_%s_%s", ds.name, m, k, isCentered, anisotropicThreshold); + return String.format("PQ_%s_%d_%d_%s_%s", ds.getName(), m, k, isCentered, anisotropicThreshold); } @Override @@ -85,7 +85,7 @@ public VectorCompressor computeCompressor(DataSet ds) { @Override public String idStringFor(DataSet ds) { - return String.format("NVQ_%s_%d_%s", ds.name, nSubVectors); + return String.format("NVQ_%s_%d_%s", ds.getName(), nSubVectors); } @Override diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java index e193cd6ad..4b39ad23b 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSet.java @@ -16,151 +16,19 @@ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; import io.github.jbellis.jvector.graph.RandomAccessVectorValues; import io.github.jbellis.jvector.vector.VectorSimilarityFunction; -import io.github.jbellis.jvector.vector.VectorUtil; import io.github.jbellis.jvector.vector.types.VectorFloat; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -public class DataSet { - public final String name; - public final VectorSimilarityFunction similarityFunction; - public final List> baseVectors; - public final List> queryVectors; - public final List> groundTruth; - private RandomAccessVectorValues baseRavv; +public interface DataSet { + String getName(); + VectorSimilarityFunction getSimilarityFunction(); + List> getBaseVectors(); + List> getQueryVectors(); + List> getGroundTruth(); + int getDimension(); + RandomAccessVectorValues getBaseRavv(); - public DataSet(String name, - 
VectorSimilarityFunction similarityFunction, - List> baseVectors, - List> queryVectors, - List> groundTruth) - { - if (baseVectors.isEmpty()) { - throw new IllegalArgumentException("Base vectors must not be empty"); - } - if (queryVectors.isEmpty()) { - throw new IllegalArgumentException("Query vectors must not be empty"); - } - if (groundTruth.isEmpty()) { - throw new IllegalArgumentException("Ground truth vectors must not be empty"); - } - - if (baseVectors.get(0).length() != queryVectors.get(0).length()) { - throw new IllegalArgumentException("Base and query vectors must have the same dimensionality"); - } - if (queryVectors.size() != groundTruth.size()) { - throw new IllegalArgumentException("Query and ground truth lists must be the same size"); - } - - this.name = name; - this.similarityFunction = similarityFunction; - this.baseVectors = baseVectors; - this.queryVectors = queryVectors; - this.groundTruth = groundTruth; - - System.out.format("%n%s: %d base and %d query vectors created, dimensions %d%n", - name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length()); - } - - /** - * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length. - * Note: This only scrubs and normalizes for dot product similarity. - */ - public static DataSet getScrubbedDataSet(String pathStr, - VectorSimilarityFunction vsf, - List> baseVectors, - List> queryVectors, - List> groundTruth) - { - // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers - List> scrubbedBaseVectors; - List> scrubbedQueryVectors; - List> gtSet; - scrubbedBaseVectors = new ArrayList<>(baseVectors.size()); - scrubbedQueryVectors = new ArrayList<>(queryVectors.size()); - gtSet = new ArrayList<>(groundTruth.size()); - var uniqueVectors = new TreeSet>((a, b) -> { - assert a.length() == b.length(); - for (int i = 0; i < a.length(); i++) { - if (a.get(i) < b.get(i)) { - return -1; - } - if (a.get(i) > b.get(i)) { - return 1; - } - } - return 0; - }); - Map rawToScrubbed = new HashMap<>(); - { - int j = 0; - for (int i = 0; i < baseVectors.size(); i++) { - VectorFloat v = baseVectors.get(i); - var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; - if (valid && uniqueVectors.add(v)) { - scrubbedBaseVectors.add(v); - rawToScrubbed.put(i, j++); - } - } - } - // also remove zero query vectors and query vectors that are present in the base set - for (int i = 0; i < queryVectors.size(); i++) { - VectorFloat v = queryVectors.get(i); - var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; - var dupe = uniqueVectors.contains(v); - if (valid && !dupe) { - scrubbedQueryVectors.add(v); - var gt = new ArrayList(); - for (int j : groundTruth.get(i)) { - gt.add(rawToScrubbed.get(j)); - } - gtSet.add(gt); - } - } - - // now that the zero vectors are removed, we can normalize if it looks like they aren't already - if (vsf == VectorSimilarityFunction.DOT_PRODUCT) { - if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) { - normalizeAll(scrubbedBaseVectors); - normalizeAll(scrubbedQueryVectors); - } - } - - assert scrubbedQueryVectors.size() == gtSet.size(); - return new DataSet(pathStr, vsf, scrubbedBaseVectors, scrubbedQueryVectors, gtSet); - } - - private static void normalizeAll(Iterable> vectors) { - for (VectorFloat v : vectors) { - VectorUtil.l2normalize(v); - } - } - - private static float normOf(VectorFloat baseVector) { - float norm = 0; - for (int i = 0; i < 
baseVector.length(); i++) { - norm += baseVector.get(i) * baseVector.get(i); - } - return (float) Math.sqrt(norm); - } - - public int getDimension() { - return baseVectors.get(0).length(); - } - - public RandomAccessVectorValues getBaseRavv() { - if (baseRavv == null) { - baseRavv = new ListRandomAccessVectorValues(baseVectors, getDimension()); - } - return baseRavv; - } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java index 1cd532160..40a709f6a 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetCreator.java @@ -73,6 +73,6 @@ public static DataSet create2DGrid(int nPoints, int nQueries, int topK) { var groundTruth = queries.stream().map(Map.Entry::getValue).collect(Collectors.toList()); String name = "2D" + gridWidth; - return new DataSet(name, VectorSimilarityFunction.EUCLIDEAN, baseVectors, queryVectors, groundTruth); + return new SimpleDataSet(name, VectorSimilarityFunction.EUCLIDEAN, baseVectors, queryVectors, groundTruth); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java index e90a6f275..75b764bbb 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetLoader.java @@ -16,18 +16,168 @@ package io.github.jbellis.jvector.example.util; +import io.github.jbellis.jvector.benchframe.TestDataViewWrapper; +import io.nosqlbench.nbdatatools.api.concurrent.ProgressIndicator; +import io.nosqlbench.vectordata.discovery.TestDataSources; +import io.nosqlbench.vectordata.discovery.TestDataView; +import io.nosqlbench.vectordata.downloader.Catalog; +import io.nosqlbench.vectordata.downloader.DatasetEntry; + import java.io.IOException; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; + +public class DataSetLoader implements DataSetSource { + + private final DataSetSource[] loaders; + + public DataSetLoader(DataSetSource... 
loaders) { + this.loaders = loaders; + } + + @Override + public Optional apply(String name) { + for (DataSetSource loader : loaders) { + Optional result = loader.apply(name); + if (result.isPresent()) { + return result; + } + } + return Optional.empty(); + } + + @Override + public String toString() { + return "DataSetLoader{loaders=" + loaders.length + "}"; + } + + public final static DataSetSource FVecsDownloader = new DataSetSource() { + @Override + public Optional apply(String name) { + Optional mfdOpt = DownloadHelper.maybeDownloadFvecs(name); + if (mfdOpt.isEmpty()) { + return Optional.empty(); + } + + try { + var ds = mfdOpt.get().load(); + return Optional.of(ds); + } catch (IOException e) { + System.err.println("error while trying to load dataset: " + e + ", this error handling " + + "path needs to be updated"); + return Optional.empty(); + } + } + + @Override + public String toString() { + return "FVecsDownloader"; + } + }; + + public final static DataSetSource HDF5Loader = new DataSetSource() { + + @Override + public Optional apply(String name) { + if (name.endsWith(".hdf5")) { + DownloadHelper.maybeDownloadHdf5(name); + return Optional.of(Hdf5Loader.load(name)); + } + return Optional.empty(); + } + + @Override + public String toString() { + return "HDF5Loader"; + } + }; + + /** + * VectorData downloader that loads datasets from the vectordata catalog system. + * Supports optional additional catalogs via VECTORDATA_CATALOGS environment variable. + * + * Environment variable format: + * VECTORDATA_CATALOGS=~/.config/custom1/catalogs.yaml,~/.config/custom2/catalogs.yaml + */ + public static final DataSetSource vectorDataDownloader = new DataSetSource() { + private final Catalog catalog = initializeCatalog(); -public class DataSetLoader { - public static DataSet loadDataSet(String fileName) throws IOException { - DataSet ds; - if (fileName.endsWith(".hdf5")) { - DownloadHelper.maybeDownloadHdf5(fileName); - ds = Hdf5Loader.load(fileName); - } else { - var mfd = DownloadHelper.maybeDownloadFvecs(fileName); - ds = mfd.load(); + private Catalog initializeCatalog() { + TestDataSources sources = new TestDataSources().configure(); + + // Add additional catalogs from environment variable + String envCatalogs = System.getenv("VECTORDATA_CATALOGS"); + if (envCatalogs != null && !envCatalogs.trim().isEmpty()) { + String[] catalogPaths = envCatalogs.split(","); + for (String catalogPath : catalogPaths) { + String trimmedPath = catalogPath.trim(); + if (!trimmedPath.isEmpty()) { + System.out.println("Adding optional catalog from VECTORDATA_CATALOGS: " + trimmedPath); + sources.addOptionalCatalogs(trimmedPath); + } } - return ds; + } + + return sources.catalog(); + } + + @Override + public Optional apply(String name) { + name = name.contains(":") ? name : name + ":default"; + + TestDataView tdv = catalog.profile(name); + System.out.println("prebuffering dataset '" + name + "' (assumed performance oriented testing)"); + + CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); + if (statusFuture instanceof ProgressIndicator) { + ((ProgressIndicator) statusFuture).monitorProgress(1000); + } + + TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); + System.out.println("Loaded " + tdw.getName() + " from streaming source"); + return Optional.of(tdw); + } + + @Override + public String toString() { + String envCatalogs = System.getenv("VECTORDATA_CATALOGS"); + return "VectorDataDownloader{defaultCatalog=~/.config/vectordata/catalogs.yaml" + + (envCatalogs != null ? 
", additionalCatalogs=" + envCatalogs : "") + "}"; } + }; + + /** + * Creates a VectorDataDownloader with a specific catalog path. + * Use this when you need a custom catalog location programmatically. + * For most use cases, prefer using the VECTORDATA_CATALOGS environment variable instead. + * + * @param catalogPath path to the catalog YAML file (e.g., "~/.config/vectordata/catalogs.yaml") + * @return a DataSetSource that can load from the specified catalog + */ + public static DataSetSource createVectorDataDownloader(String catalogPath) { + Catalog catalog = new TestDataSources() + .configure() + .addOptionalCatalogs(catalogPath) + .catalog(); + + return name -> { + Optional dsentryOption = catalog.matchOne(name); + if (dsentryOption.isEmpty()) { + return Optional.empty(); + } + + DatasetEntry dsentry = dsentryOption.get(); + TestDataView tdv = dsentry.select().profile(name); + + System.out.println("prebuffering dataset (assumed performance oriented testing)"); + CompletableFuture statusFuture = tdv.getBaseVectors().orElseThrow().prebuffer(); + if (statusFuture instanceof ProgressIndicator) { + ((ProgressIndicator) statusFuture).monitorProgress(1000); + } + + TestDataViewWrapper tdw = new TestDataViewWrapper(tdv); + System.out.println("Loaded " + tdw.getName() + " from streaming source"); + return Optional.of(tdw); + }; + } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java new file mode 100644 index 000000000..b8e2042c8 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetSource.java @@ -0,0 +1,40 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.example.util; + +import java.util.Optional; +import java.util.function.Function; + +public interface DataSetSource extends Function> { + public DataSetSource DEFAULT = new DataSetLoader(DataSetLoader.HDF5Loader, DataSetLoader.FVecsDownloader, + DataSetLoader.vectorDataDownloader); + + public default DataSetSource and(DataSetSource... loaders) { + return new DataSetSource() { + @Override + public Optional apply(String name) { + for (var loader : loaders) { + var ds = loader.apply(name); + if (ds.isPresent()) { + return ds; + } + } + return Optional.empty(); + } + }; + } +} diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java new file mode 100644 index 000000000..0a4a62421 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DataSetUtils.java @@ -0,0 +1,112 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.example.util; + +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.VectorUtil; +import io.github.jbellis.jvector.vector.types.VectorFloat; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +public class DataSetUtils { + + /** + * Return a dataset containing the given vectors, scrubbed free from zero vectors and normalized to unit length. + * Note: This only scrubs and normalizes for dot product similarity. + */ + public static DataSet getScrubbedDataSet(String pathStr, + VectorSimilarityFunction vsf, + List> baseVectors, + List> queryVectors, + List> groundTruth) + { + // remove zero vectors and duplicates, noting that this will change the indexes of the ground truth answers + List> scrubbedBaseVectors; + List> scrubbedQueryVectors; + List> gtSet; + scrubbedBaseVectors = new ArrayList<>(baseVectors.size()); + scrubbedQueryVectors = new ArrayList<>(queryVectors.size()); + gtSet = new ArrayList<>(groundTruth.size()); + var uniqueVectors = new TreeSet>((a, b) -> { + assert a.length() == b.length(); + for (int i = 0; i < a.length(); i++) { + if (a.get(i) < b.get(i)) { + return -1; + } + if (a.get(i) > b.get(i)) { + return 1; + } + } + return 0; + }); + Map rawToScrubbed = new HashMap<>(); + { + int j = 0; + for (int i = 0; i < baseVectors.size(); i++) { + VectorFloat v = baseVectors.get(i); + var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; + if (valid && uniqueVectors.add(v)) { + scrubbedBaseVectors.add(v); + rawToScrubbed.put(i, j++); + } + } + } + // also remove zero query vectors and query vectors that are present in the base set + for (int i = 0; i < queryVectors.size(); i++) { + VectorFloat v = queryVectors.get(i); + var valid = (vsf == VectorSimilarityFunction.EUCLIDEAN) || Math.abs(normOf(v)) > 1e-5; + var dupe = uniqueVectors.contains(v); + if (valid && !dupe) { + scrubbedQueryVectors.add(v); + var gt = new ArrayList(); + for (int j : groundTruth.get(i)) { + gt.add(rawToScrubbed.get(j)); + } + gtSet.add(gt); + } + } + + // now that the zero vectors are removed, we can normalize if it looks like they aren't already + if (vsf == VectorSimilarityFunction.DOT_PRODUCT) { + if (Math.abs(normOf(baseVectors.get(0)) - 1.0) > 1e-5) { + normalizeAll(scrubbedBaseVectors); + normalizeAll(scrubbedQueryVectors); + } + } + + assert scrubbedQueryVectors.size() == gtSet.size(); + return new SimpleDataSet(pathStr, vsf, scrubbedBaseVectors, scrubbedQueryVectors, gtSet); + } + + private static void normalizeAll(Iterable> vectors) { + for (VectorFloat v : vectors) { + VectorUtil.l2normalize(v); + } + } + + private static float normOf(VectorFloat baseVector) { + float norm = 0; + for (int i = 0; i < baseVector.length(); i++) { + norm += baseVector.get(i) * baseVector.get(i); + } + return (float) Math.sqrt(norm); + } +} \ No newline at end of file diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java 
b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java index 8725a6f65..052388d3d 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/DownloadHelper.java @@ -36,6 +36,7 @@ import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.util.List; +import java.util.Optional; import java.util.Set; public class DownloadHelper { @@ -55,11 +56,11 @@ private static S3AsyncClientBuilder s3AsyncClientBuilder() { .credentialsProvider(AnonymousCredentialsProvider.create()); } - public static MultiFileDatasource maybeDownloadFvecs(String name) { + public static Optional maybeDownloadFvecs(String name) { String bucket = infraDatasets.contains(name) ? infraBucketName : bucketName; var mfd = MultiFileDatasource.byName.get(name); if (mfd == null) { - throw new IllegalArgumentException("Unknown dataset: " + name); + return Optional.empty(); } // TODO how to detect and recover from incomplete downloads? @@ -68,6 +69,7 @@ public static MultiFileDatasource maybeDownloadFvecs(String name) { Files.createDirectories(Paths.get(fvecDir).resolve(mfd.directory())); } catch (IOException e) { System.err.println("Failed to create directory: " + e.getMessage()); + return Optional.empty(); } try (S3AsyncClient s3Client = s3AsyncClientBuilder().build()) { @@ -104,11 +106,11 @@ public static MultiFileDatasource maybeDownloadFvecs(String name) { } tm.close(); } catch (Exception e) { - System.out.println("Error downloading data from S3: " + e.getMessage()); - System.exit(1); + System.err.println("Error downloading data from S3: " + e.getMessage()); + return Optional.empty(); } - return mfd; + return Optional.of(mfd); } public static void maybeDownloadHdf5(String datasetName) { diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java new file mode 100644 index 000000000..3b0447d44 --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/FloatVectorsWrapper.java @@ -0,0 +1,69 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.github.jbellis.jvector.example.util; + +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.ArrayVectorFloat; +import io.github.jbellis.jvector.vector.VectorizationProvider; +import io.github.jbellis.jvector.vector.types.VectorFloat; +import io.github.jbellis.jvector.vector.types.VectorTypeSupport; +import io.nosqlbench.vectordata.spec.datasets.types.FloatVectors; + +import java.util.function.Supplier; + +/// Wrapper that adapts a nosqlbench FloatVectors instance to implement RandomAccessVectorValues +public class FloatVectorsWrapper implements RandomAccessVectorValues { + private static final VectorTypeSupport vts = VectorizationProvider.getInstance().getVectorTypeSupport(); + + private final FloatVectors floatVectors; + private final int dimension; + + public FloatVectorsWrapper(FloatVectors floatVectors) { + this.floatVectors = floatVectors; + this.dimension = floatVectors.getVectorDimensions(); + } + + @Override + public int size() { + return floatVectors.getCount(); + } + + @Override + public int dimension() { + return floatVectors.getVectorDimensions(); + } + + @Override + public VectorFloat getVector(int nodeId) { + return vts.createFloatVector(floatVectors.get(nodeId)); + } + + @Override + public boolean isValueShared() { + return true; + } + + @Override + public RandomAccessVectorValues copy() { + return new FloatVectorsWrapper(floatVectors); + } + + @Override + public Supplier threadLocalSupplier() { + return () -> new FloatVectorsWrapper(floatVectors); + } +} \ No newline at end of file diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java index 7dfdccc07..baca10f5f 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/Hdf5Loader.java @@ -82,6 +82,6 @@ else if (filename.contains("-euclidean")) { } } - return DataSet.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets); + return DataSetUtils.getScrubbedDataSet(path.getFileName().toString(), similarityFunction, Arrays.asList(baseVectors), Arrays.asList(queryVectors), gtSets); } } diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java index 6f875e23c..8bba2bd88 100644 --- a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/MultiFileDatasource.java @@ -51,7 +51,7 @@ public DataSet load() throws IOException { var baseVectors = SiftLoader.readFvecs("fvec/" + basePath); var queryVectors = SiftLoader.readFvecs("fvec/" + queriesPath); var gtVectors = SiftLoader.readIvecs("fvec/" + groundTruthPath); - return DataSet.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors); + return DataSetUtils.getScrubbedDataSet(name, VectorSimilarityFunction.COSINE, baseVectors, queryVectors, gtVectors); } public static Map byName = new HashMap<>() {{ diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java new 
file mode 100644 index 000000000..0d3c752dd --- /dev/null +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/util/SimpleDataSet.java @@ -0,0 +1,104 @@ +/* + * Copyright DataStax, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.github.jbellis.jvector.example.util; + +import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues; +import io.github.jbellis.jvector.graph.RandomAccessVectorValues; +import io.github.jbellis.jvector.vector.VectorSimilarityFunction; +import io.github.jbellis.jvector.vector.types.VectorFloat; + +import java.util.List; + +public class SimpleDataSet implements DataSet { + private final String name; + private final VectorSimilarityFunction similarityFunction; + private final List> baseVectors; + private final List> queryVectors; + private final List> groundTruth; + private RandomAccessVectorValues baseRavv; + + public SimpleDataSet(String name, + VectorSimilarityFunction similarityFunction, + List> baseVectors, + List> queryVectors, + List> groundTruth) + { + if (baseVectors.isEmpty()) { + throw new IllegalArgumentException("Base vectors must not be empty"); + } + if (queryVectors.isEmpty()) { + throw new IllegalArgumentException("Query vectors must not be empty"); + } + if (groundTruth.isEmpty()) { + throw new IllegalArgumentException("Ground truth vectors must not be empty"); + } + + if (baseVectors.get(0).length() != queryVectors.get(0).length()) { + throw new IllegalArgumentException("Base and query vectors must have the same dimensionality"); + } + if (queryVectors.size() != groundTruth.size()) { + throw new IllegalArgumentException("Query and ground truth lists must be the same size"); + } + + this.name = name; + this.similarityFunction = similarityFunction; + this.baseVectors = baseVectors; + this.queryVectors = queryVectors; + this.groundTruth = groundTruth; + + System.out.format("%n%s: %d base and %d query vectors created, dimensions %d%n", + name, baseVectors.size(), queryVectors.size(), baseVectors.get(0).length()); + } + + @Override + public String getName() { + return name; + } + + @Override + public VectorSimilarityFunction getSimilarityFunction() { + return similarityFunction; + } + + @Override + public List> getBaseVectors() { + return baseVectors; + } + + @Override + public List> getQueryVectors() { + return queryVectors; + } + + @Override + public List> getGroundTruth() { + return groundTruth; + } + + @Override + public int getDimension() { + return baseVectors.get(0).length(); + } + + @Override + public RandomAccessVectorValues getBaseRavv() { + if (baseRavv == null) { + baseRavv = new ListRandomAccessVectorValues(baseVectors, getDimension()); + } + return baseRavv; + } +} \ No newline at end of file diff --git a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java index fe7d4d82d..ef1013e80 100644 --- 
a/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java +++ b/jvector-examples/src/main/java/io/github/jbellis/jvector/example/yaml/Compression.java @@ -42,7 +42,7 @@ public Function getCompressorParameters() { return ds -> { boolean centerData; if (strCenterData == null) { - centerData = ds.similarityFunction == VectorSimilarityFunction.EUCLIDEAN; + centerData = ds.getSimilarityFunction() == VectorSimilarityFunction.EUCLIDEAN; } else { centerData = strCenterData.equals("Yes");; } diff --git a/jvector-examples/src/main/resources/logback.xml b/jvector-examples/src/main/resources/logback.xml deleted file mode 100644 index 0a7d8846a..000000000 --- a/jvector-examples/src/main/resources/logback.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - true - - %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n - - - - - - - - - - - diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java index 6168d5dca..f71a2c64f 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/BenchmarkSummarizerTest.java @@ -15,9 +15,9 @@ */ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.example.BenchResult; +import io.github.jbellis.jvector.benchframe.BenchResult; import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; -import org.junit.Test; +import org.junit.jupiter.api.Test; import java.util.ArrayList; import java.util.HashMap; diff --git a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java index 3dbf7f403..163840193 100644 --- a/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java +++ b/jvector-examples/src/test/java/io/github/jbellis/jvector/example/util/SummarizerTest.java @@ -15,7 +15,7 @@ */ package io.github.jbellis.jvector.example.util; -import io.github.jbellis.jvector.example.BenchResult; +import io.github.jbellis.jvector.benchframe.BenchResult; import io.github.jbellis.jvector.example.util.BenchmarkSummarizer.SummaryStats; import java.util.ArrayList; diff --git a/jvector-examples/yaml-configs/datasets.yml b/jvector-examples/yaml-configs/datasets.yml index 3fd67217e..8146196cf 100644 --- a/jvector-examples/yaml-configs/datasets.yml +++ b/jvector-examples/yaml-configs/datasets.yml @@ -1,3 +1,7 @@ +streamable: + - cohere_msmarco + + neighborhood-watch-100k: - ada002-100k - cohere-english-v3-100k diff --git a/jvector-native/pom.xml b/jvector-native/pom.xml index daf84fe6a..130e19d48 100644 --- a/jvector-native/pom.xml +++ b/jvector-native/pom.xml @@ -49,6 +49,17 @@
+ + org.apache.maven.plugins + maven-javadoc-plugin + + + --add-modules=jdk.incubator.vector + + 22 + false + +
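
The DataSetSource/DataSetLoader plumbing introduced earlier in this patch replaces the old static `loadDataSet` entry point with a chain of sources tried in order. As a rough consumption sketch (not part of the patch; the wrapper class name and the example dataset name are assumptions for illustration):

```java
// Minimal sketch, assuming the DataSetSource / DataSetLoader API introduced in this
// change; the wrapper class name and the example dataset name are illustrative only.
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DataSetSource;

public class DataSetResolutionSketch {
    public static void main(String[] args) {
        // Additional catalogs for the vectordata loader can be supplied via the
        // environment, e.g.
        //   VECTORDATA_CATALOGS=~/.config/custom1/catalogs.yaml,~/.config/custom2/catalogs.yaml
        String name = args.length > 0 ? args[0] : "glove-100-angular.hdf5";

        // DEFAULT tries the HDF5 loader, then the fvecs downloader, then the
        // vectordata catalog loader, and returns the first dataset that resolves.
        DataSet ds = DataSetSource.DEFAULT.apply(name)
                .orElseThrow(() -> new IllegalArgumentException("No loader could resolve dataset: " + name));

        System.out.printf("%s: %d base vectors, %d queries, dimension %d%n",
                ds.getName(), ds.getBaseVectors().size(), ds.getQueryVectors().size(), ds.getDimension());
    }
}
```

Names that do not end in `.hdf5` and are unknown to the fvecs downloader fall through to the vectordata catalog, which is where the VECTORDATA_CATALOGS environment variable described in the DataSetLoader Javadoc comes into play.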
diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java index 9db25e642..df699680a 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/microbench/GraphBuildBench.java @@ -45,7 +45,7 @@ public static class Parameters { public Parameters() { this.ds = Hdf5Loader.load("hdf5/glove-100-angular.hdf5"); - this.ravv = new ListRandomAccessVectorValues(ds.baseVectors, ds.baseVectors.get(0).length()); + this.ravv = new ListRandomAccessVectorValues(ds.getBaseVectors(), ds.getBaseVectors().get(0).length()); } } @@ -54,7 +54,7 @@ public Parameters() { @OutputTimeUnit(TimeUnit.SECONDS) public void testGraphBuild(Blackhole bh, Parameters p) { long start = System.nanoTime(); - GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.similarityFunction, 8, 60, 1.2f, 1.4f, false); + GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.getSimilarityFunction(), 8, 60, 1.2f, 1.4f, false); graphIndexBuilder.build(p.ravv); System.out.format("Build M=%d ef=%d in %.2fs%n", 32, 600, (System.nanoTime() - start) / 1_000_000_000.0); @@ -65,7 +65,7 @@ public void testGraphBuild(Blackhole bh, Parameters p) { @OutputTimeUnit(TimeUnit.SECONDS) public void testGraphBuildWithHierarchy(Blackhole bh, Parameters p) { long start = System.nanoTime(); - GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.similarityFunction, 8, 60, 1.2f, 1.4f, true); + GraphIndexBuilder graphIndexBuilder = new GraphIndexBuilder(p.ravv, p.ds.getSimilarityFunction(), 8, 60, 1.2f, 1.4f, true); graphIndexBuilder.build(p.ravv); System.out.format("Build M=%d ef=%d in %.2fs%n", 32, 600, (System.nanoTime() - start) / 1_000_000_000.0); diff --git a/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java b/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java index db35b52d1..ab4087f8f 100644 --- a/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java +++ b/jvector-tests/src/test/java/io/github/jbellis/jvector/quantization/TestProductQuantization.java @@ -352,7 +352,7 @@ public void testPQLayoutEdgeCases() { int[][] testCases = { // Minimal cases {1, 1}, {1, 2}, - + // Power-of-2 boundaries for compressedDimension (layoutBytesPerVector changes) {10, 1}, {10, 2}, {10, 3}, {10, 4}, {10, 5}, {10, 7}, {10, 8}, {10, 9}, @@ -360,20 +360,20 @@ public void testPQLayoutEdgeCases() { {10, 31}, {10, 32}, {10, 33}, {10, 63}, {10, 64}, {10, 65}, {10, 127}, {10, 128}, {10, 129}, - + // Cases where addressableVectorsPerChunk becomes interesting {1073741823, 1}, // layoutBytesPerVector=2, addressableVectorsPerChunk=1073741823 - {1073741823, 2}, // layoutBytesPerVector=4, addressableVectorsPerChunk=536870911 + {1073741823, 2}, // layoutBytesPerVector=4, addressableVectorsPerChunk=536870911 {1073741824, 2}, // vectorCount > addressableVectorsPerChunk, creates chunks - + // Large dimension cases (small addressableVectorsPerChunk) {1000, 1024}, // layoutBytesPerVector=2048, addressableVectorsPerChunk=1048575 {2000000, 1024}, // vectorCount > addressableVectorsPerChunk - + // Integer overflow boundary cases {536870911, 4}, // layoutBytesPerVector=8, exactly fits in one chunk {536870912, 4}, // one more than above, creates multiple chunks - + // Edge case where 
lastChunkVectors becomes non-zero {100, 1073741824} // layoutBytesPerVector huge, addressableVectorsPerChunk=1, creates 100 chunks }; diff --git a/jvector-twenty/pom.xml b/jvector-twenty/pom.xml index ae6aa659b..53f81ecb9 100644 --- a/jvector-twenty/pom.xml +++ b/jvector-twenty/pom.xml @@ -39,6 +39,17 @@ + + org.apache.maven.plugins + maven-javadoc-plugin + + + --add-modules=jdk.incubator.vector + + 22 + false + + diff --git a/pom.xml b/pom.xml index ec2d326c9..9b294e798 100644 --- a/pom.xml +++ b/pom.xml @@ -198,10 +198,17 @@ + + org.junit.jupiter + junit-jupiter-api + 5.9.1 + test + org.junit.jupiter junit-jupiter-engine 5.9.1 + test com.carrotsearch.randomizedtesting diff --git a/rat-excludes.txt b/rat-excludes.txt index e9dd9fb37..b858fc86e 100644 --- a/rat-excludes.txt +++ b/rat-excludes.txt @@ -25,4 +25,5 @@ results.csv scripts/test_node_setup.sh scripts/jmh_results_formatter.py yaml-configs/*.yml -src/main/resources/logback.xml \ No newline at end of file +local/* +*/target/* \ No newline at end of file diff --git a/testrig b/testrig new file mode 100755 index 000000000..61a4b524d --- /dev/null +++ b/testrig @@ -0,0 +1,38 @@ +#!/bin/bash + +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +export JAVA_OPTS="--add-modules jdk.incubator.vector" +if [ ! -d "jvector-examples/target/classes" -o "$1" = "-r" ] +then + shift + printf "building project before invoking testrig...\n" 1>&2 + mvn -Pjdk22 compile package -pl :jvector-examples -am -DskipTests + status=$? + if (( status != 0 )) + then + printf "failed build, resolve issues before trying to run testrig\n" + exit $status + else + printf "built project successfully, continuing...\n" 1>&2 + echo -e "\a" + sleep 1 + echo -e "\a" + sleep 1 + fi +fi + +java $JAVA_OPTS -Xmx32g -jar ./jvector-examples/target/jvector-examples-*-SNAPSHOT-jar-with-dependencies.jar $*
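
The loader chain is also the natural extension point left by this change. A hedged sketch of composing an additional source with the built-in ones follows; the `local:` naming scheme and the `loadFromLocalDirectory` helper are purely hypothetical, while DataSetLoader, DataSetSource, and the built-in sources come from this patch.

```java
// Sketch under assumptions: the "local:" naming scheme and loadFromLocalDirectory(...)
// helper are hypothetical; DataSetLoader, DataSetSource and the built-in sources come
// from this change.
import io.github.jbellis.jvector.example.util.DataSet;
import io.github.jbellis.jvector.example.util.DataSetLoader;
import io.github.jbellis.jvector.example.util.DataSetSource;

import java.util.Optional;

public class CustomSourceSketch {
    // A custom source consulted before the standard chain; it returns empty for any
    // name it does not recognize so the next loader gets a chance.
    static final DataSetSource localDirectorySource = name ->
            name.startsWith("local:")
                    ? loadFromLocalDirectory(name.substring("local:".length()))
                    : Optional.empty();

    // Hypothetical helper: would read base/query/ground-truth vectors from a local
    // directory and wrap them in a SimpleDataSet; stubbed out here.
    static Optional<DataSet> loadFromLocalDirectory(String dir) {
        return Optional.empty();
    }

    public static void main(String[] args) {
        DataSetSource source = new DataSetLoader(localDirectorySource,
                DataSetLoader.HDF5Loader,
                DataSetLoader.FVecsDownloader,
                DataSetLoader.vectorDataDownloader);

        source.apply(args.length > 0 ? args[0] : "local:my-vectors")
              .ifPresent(ds -> System.out.println("Loaded " + ds.getName()));
    }
}
```

Returning Optional.empty() rather than throwing lets each source decline a name quietly, which is what allows DataSetLoader to try its sources in order, the same behavior the patch gives FVecsDownloader when a dataset is unknown or a download fails.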