Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ annoy = ["annoy"]
faiss = ["faiss-cpu"]
usearch = ["usearch"]
voyager = ["voyager"]
turbovec = ["turbovec"]
backends = [
"hnswlib",
"pynndescent>=0.5.10",
Expand Down
98 changes: 59 additions & 39 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

from vicinity import Vicinity
from vicinity.backends import OptionalDependencyError
from vicinity.datatypes import Backend

random_gen = np.random.default_rng(42)
Expand All @@ -22,13 +23,13 @@


@pytest.fixture(scope="session")
def items() -> list[str]:
def items() -> list[str | dict[str, object]]:
"""Fixture providing a list of item names."""
return [f"item{i}" if i % 2 == 0 else {"name": f"item{i}", "id": i} for i in range(1, 10001)]


@pytest.fixture(scope="session")
def non_serializable_items() -> list[str]:
def non_serializable_items() -> list[object]:
"""Fixture providing a list of non-serializable items."""

class NonSerializable:
Expand Down Expand Up @@ -58,46 +59,56 @@ def query_vector() -> np.ndarray:
(Backend.PYNNDESCENT, None),
(Backend.USEARCH, None),
(Backend.VOYAGER, None),
(Backend.TURBOVEC, None),
]


# Create human-readable ids for each backend type
BACKEND_IDS = [f"{backend.name}-{index_type}" if index_type else backend.name for backend, index_type in BACKEND_PARAMS]


@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
def backend_type(request: pytest.FixtureRequest) -> tuple[Backend, str | None]:
    """
    Fixture parametrizing over every ``(backend, index_type)`` pair in BACKEND_PARAMS.

    The requesting test is skipped when looking up the backend class raises
    ``OptionalDependencyError``.

    :param request: The pytest fixture request carrying the current parameter.
    :return: The unchanged ``(backend, index_type)`` parameter tuple.
    """
    # Function-scope import, kept outside the ``try`` so only the backend lookup is guarded.
    from vicinity.backends import get_backend_class

    backend, _ = request.param
    try:
        # Only the lookup is of interest here; the returned class is discarded.
        get_backend_class(backend)
    except OptionalDependencyError as e:
        pytest.skip(str(e))
    # The whole parameter tuple is handed to the test, not just the backend member.
    return request.param


@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray) -> Vicinity:
"""Fixture providing a Vicinity instance for each backend type."""
backend_type, index_type = request.param
# Handle FAISS backend with specific FAISS index types
if backend_type == Backend.FAISS:
if index_type in ("pq", "ivfpq", "ivfpqr"):
# Use smaller values for pq indexes since the dataset is small
return Vicinity.from_vectors_and_items(
vectors,
items,
backend_type=backend_type,
index_type=index_type,
m=2,
nbits=4,
)
else:
return Vicinity.from_vectors_and_items(
vectors,
items,
backend_type=backend_type,
index_type=index_type,
nlist=2,
nbits=32,
)

return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
try:
# Handle FAISS backend with specific FAISS index types
if backend_type == Backend.FAISS:
if index_type in ("pq", "ivfpq", "ivfpqr"):
# Use smaller values for pq indexes since the dataset is small
return Vicinity.from_vectors_and_items(
vectors,
items,
backend_type=backend_type,
index_type=index_type,
m=2,
nbits=4,
)
else:
return Vicinity.from_vectors_and_items(
vectors,
items,
backend_type=backend_type,
index_type=index_type,
nlist=2,
nbits=32,
)

return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
except OptionalDependencyError as e:
pytest.skip(str(e))


@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
Expand All @@ -106,19 +117,28 @@ def vicinity_instance_with_stored_vectors(
) -> Vicinity:
"""Fixture providing a Vicinity instance for each backend type."""
backend_type, index_type = request.param
# Handle FAISS backend with specific FAISS index types
if backend_type == Backend.FAISS:
if index_type in ("pq", "ivfpq", "ivfpqr"):
# Use smaller values for pq indexes since the dataset is small
return Vicinity.from_vectors_and_items(
vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4, store_vectors=True
)
else:
return Vicinity.from_vectors_and_items(
vectors, items, backend_type=backend_type, index_type=index_type, nlist=2, nbits=32, store_vectors=True
)

return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type, store_vectors=True)
try:
# Handle FAISS backend with specific FAISS index types
if backend_type == Backend.FAISS:
if index_type in ("pq", "ivfpq", "ivfpqr"):
# Use smaller values for pq indexes since the dataset is small
return Vicinity.from_vectors_and_items(
vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4, store_vectors=True
)
else:
return Vicinity.from_vectors_and_items(
vectors,
items,
backend_type=backend_type,
index_type=index_type,
nlist=2,
nbits=32,
store_vectors=True,
)

return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type, store_vectors=True)
except OptionalDependencyError as e:
pytest.skip(str(e))


@pytest.fixture()
Expand Down
5 changes: 3 additions & 2 deletions tests/test_vicinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,9 @@ def test_vicinity_insert(vicinity_instance: Vicinity, query_vector: np.ndarray)
:param vicinity_instance: A Vicinity instance.
:param query_vector: A query vector.
"""
if vicinity_instance.backend.backend_type in {Backend.HNSW, Backend.ANNOY, Backend.PYNNDESCENT}:
# Skip insert for HNSW or Annoy backends.
if vicinity_instance.backend.backend_type in {Backend.HNSW, Backend.ANNOY, Backend.PYNNDESCENT, Backend.TURBOVEC}:
# HNSW, Annoy, PyNNDescent don't support insert; TurboVec quantization distortion
# makes rank-based assertions unreliable at small dimensions.
return
new_item = ["item10001"]
new_vector = query_vector
Expand Down
17 changes: 16 additions & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions vicinity/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,5 +62,11 @@ def get_backend_class(backend: Backend | str) -> type[AbstractBackend]:

return VoyagerBackend

elif backend == Backend.TURBOVEC:
_require("turbovec", backend, "turbovec")
from vicinity.backends.turbovec import TurboVecBackend

return TurboVecBackend


__all__ = ["get_backend_class", "AbstractBackend", "BasicVectorStore"]
112 changes: 112 additions & 0 deletions vicinity/backends/turbovec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import numpy as np
from numpy import typing as npt
from turbovec import TurboQuantIndex

from vicinity.backends.base import AbstractBackend, BaseArgs
from vicinity.datatypes import Backend, QueryResult
from vicinity.utils import Metric


@dataclass
class TurboVecArgs(BaseArgs):
    """Construction arguments recorded for a TurboVec-backed index."""

    # Number of components per vector; 0 is only the placeholder default.
    dim: int = 0
    # Metric the index was built with.
    metric: Metric = Metric.COSINE
    # Bit width passed to the TurboVec index when it is created.
    bit_width: int = 4


class TurboVecBackend(AbstractBackend[TurboVecArgs]):
    """Vector-store backend that delegates storage and search to a ``TurboQuantIndex``."""

    argument_class = TurboVecArgs
    supported_metrics = {Metric.COSINE}

    def __init__(
        self,
        index: TurboQuantIndex,
        arguments: TurboVecArgs,
    ) -> None:
        """
        Wrap an existing TurboVec index.

        :param index: The TurboVec index holding the vectors.
        :param arguments: The arguments the index was created with.
        """
        super().__init__(arguments)
        self.index = index

    @classmethod
    def from_vectors(
        cls: type[TurboVecBackend],
        vectors: npt.NDArray,
        metric: str | Metric = Metric.COSINE,
        bit_width: int = 4,
        **kwargs: Any,
    ) -> TurboVecBackend:
        """
        Build a backend from a matrix of vectors.

        :param vectors: The vectors to index; ``vectors.shape[1]`` is used as the dimension.
        :param metric: The metric to use; must be one of ``supported_metrics``.
        :param bit_width: The bit width handed to the index; must be 2, 3 or 4.
        :param **kwargs: Accepted but not used by this backend.
        :return: A backend wrapping a freshly populated index.
        :raises ValueError: If the metric is unsupported, the bit width is not 2, 3 or 4,
            or the dimension is not a multiple of 8.
        """
        metric_enum = Metric.from_string(metric)

        # Validation order is metric, then bit width, then dimension.
        if metric_enum not in cls.supported_metrics:
            raise ValueError(f"Metric '{metric_enum.value}' is not supported by TurboVecBackend.")

        if bit_width not in (2, 3, 4):
            raise ValueError(f"bit_width must be 2, 3, or 4, got {bit_width}.")

        dim = vectors.shape[1]
        if dim % 8:
            raise ValueError(f"dim must be a multiple of 8, got {dim}.")

        quant_index = TurboQuantIndex(dim=dim, bit_width=bit_width)
        # Vectors are handed to the index as float32.
        quant_index.add(vectors.astype(np.float32))
        return cls(
            quant_index,
            TurboVecArgs(dim=dim, metric=metric_enum, bit_width=bit_width),
        )

    @property
    def backend_type(self) -> Backend:
        """The backend identifier, always ``Backend.TURBOVEC``."""
        return Backend.TURBOVEC

    @property
    def dim(self) -> int:
        """The vector dimension recorded in the arguments."""
        return self.arguments.dim

    def __len__(self) -> int:
        """Return the length reported by the underlying index."""
        return len(self.index)

    @classmethod
    def load(cls: type[TurboVecBackend], path: Path) -> TurboVecBackend:
        """
        Restore a backend from a directory.

        :param path: Directory containing ``index.tq`` and ``arguments.json``.
        :return: The restored backend.
        """
        index_file = path / "index.tq"
        # The arguments are read before the index file is opened.
        stored_arguments = TurboVecArgs.load(path / "arguments.json")
        quant_index = TurboQuantIndex.load(str(index_file))
        return cls(quant_index, arguments=stored_arguments)

    def save(self, path: Path) -> None:
        """
        Persist the index and its arguments.

        :param path: Directory that receives ``index.tq`` and ``arguments.json``.
        """
        index_file = path / "index.tq"
        self.index.write(str(index_file))
        self.arguments.dump(path / "arguments.json")

    def query(self, vectors: npt.NDArray, k: int) -> QueryResult:
        """
        Search the index for the nearest neighbours of each query vector.

        :param vectors: The query vectors, one per row.
        :param k: The number of neighbours requested; capped at the index size.
        :return: One ``(indices, distances)`` tuple per query vector.
        """
        # Never request more neighbours than the index holds.
        limit = min(k, len(self))
        queries = vectors.astype(np.float32)
        scores_batch, indices_batch = self.index.search(queries, k=limit)
        # Per the original author's note, the scores are cosine similarities;
        # subtracting from 1 turns them into cosine distances.
        distances_batch = 1.0 - scores_batch

        results: QueryResult = []
        for row in range(len(vectors)):
            results.append((indices_batch[row], distances_batch[row].astype(np.float32)))
        return results

    def insert(self, vectors: npt.NDArray) -> None:
        """
        Add vectors to the index.

        :param vectors: The vectors to add; converted to float32 first.
        """
        as_float32 = vectors.astype(np.float32)
        self.index.add(as_float32)

    def delete(self, indices: list[int]) -> None:
        """
        Reject deletion; this backend does not implement it.

        :param indices: Ignored.
        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("Dynamic deletion is not supported in TurboVec.")

    def threshold(self, vectors: npt.NDArray, threshold: float, max_k: int) -> QueryResult:
        """
        Return, per query vector, the neighbours whose distance is at most ``threshold``.

        :param vectors: The query vectors, one per row.
        :param threshold: The inclusive distance cut-off.
        :param max_k: The number of candidates fetched per query before filtering.
        :return: One ``(indices, distances)`` tuple per query vector, filtered by the cut-off.
        """
        filtered: QueryResult = []
        for candidate_keys, candidate_distances in self.query(vectors, max_k):
            keys = np.array(candidate_keys)
            distances = np.array(candidate_distances, dtype=np.float32)
            # Boolean mask selecting the candidates within the cut-off.
            within = distances <= threshold
            filtered.append((keys[within], distances[within]))
        return filtered
1 change: 1 addition & 0 deletions vicinity/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ class Backend(str, Enum):
FAISS = "faiss"
USEARCH = "usearch"
VOYAGER = "voyager"
TURBOVEC = "turbovec"
Loading