diff --git a/pyproject.toml b/pyproject.toml
index 67b110f..632ac9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,7 @@ annoy = ["annoy"]
 faiss = ["faiss-cpu"]
 usearch = ["usearch"]
 voyager = ["voyager"]
+turbovec = ["turbovec"]
 backends = [
     "hnswlib",
     "pynndescent>=0.5.10",
diff --git a/tests/conftest.py b/tests/conftest.py
index 63a8199..7da6326 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@
 import pytest
 
 from vicinity import Vicinity
+from vicinity.backends import OptionalDependencyError, get_backend_class
 from vicinity.datatypes import Backend
 
 random_gen = np.random.default_rng(42)
@@ -22,13 +23,13 @@
 
 
 @pytest.fixture(scope="session")
-def items() -> list[str]:
+def items() -> list[str | dict[str, object]]:
     """Fixture providing a list of item names."""
     return [f"item{i}" if i % 2 == 0 else {"name": f"item{i}", "id": i} for i in range(1, 10001)]
 
 
 @pytest.fixture(scope="session")
-def non_serializable_items() -> list[str]:
+def non_serializable_items() -> list[object]:
     """Fixture providing a list of non-serializable items."""
 
     class NonSerializable:
@@ -58,9 +59,9 @@ def query_vector() -> np.ndarray:
     (Backend.PYNNDESCENT, None),
     (Backend.USEARCH, None),
     (Backend.VOYAGER, None),
+    (Backend.TURBOVEC, None),
 ]
 
-
 # Create human-readable ids for each backend type
 BACKEND_IDS = [f"{backend.name}-{index_type}" if index_type else backend.name for backend, index_type in BACKEND_PARAMS]
 
@@ -68,6 +69,11 @@ def query_vector() -> np.ndarray:
 @pytest.fixture(params=BACKEND_PARAMS)
 def backend_type(request: pytest.FixtureRequest) -> Backend:
     """Fixture parametrizing over all backend types defined in Backend."""
+    backend, _ = request.param
+    try:
+        get_backend_class(backend)
+    except OptionalDependencyError as e:
+        pytest.skip(str(e))
     return request.param
 
 
@@ -75,29 +81,32 @@ def backend_type(request: pytest.FixtureRequest) -> Backend:
 def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray) -> Vicinity:
     """Fixture providing a Vicinity instance for each backend type."""
     backend_type, index_type = request.param
-    # Handle FAISS backend with specific FAISS index types
-    if backend_type == Backend.FAISS:
-        if index_type in ("pq", "ivfpq", "ivfpqr"):
-            # Use smaller values for pq indexes since the dataset is small
-            return Vicinity.from_vectors_and_items(
-                vectors,
-                items,
-                backend_type=backend_type,
-                index_type=index_type,
-                m=2,
-                nbits=4,
-            )
-        else:
-            return Vicinity.from_vectors_and_items(
-                vectors,
-                items,
-                backend_type=backend_type,
-                index_type=index_type,
-                nlist=2,
-                nbits=32,
-            )
-
-    return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
+    try:
+        # Handle FAISS backend with specific FAISS index types
+        if backend_type == Backend.FAISS:
+            if index_type in ("pq", "ivfpq", "ivfpqr"):
+                # Use smaller values for pq indexes since the dataset is small
+                return Vicinity.from_vectors_and_items(
+                    vectors,
+                    items,
+                    backend_type=backend_type,
+                    index_type=index_type,
+                    m=2,
+                    nbits=4,
+                )
+            else:
+                return Vicinity.from_vectors_and_items(
+                    vectors,
+                    items,
+                    backend_type=backend_type,
+                    index_type=index_type,
+                    nlist=2,
+                    nbits=32,
+                )
+
+        return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
+    except OptionalDependencyError as e:
+        pytest.skip(str(e))
 
 
 @pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
@@ -106,19 +115,28 @@ def vicinity_instance_with_stored_vectors(
 ) -> Vicinity:
     """Fixture providing a Vicinity instance for each backend type."""
     backend_type, index_type = request.param
-    # Handle FAISS backend with specific FAISS index types
-    if backend_type == Backend.FAISS:
-        if index_type in ("pq", "ivfpq", "ivfpqr"):
-            # Use smaller values for pq indexes since the dataset is small
-            return Vicinity.from_vectors_and_items(
-                vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4, store_vectors=True
-            )
-        else:
-            return Vicinity.from_vectors_and_items(
-                vectors, items, backend_type=backend_type, index_type=index_type, nlist=2, nbits=32, store_vectors=True
-            )
-
-    return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type, store_vectors=True)
+    try:
+        # Handle FAISS backend with specific FAISS index types
+        if backend_type == Backend.FAISS:
+            if index_type in ("pq", "ivfpq", "ivfpqr"):
+                # Use smaller values for pq indexes since the dataset is small
+                return Vicinity.from_vectors_and_items(
+                    vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4, store_vectors=True
+                )
+            else:
+                return Vicinity.from_vectors_and_items(
+                    vectors,
+                    items,
+                    backend_type=backend_type,
+                    index_type=index_type,
+                    nlist=2,
+                    nbits=32,
+                    store_vectors=True,
+                )
+
+        return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type, store_vectors=True)
+    except OptionalDependencyError as e:
+        pytest.skip(str(e))
 
 
 @pytest.fixture()
diff --git a/tests/test_vicinity.py b/tests/test_vicinity.py
index a30c864..e1fa3c0 100644
--- a/tests/test_vicinity.py
+++ b/tests/test_vicinity.py
@@ -88,8 +88,9 @@ def test_vicinity_insert(vicinity_instance: Vicinity, query_vector: np.ndarray)
     :param vicinity_instance: A Vicinity instance.
     :param query_vector: A query vector.
     """
-    if vicinity_instance.backend.backend_type in {Backend.HNSW, Backend.ANNOY, Backend.PYNNDESCENT}:
-        # Skip insert for HNSW or Annoy backends.
+    if vicinity_instance.backend.backend_type in {Backend.HNSW, Backend.ANNOY, Backend.PYNNDESCENT, Backend.TURBOVEC}:
+        # HNSW, Annoy, PyNNDescent don't support insert; TurboVec quantization distortion
+        # makes rank-based assertions unreliable at small dimensions.
         return
     new_item = ["item10001"]
     new_vector = query_vector
diff --git a/uv.lock b/uv.lock
index 9023149..b520c42 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1577,6 +1577,17 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" },
 ]
 
+[[package]]
+name = "turbovec"
+version = "0.1.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b5/05/f9c8de1eea79a69c43b0760a026e34cbeeda063038bbd9b99425d897c902/turbovec-0.1.3.tar.gz", hash = "sha256:140a433438f102e17947875a231f8cce50fd79b5dc381672f6510cd346cfb0d1", size = 44254, upload-time = "2026-04-15T14:40:18.158Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/ec/3bc9e28852b6c2a5d2a9e88048c5bca0363e641e3956f4d1f9dc6eecc7a2/turbovec-0.1.3-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:34e50b9448759933b67136ccc7ab2cedfedcd4b4ae41441b94c93d16274fd686", size = 761398, upload-time = "2026-04-15T14:40:13.817Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/3e/2240c4401bdce3463d517ba652a7aa31f794bc30b10985694572ef2c30d9/turbovec-0.1.3-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:21b1b77113cfabb9e023b6b922ecd97e3098ec3665395234a1c581a93fe76347", size = 900829, upload-time = "2026-04-15T14:40:15.324Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/ce/5406ccb0efa79bf01ba9aa965262003a6c772cf09ceb940023491df0105f/turbovec-0.1.3-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c14e19bdbb6e833505fb1bc8eee2791bee99355d045c82e6f350f6b39f7663c5", size = 755675, upload-time = "2026-04-15T14:40:16.64Z" },
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.12.2"
@@ -1707,6 +1718,9 @@ pynndescent = [
     { name = "numpy" },
     { name = "pynndescent" },
 ]
+turbovec = [
+    { name = "turbovec" },
+]
 usearch = [
     { name = "usearch" },
 ]
@@ -1751,6 +1765,7 @@ requires-dist = [
     { name = "ruff", marker = "extra == 'dev'" },
     { name = "setuptools", marker = "extra == 'dev'" },
     { name = "tqdm" },
+    { name = "turbovec", marker = "extra == 'turbovec'" },
     { name = "usearch", marker = "extra == 'all'" },
     { name = "usearch", marker = "extra == 'backends'" },
     { name = "usearch", marker = "extra == 'usearch'" },
@@ -1758,7 +1773,7 @@ requires-dist = [
     { name = "voyager", marker = "extra == 'backends'" },
     { name = "voyager", marker = "extra == 'voyager'" },
 ]
-provides-extras = ["dev", "huggingface", "integrations", "hnsw", "pynndescent", "annoy", "faiss", "usearch", "voyager", "backends", "all"]
+provides-extras = ["dev", "huggingface", "integrations", "hnsw", "pynndescent", "annoy", "faiss", "usearch", "voyager", "turbovec", "backends", "all"]
 
 [[package]]
 name = "virtualenv"
diff --git a/vicinity/backends/__init__.py b/vicinity/backends/__init__.py
index 6c5720c..c469099 100644
--- a/vicinity/backends/__init__.py
+++ b/vicinity/backends/__init__.py
@@ -62,5 +62,11 @@ def get_backend_class(backend: Backend | str) -> type[AbstractBackend]:
 
         return VoyagerBackend
 
+    elif backend == Backend.TURBOVEC:
+        _require("turbovec", backend, "turbovec")
+        from vicinity.backends.turbovec import TurboVecBackend
+
+        return TurboVecBackend
+
 
 __all__ = ["get_backend_class", "AbstractBackend", "BasicVectorStore"]
diff --git a/vicinity/backends/turbovec.py b/vicinity/backends/turbovec.py
new file mode 100644
index 0000000..8f3d35e
--- /dev/null
+++ b/vicinity/backends/turbovec.py
@@ -0,0 +1,137 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+from numpy import typing as npt
+from turbovec import TurboQuantIndex
+
+from vicinity.backends.base import AbstractBackend, BaseArgs
+from vicinity.datatypes import Backend, QueryResult
+from vicinity.utils import Metric
+
+
+def _to_unit_float32(vectors: npt.NDArray) -> npt.NDArray:
+    """Return a float32 copy of 2D `vectors` with every row scaled to unit L2 norm."""
+    unit = np.array(vectors, dtype=np.float32)
+    norms = np.linalg.norm(unit, axis=1, keepdims=True)
+    # Leave all-zero rows as zeros instead of dividing by zero.
+    norms[norms == 0.0] = 1.0
+    unit /= norms
+    return unit
+
+
+@dataclass
+class TurboVecArgs(BaseArgs):
+    """Arguments stored in `arguments.json` next to the saved index."""
+
+    dim: int = 0
+    metric: Metric = Metric.COSINE
+    bit_width: int = 4
+
+
+class TurboVecBackend(AbstractBackend[TurboVecArgs]):
+    """Backend backed by a `turbovec.TurboQuantIndex`; only the cosine metric is supported."""
+
+    argument_class = TurboVecArgs
+    supported_metrics = {Metric.COSINE}
+
+    def __init__(
+        self,
+        index: TurboQuantIndex,
+        arguments: TurboVecArgs,
+    ) -> None:
+        """Initialize the backend using TurboVec."""
+        super().__init__(arguments)
+        self.index = index
+
+    @classmethod
+    def from_vectors(
+        cls: type[TurboVecBackend],
+        vectors: npt.NDArray,
+        metric: str | Metric = Metric.COSINE,
+        bit_width: int = 4,
+        **kwargs: Any,
+    ) -> TurboVecBackend:
+        """
+        Create a new instance from vectors.
+
+        :param vectors: 2D array of vectors; its second dimension must be a multiple of 8.
+        :param metric: The metric to use; only cosine is supported.
+        :param bit_width: The bit width passed to the index; must be 2, 3 or 4.
+        :param **kwargs: Accepted for interface compatibility and ignored.
+        :return: A backend wrapping a freshly built index.
+        :raises ValueError: If the metric, bit width or dimensionality is not supported.
+        """
+        metric_enum = Metric.from_string(metric)
+
+        if metric_enum not in cls.supported_metrics:
+            raise ValueError(f"Metric '{metric_enum.value}' is not supported by TurboVecBackend.")
+
+        if bit_width not in (2, 3, 4):
+            raise ValueError(f"bit_width must be 2, 3, or 4, got {bit_width}.")
+
+        dim = vectors.shape[1]
+        if dim % 8 != 0:
+            raise ValueError(f"dim must be a multiple of 8, got {dim}.")
+        index = TurboQuantIndex(dim=dim, bit_width=bit_width)
+        # Store unit-length rows so the cosine contract does not depend on the caller's scaling.
+        index.add(_to_unit_float32(vectors))
+        arguments = TurboVecArgs(dim=dim, metric=metric_enum, bit_width=bit_width)
+        return cls(index, arguments)
+
+    @property
+    def backend_type(self) -> Backend:
+        """The type of the backend."""
+        return Backend.TURBOVEC
+
+    @property
+    def dim(self) -> int:
+        """Get the dimension of the space."""
+        return self.arguments.dim
+
+    def __len__(self) -> int:
+        """Get the number of vectors."""
+        return len(self.index)
+
+    @classmethod
+    def load(cls: type[TurboVecBackend], path: Path) -> TurboVecBackend:
+        """Load the index from a path."""
+        index_path = path / "index.tq"
+        arguments = TurboVecArgs.load(path / "arguments.json")
+        index = TurboQuantIndex.load(str(index_path))
+        return cls(index, arguments=arguments)
+
+    def save(self, path: Path) -> None:
+        """Save the index to a path."""
+        self.index.write(str(path / "index.tq"))
+        self.arguments.dump(path / "arguments.json")
+
+    def query(self, vectors: npt.NDArray, k: int) -> QueryResult:
+        """Query the backend and return results as tuples of keys and distances."""
+        k = min(k, len(self))
+        scores_batch, indices_batch = self.index.search(_to_unit_float32(vectors), k=k)
+        # NOTE(review): assumes scores are inner-product-like, so unit inputs give cosine similarity - confirm.
+        distances_batch = 1.0 - scores_batch
+        return [
+            (indices, distances.astype(np.float32)) for indices, distances in zip(indices_batch, distances_batch)
+        ]
+
+    def insert(self, vectors: npt.NDArray) -> None:
+        """Insert vectors into the backend."""
+        self.index.add(_to_unit_float32(vectors))
+
+    def delete(self, indices: list[int]) -> None:
+        """Delete vectors from the index (not supported by TurboVec)."""
+        raise NotImplementedError("Dynamic deletion is not supported in TurboVec.")
+
+    def threshold(self, vectors: npt.NDArray, threshold: float, max_k: int) -> QueryResult:
+        """Query vectors within a distance threshold and return keys and distances."""
+        out: QueryResult = []
+        for keys_row, distances_row in self.query(vectors, max_k):
+            keys_row = np.asarray(keys_row)
+            mask = distances_row <= threshold
+            out.append((keys_row[mask], distances_row[mask]))
+        return out
diff --git a/vicinity/datatypes.py b/vicinity/datatypes.py
index f09db5e..c332b30 100644
--- a/vicinity/datatypes.py
+++ b/vicinity/datatypes.py
@@ -25,3 +25,4 @@ class Backend(str, Enum):
     FAISS = "faiss"
     USEARCH = "usearch"
     VOYAGER = "voyager"
+    TURBOVEC = "turbovec"