diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 1f753a63..d289189e 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -6,7 +6,7 @@ permissions: contents: read jobs: build-linux-x86_64-extension: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - run: ./scripts/vendor.sh @@ -55,13 +55,12 @@ jobs: name: sqlite-vec-windows-x86_64-extension path: dist/* build-linux-aarch64-extension: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04-arm steps: - uses: actions/checkout@v4 - - run: sudo apt-get install gcc-aarch64-linux-gnu - run: ./scripts/vendor.sh - run: make sqlite-vec.h - - run: make CC=aarch64-linux-gnu-gcc loadable static + - run: make loadable static - uses: actions/upload-artifact@v4 with: name: sqlite-vec-linux-aarch64-extension diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index e34661b9..381aa3df 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -7,7 +7,7 @@ permissions: contents: read jobs: build-linux-x86_64-extension: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v3 diff --git a/.gitignore b/.gitignore index 0268d5d3..bc666d39 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /target +/Cargo.lock .vscode sift/ *.tar.gz @@ -7,12 +8,20 @@ sift/ *.bin *.out venv/ +.venv vendor/ dist/ *.pyc *.db-journal +build/ +*.egg-info/ +sqlite_vec.py +sqlite_vec/ +vec0.so +vec0.dylib +vec0.dll alexandria/ openai/ @@ -22,7 +31,6 @@ examples/dbpedia-openai examples/imdb examples/sotu -sqlite-vec.h tmp/ poetry.lock diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 4f62e16a..b9073942 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -120,3 +120,30 @@ The third character of the block is the constraint operator. It will be one of metadata column KNN filters. The foruth character of the block is a `_` filler. 
+ +#### `VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT` (`'*'`) + +`argv[i]` is a constraint on the `distance` column in a KNN query. + +This enables filtering KNN results by distance thresholds, useful for: +- Cursor-based pagination: `WHERE embedding MATCH ? AND k = 10 AND distance > 0.21` +- Range queries: `WHERE embedding MATCH ? AND k = 100 AND distance BETWEEN 0.5 AND 1.0` + +The second character of the block denotes the constraint operator. It will be one of +the values of `enum vec0_distance_constraint_operator`: + +| Operator | Value | Description | SQL Example | +| -------- | ----- | ------------------------ | -------------------- | +| `GT` | `'a'` | Greater than | `distance > 0.5` | +| `GE` | `'b'` | Greater than or equal to | `distance >= 0.5` | +| `LT` | `'c'` | Less than | `distance < 1.0` | +| `LE` | `'d'` | Less than or equal to | `distance <= 1.0` | + +The third and fourth characters of the block are `_` fillers. + +**Note on precision:** Distance values are cast from f64 to f32 for comparison, which may +result in precision loss for very small distance differences. + +**Note on pagination:** When multiple vectors have identical distances, pagination using +`distance > X` may skip some results. For stable pagination, combine distance with rowid: +`WHERE (distance > 0.5) OR (distance = 0.5 AND rowid > 123)` diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..afff321e --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,162 @@ +# Changelog + +All notable changes to this community fork will be documented in this file. 
+ +## [0.2.4-alpha] - 2026-01-03 + +### Added + +- **Lua binding with IEEE 754 compliant float serialization** ([#237](https://github.com/asg017/sqlite-vec/pull/237)) + - `bindings/lua/sqlite_vec.lua` provides `load()`, `serialize_f32()`, and `serialize_json()` functions + - Lua 5.1+ compatible with lsqlite3 + - IEEE 754 single-precision float encoding with round-half-to-even (banker's rounding) + - Proper handling of special values: NaN, Inf, -Inf, -0.0, subnormals + - Example script and runner in `/examples/simple-lua/` + +## [0.2.3-alpha] - 2025-12-29 + +### Added + +- **Android 16KB page support** ([#254](https://github.com/asg017/sqlite-vec/pull/254)) + - Added `LDFLAGS` support to Makefile for passing linker-specific flags + - Enables Android 15+ compatibility via `-Wl,-z,max-page-size=16384` + - Required for Play Store app submissions on devices with 16KB memory pages + +- **Improved shared library build and installation** ([#149](https://github.com/asg017/sqlite-vec/issues/149)) + - Configurable install paths via `INSTALL_PREFIX`, `INSTALL_LIB_DIR`, `INSTALL_INCLUDE_DIR`, `INSTALL_BIN_DIR` + - Hidden internal symbols with `-fvisibility=hidden`, exposing only public API + - `EXT_CFLAGS` captures user-provided `CFLAGS` and `CPPFLAGS` + +- **Optimize/VACUUM integration test and documentation** + - Added test demonstrating optimize command with VACUUM for full space reclamation + +### Fixed + +- **Linux linking error with libm** ([#252](https://github.com/asg017/sqlite-vec/pull/252)) + - Moved `-lm` flag from `CFLAGS` to `LDLIBS` at end of linker command + - Fixes "undefined symbol: sqrtf" errors on some Linux distributions + - Linker now correctly resolves math library symbols + +### Documentation + +- **Fixed incomplete KNN and Matryoshka guides** ([#208](https://github.com/asg017/sqlite-vec/pull/208), [#209](https://github.com/asg017/sqlite-vec/pull/209)) + - Completed unfinished sentence describing manual KNN method trade-offs + - Added paper citation and 
Matryoshka naming explanation + +## [0.2.2-alpha] - 2025-12-02 + +### Added + +- **GLOB operator for text metadata columns** ([#191](https://github.com/asg017/sqlite-vec/issues/191)) + - Standard SQL pattern matching with `*` (any characters) and `?` (single character) wildcards + - Case-sensitive matching (unlike LIKE) + - Fast path optimization for prefix-only patterns (e.g., `'prefix*'`) + - Full pattern matching with `sqlite3_strglob()` for complex patterns + +- **IS/IS NOT/IS NULL/IS NOT NULL operators for metadata columns** ([#190](https://github.com/asg017/sqlite-vec/issues/190)) + - **Note**: sqlite-vec metadata columns do not currently support NULL values. These operators provide syntactic compatibility within this limitation. + - `IS` behaves like `=` (all metadata values are non-NULL) + - `IS NOT` behaves like `!=` (all metadata values are non-NULL) + - `IS NULL` always returns false (no NULL values exist in metadata) + - `IS NOT NULL` always returns true (all metadata values are non-NULL) + - Works on all metadata types: INTEGER, FLOAT, TEXT, and BOOLEAN + +### Fixed + +- **All compilation warnings eliminated** + - Fixed critical logic bug: `metadataInIdx` type corrected from `size_t` to `int` (prevented -1 wrapping to SIZE_MAX) + - Fixed 5 sign comparison warnings with proper type casts + - Fixed 7 uninitialized variable warnings by adding initializers and default cases + - Clean compilation with `-Wall -Wextra` (zero warnings) + +## [0.2.1-alpha] - 2025-12-02 + +### Added + +- **LIKE operator for text metadata columns** ([#197](https://github.com/asg017/sqlite-vec/issues/197)) + - Standard SQL pattern matching with `%` and `_` wildcards + - Case-insensitive matching (SQLite default) + +### Fixed + +- **Locale-dependent JSON parsing** ([#241](https://github.com/asg017/sqlite-vec/issues/241)) + - Custom locale-independent float parser fixes JSON parsing in non-C locales + - No platform dependencies, thread-safe + +- **musl libc compilation** (Alpine 
Linux) + - Removed non-portable preprocessor macros from vendored sqlite3.c + +## [0.2.0-alpha] - 2025-11-28 + +### Added + +- **Distance constraints for KNN queries** ([#166](https://github.com/asg017/sqlite-vec/pull/166)) + - Support GT, GE, LT, LE operators on the `distance` column in KNN queries + - Enables cursor-based pagination: `WHERE embedding MATCH ? AND k = 10 AND distance > 0.5` + - Enables range queries: `WHERE embedding MATCH ? AND k = 100 AND distance BETWEEN 0.5 AND 1.0` + - Works with all vector types (float32, int8, bit) + - Compatible with partition keys, metadata, and auxiliary columns + - Comprehensive test coverage (15 tests) + - Fixed variable shadowing issues from original PR + - Documented precision handling and pagination caveats + +- **Optimize command for space reclamation** ([#210](https://github.com/asg017/sqlite-vec/pull/210)) + - New special command: `INSERT INTO vec_table(vec_table) VALUES('optimize')` + - Reclaims disk space after DELETE operations by compacting shadow tables + - Rebuilds vector chunks with only valid rows + - Updates rowid mappings to maintain data integrity + +- **Cosine distance support for binary vectors** ([#212](https://github.com/asg017/sqlite-vec/pull/212)) + - Added `distance_cosine_bit()` function for binary quantized vectors + - Enables cosine similarity metric on bit-packed vectors + - Useful for memory-efficient semantic search + +- **ALTER TABLE RENAME support** ([#203](https://github.com/asg017/sqlite-vec/pull/203)) + - Implement `vec0Rename()` callback for virtual table module + - Allows renaming vec0 tables with standard SQL: `ALTER TABLE old_name RENAME TO new_name` + - Properly renames all shadow tables and internal metadata + +- **Language bindings and package configurations for GitHub installation** + - Go CGO bindings (`bindings/go/cgo/`) with `Auto()` and serialization helpers + - Python package configuration (`pyproject.toml`, `setup.py`) for `pip install git+...` + - Node.js package 
configuration (`package.json`) for `npm install vlasky/sqlite-vec` + - Ruby gem configuration (`sqlite-vec.gemspec`) for `gem install` from git + - Rust crate configuration (`Cargo.toml`, `src/lib.rs`) for `cargo add --git` + - All packages support installing from main branch or specific version tags + - Documentation in README with installation table for all languages + +- **Python loadable extension support documentation** + - Added note about Python requiring `--enable-loadable-sqlite-extensions` build flag + - Recommended using `uv` for virtual environments (uses system Python with extension support) + - Documented workarounds for pyenv and custom Python builds + +### Fixed + +- **Memory leak on DELETE operations** ([#243](https://github.com/asg017/sqlite-vec/pull/243)) + - Added `vec0Update_Delete_ClearRowid()` to clear deleted rowids + - Added `vec0Update_Delete_ClearVectors()` to clear deleted vector data + - Prevents memory accumulation from deleted rows + - Vectors and rowids now properly zeroed out on deletion + +- **CI/CD build infrastructure** ([#228](https://github.com/asg017/sqlite-vec/pull/228)) + - Upgraded deprecated ubuntu-20.04 runners to ubuntu-latest + - Added native ARM64 builds using ubuntu-24.04-arm + - Removed cross-compilation dependencies (gcc-aarch64-linux-gnu) + - Fixed macOS link flags for undefined symbols + +## Original Version + +This fork is based on [`asg017/sqlite-vec`](https://github.com/asg017/sqlite-vec) v0.1.7-alpha.2. + +All features and functionality from the original repository are preserved. +See the [original documentation](https://alexgarcia.xyz/sqlite-vec/) for complete usage information. + +--- + +## Notes + +This is a community-maintained fork created to merge pending upstream PRs and provide +continued support while the original author is unavailable. Once development resumes +on the original repository, users are encouraged to switch back. 
+ +All original implementation credit goes to [Alex Garcia](https://github.com/asg017). diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..4a2cebf0 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,220 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +> [!NOTE] +> This is a community fork of [`asg017/sqlite-vec`](https://github.com/asg017/sqlite-vec) created to merge pending upstream PRs and provide continued support while the original author is unavailable. + +`sqlite-vec` is a lightweight, fast vector search SQLite extension written in pure C with no dependencies. It's a pre-v1 project (current: v0.2.4-alpha) that provides vector similarity search capabilities for SQLite databases across all platforms where SQLite runs. + +Key features: +- Supports float, int8, and binary vector types via `vec0` virtual tables +- Pure C implementation with optional SIMD optimizations (AVX on x86_64, NEON on ARM) +- Multi-language bindings (Python, Node.js, Ruby, Go, Rust, Lua) +- Runs anywhere: Linux/MacOS/Windows, WASM, embedded devices +- Distance constraints for KNN queries (enables pagination and range queries) +- Optimize command for space reclamation after deletes +- ALTER TABLE RENAME support for vec0 tables + +## Building and Testing + +### Build Commands + +Run `./scripts/vendor.sh` first to download vendored dependencies (sqlite3.c, shell.c). 
+ +**Core builds:** +- `make loadable` - Build `dist/vec0.{so,dylib,dll}` loadable extension +- `make static` - Build `dist/libsqlite_vec0.a` static library and `dist/sqlite-vec.h` header +- `make cli` - Build `dist/sqlite3` CLI with sqlite-vec statically linked +- `make all` - Build all three targets above +- `make wasm` - Build WASM version (requires emcc) + +**Platform-specific compiler:** +- Set `CC=` to use a different compiler (default: gcc) +- Set `AR=` to use a different archiver (default: ar) + +**SIMD control:** +- SIMD is auto-enabled on Darwin x86_64 (AVX) and Darwin arm64 (NEON) +- Set `OMIT_SIMD=1` to disable SIMD optimizations + +### Testing + +**Python tests (primary test suite):** +```bash +# Setup test environment with uv +uv sync --directory tests + +# Run all Python tests +make test-loadable python=./tests/.venv/bin/python + +# Run specific test +./tests/.venv/bin/python -m pytest tests/test-loadable.py::test_name -vv -s -x + +# Update snapshots +make test-loadable-snapshot-update + +# Watch mode +make test-loadable-watch +``` + +**Other tests:** +- `make test` - Run basic SQL tests via `test.sql` +- `make test-unit` - Compile and run C unit tests +- `sqlite3 :memory: '.read test.sql'` - Quick smoke test + +**Test structure:** +- `tests/test-loadable.py` - Main comprehensive test suite +- `tests/test-metadata.py` - Metadata column tests +- `tests/test-auxiliary.py` - Auxiliary column tests +- `tests/test-partition-keys.py` - Partition key tests +- `tests/conftest.py` - pytest fixtures (loads extension from `dist/vec0`) + +### Code Quality + +- `make format` - Format C code with clang-format and Python with black +- `make lint` - Check formatting without modifying files + +## Architecture + +### Core Implementation (sqlite-vec.c) + +The entire extension is in a single `sqlite-vec.c` file (~9000 lines). It implements a `vec0` virtual table module using SQLite's virtual table API. + +**Key concepts:** + +1. 
**vec0 virtual table**: Declared with `CREATE VIRTUAL TABLE x USING vec0(vector_column TYPE[N], ...)` + - Vector column: Must specify type (float, int8, bit) and dimensions + - Metadata columns: Additional indexed columns for filtering + - Auxiliary columns: Non-indexed columns for associated data + - Partition keys: Special columns for pre-filtering via `partition_key=column_name` + - Chunk size: Configurable via `chunk_size=N` (default varies by type) + +2. **Shadow tables**: vec0 creates multiple hidden tables to store data: + - `xyz_chunks` - Chunk metadata (size, validity bitmaps, rowids) + - `xyz_rowids` - Rowid mapping to chunks + - `xyz_vector_chunksNN` - Actual vector data for column NN + - `xyz_auxiliary` - Auxiliary column values + - `xyz_metadatachunksNN` / `xyz_metadatatextNN` - Metadata storage + +3. **Query plans**: Determined in xBestIndex, encoded in idxStr: + - `VEC0_QUERY_PLAN_FULLSCAN` - Full table scan + - `VEC0_QUERY_PLAN_POINT` - Single rowid lookup + - `VEC0_QUERY_PLAN_KNN` - K-nearest neighbors vector search + +See ARCHITECTURE.md for detailed idxStr encoding and shadow table schemas. 
+ +### Language Bindings + +All bindings wrap the core C extension: + +- **Go CGO** (`bindings/go/cgo/`): CGO-based bindings for mattn/go-sqlite3 + - `Auto()` function to register extension via `sqlite3_auto_extension()` + - `SerializeFloat32()` and `SerializeInt8()` helper functions + - Requires CGO enabled and links libm (`-lm`) + +- **Python**: Installable via pip from GitHub + - Package configuration in `pyproject.toml` and `setup.py` + - Helper functions in `bindings/python/extra_init.py` for vector serialization + - Requires Python built with `--enable-loadable-sqlite-extensions` + - Recommend using `uv` for virtual environments (uses system Python with extension support) + +- **Node.js**: Installable via npm from GitHub + - Package configuration in `package.json` + - CJS (`index.cjs`) and ESM (`index.mjs`) entry points + - TypeScript definitions in `index.d.ts` + +- **Ruby**: Installable via gem from GitHub + - Gem specification in `sqlite-vec.gemspec` + - Extension configuration in `extconf.rb` + - Ruby library in `lib/sqlite_vec.rb` + +- **Rust** (`bindings/rust/`): Static linking via build.rs + - Crate configuration in `Cargo.toml` + - Exports `sqlite3_vec_init()` in `src/lib.rs` + +- **Lua** (`bindings/lua/`): Lua 5.1+ compatible binding + - Requires `lsqlite3` module + - `load()` function to load the extension + - `serialize_f32()` for IEEE 754 binary format + - `serialize_json()` for JSON format + - Example in `examples/simple-lua/` + +### Documentation Site + +Built with VitePress (Vue-based static site generator): +- `npm --prefix site run dev` - Development server +- `npm --prefix site run build` - Production build +- Source: `site/` directory +- Deployed via GitHub Actions (`.github/workflows/site.yaml`) + +## Development Workflow + +### Making Changes + +1. Edit `sqlite-vec.c` for core functionality +2. Update `sqlite-vec.h.tmpl` if public API changes (regenerated via `make sqlite-vec.h`) +3. 
Add tests to `tests/test-loadable.py` or other test files +4. Run `make format` before committing +5. Verify with `make test-loadable` + +### Release Process + +**For this fork:** + +1. Update `VERSION` file (format: `X.Y.Z` or `X.Y.Z-alpha.N`) +2. Update `CHANGELOG.md` with changes +3. Commit changes with descriptive message +4. Create and push git tag: + ```bash + git tag v0.X.Y-alpha + git push origin v0.X.Y-alpha + ``` + +**Note:** This fork does not have CI/CD publishing to package registries (PyPI, npm, crates.io, RubyGems). +Users install directly from GitHub using version tags. + +**Original release process (for reference only):** +The original repository uses `./scripts/publish-release.sh` and CI/CD (`.github/workflows/release.yaml`) to build and publish platform-specific extensions and language packages. + +### Working with Tests + +**Python test fixtures:** +- `@pytest.fixture() db()` in conftest.py provides SQLite connection with extension loaded +- Tests use `db.execute()` for queries +- Snapshot testing available for regression tests + +**Common test patterns:** +```python +def test_example(db): + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[3])") + db.execute("INSERT INTO v(rowid, embedding) VALUES (1, '[1,2,3]')") + result = db.execute("SELECT distance FROM v WHERE embedding MATCH '[1,2,3]'").fetchone() +``` + +### SIMD Optimizations + +SIMD is conditionally compiled based on platform: +- `SQLITE_VEC_ENABLE_AVX` - x86_64 AVX instructions +- `SQLITE_VEC_ENABLE_NEON` - ARM NEON instructions + +Code uses preprocessor directives to select implementations. Distance calculations have both scalar and SIMD variants. 
+ +## Important Notes + +- This is pre-v1 software - breaking changes are expected +- The single-file architecture means recompiling for any change +- Tests must run from repository root (assumes `dist/vec0` exists) +- All bindings depend on the core C extension being built first +- Vector format: JSON arrays `'[1,2,3]'` or raw bytes via helper functions + +**Fork-specific notes:** +- Version v0.2.4-alpha includes: Lua binding with IEEE 754 compliant float serialization (#237) +- Version v0.2.3-alpha includes: Android 16KB page support (#254), LDFLAGS support, documentation fixes (#208, #209) +- Version v0.2.2-alpha includes: GLOB operator for text metadata (#191), IS/IS NOT/IS NULL/IS NOT NULL operators (#190), all compilation warnings fixed (including critical logic bug) +- Version v0.2.1-alpha includes: LIKE operator for text metadata (#197), locale-independent JSON parsing (#241), musl libc compilation fix +- Version v0.2.0-alpha merged upstream PRs: #166 (distance constraints), #210 (optimize), #203 (ALTER TABLE RENAME), #212 (cosine distance for binary), #243 (delete memory leak fix), #228 (CI/CD updates) +- See CHANGELOG.md for complete list of changes from original v0.1.7-alpha.2 +- Installation is via GitHub (git tags), not package registries +- Python users should use `uv` for virtual environments to ensure loadable extension support diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..049b177e --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "sqlite-vec" +version = "0.2.4-alpha" +edition = "2021" +authors = ["Alex Garcia "] +description = "FFI bindings to the sqlite-vec SQLite extension" +homepage = "https://github.com/vlasky/sqlite-vec" +repository = "https://github.com/vlasky/sqlite-vec" +keywords = ["sqlite", "sqlite-extension"] +license = "MIT OR Apache-2.0" + +[dependencies] + +[build-dependencies] +cc = "1.0" + +[dev-dependencies] +rusqlite = "0.31.0" diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 
index 00000000..5af4732a --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include vec0.so +include vec0.dylib +include vec0.dll diff --git a/Makefile b/Makefile index 43e1d980..34a8212b 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,10 @@ COMMIT=$(shell git rev-parse HEAD) VERSION=$(shell cat VERSION) DATE=$(shell date +'%FT%TZ%z') -INSTALL_LIB_DIR = /usr/local/lib -INSTALL_INCLUDE_DIR = /usr/local/include -INSTALL_BIN_DIR = /usr/local/bin +INSTALL_PREFIX ?= /usr/local +INSTALL_LIB_DIR ?= $(INSTALL_PREFIX)/lib +INSTALL_INCLUDE_DIR ?= $(INSTALL_PREFIX)/include +INSTALL_BIN_DIR ?= $(INSTALL_PREFIX)/bin ifndef CC CC=gcc @@ -14,6 +15,10 @@ ifndef AR AR=ar endif +# Capture user-provided flags +EXT_CFLAGS := $(CFLAGS) $(CPPFLAGS) +EXT_LDFLAGS := $(LDFLAGS) + ifeq ($(shell uname -s),Darwin) CONFIG_DARWIN=y else ifeq ($(OS),Windows_NT) @@ -24,11 +29,14 @@ endif ifdef CONFIG_DARWIN LOADABLE_EXTENSION=dylib +# Let unresolved SQLite symbols resolve against host at load time +# This is standard for SQLite loadable extensions on macOS. 
+CFLAGS += -undefined dynamic_lookup endif ifdef CONFIG_LINUX LOADABLE_EXTENSION=so -CFLAGS += -lm +LDLIBS += -lm endif ifdef CONFIG_WINDOWS @@ -95,14 +103,16 @@ $(BUILD_DIR): $(prefix) $(TARGET_LOADABLE): sqlite-vec.c sqlite-vec.h $(prefix) $(CC) \ -fPIC -shared \ + -fvisibility=hidden \ -Wall -Wextra \ -Ivendor/ \ -O3 \ - $(CFLAGS) \ - $< -o $@ + $(CFLAGS) $(EXT_CFLAGS) \ + $< -o $@ \ + $(EXT_LDFLAGS) $(LDLIBS) $(TARGET_STATIC): sqlite-vec.c sqlite-vec.h $(prefix) $(OBJS_DIR) - $(CC) -Ivendor/ $(CFLAGS) -DSQLITE_CORE -DSQLITE_VEC_STATIC \ + $(CC) -Ivendor/ -fvisibility=hidden $(CFLAGS) $(EXT_CFLAGS) -DSQLITE_CORE -DSQLITE_VEC_STATIC \ -O3 -c $< -o $(OBJS_DIR)/vec.o $(AR) rcs $@ $(OBJS_DIR)/vec.o @@ -130,7 +140,7 @@ $(LIBS_DIR)/shell.a: $(OBJS_DIR)/shell.o $(LIBS_DIR) $(AR) rcs $@ $< $(OBJS_DIR)/sqlite-vec.o: sqlite-vec.c $(OBJS_DIR) - $(CC) -c -g3 -Ivendor/ -I./ $(CFLAGS) $< -o $@ + $(CC) -c -g3 -fvisibility=hidden -Ivendor/ -I./ $(CFLAGS) $(EXT_CFLAGS) $< -o $@ $(LIBS_DIR)/sqlite-vec.a: $(OBJS_DIR)/sqlite-vec.o $(LIBS_DIR) $(AR) rcs $@ $< @@ -138,14 +148,15 @@ $(LIBS_DIR)/sqlite-vec.a: $(OBJS_DIR)/sqlite-vec.o $(LIBS_DIR) $(TARGET_CLI): sqlite-vec.h $(LIBS_DIR)/sqlite-vec.a $(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a examples/sqlite3-cli/core_init.c $(prefix) $(CC) -g3 \ + -fvisibility=hidden \ -Ivendor/ -I./ \ -DSQLITE_CORE \ -DSQLITE_VEC_STATIC \ -DSQLITE_THREADSAFE=0 -DSQLITE_ENABLE_FTS4 \ -DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS \ -DSQLITE_EXTRA_INIT=core_init \ - $(CFLAGS) \ - -ldl -lm \ + $(CFLAGS) $(EXT_CFLAGS) \ + $(EXT_LDFLAGS) -ldl -lm \ examples/sqlite3-cli/core_init.c $(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a $(LIBS_DIR)/sqlite-vec.a -o $@ @@ -193,6 +204,10 @@ test-loadable: loadable test-loadable-snapshot-update: loadable $(PYTHON) -m pytest -vv tests/test-loadable.py --snapshot-update +# Update snapshots for all loadable tests (use after intentional behavior changes) +test-snapshots-update: 
loadable + $(PYTHON) -m pytest -vv tests/test-*.py --snapshot-update + test-loadable-watch: watchexec --exts c,py,Makefile --clear -- make test-loadable @@ -216,7 +231,7 @@ install: install -m 644 $(TARGET_STATIC) $(INSTALL_LIB_DIR); \ fi @if [ -f $(TARGET_CLI) ]; then \ - sudo install -m 755 $(TARGET_CLI) $(INSTALL_BIN_DIR); \ + install -m 755 $(TARGET_CLI) $(INSTALL_BIN_DIR); \ fi ldconfig diff --git a/README.md b/README.md index f367dcf6..8bd1ae01 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,12 @@ [![](https://dcbadge.vercel.app/api/server/VCtQ8cGhUs)](https://discord.gg/Ve7WeCJFXk) +> [!NOTE] +> **Community Fork Notice:** This is a temporary fork of [`asg017/sqlite-vec`](https://github.com/asg017/sqlite-vec), +> created to merge pending upstream PRs and provide community support while the original author is unavailable. +> Once development resumes on the original repository, users are encouraged to switch back. +> All credit for the original implementation goes to [Alex Garcia](https://github.com/asg017). + An extremely small, "fast enough" vector search SQLite extension that runs anywhere! A successor to [`sqlite-vss`](https://github.com/asg017/sqlite-vss) @@ -42,23 +48,84 @@ See the Sponsors section for more details. ## Installing -See [Installing `sqlite-vec`](https://alexgarcia.xyz/sqlite-vec/installation.html) -for more details. +### From Original Package Registries + +The original packages on PyPI, npm, RubyGems, and crates.io are maintained by the original author. +For the latest features from this fork, see "Installing from This Fork" below. 
| Language | Install | More Info | | | -------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | Python | `pip install sqlite-vec` | [`sqlite-vec` with Python](https://alexgarcia.xyz/sqlite-vec/python.html) | [![PyPI](https://img.shields.io/pypi/v/sqlite-vec.svg?color=blue&logo=python&logoColor=white)](https://pypi.org/project/sqlite-vec/) | | Node.js | `npm install sqlite-vec` | [`sqlite-vec` with Node.js](https://alexgarcia.xyz/sqlite-vec/js.html) | [![npm](https://img.shields.io/npm/v/sqlite-vec.svg?color=green&logo=nodedotjs&logoColor=white)](https://www.npmjs.com/package/sqlite-vec) | | Ruby | `gem install sqlite-vec` | [`sqlite-vec` with Ruby](https://alexgarcia.xyz/sqlite-vec/ruby.html) | ![Gem](https://img.shields.io/gem/v/sqlite-vec?color=red&logo=rubygems&logoColor=white) | -| Go | `go get -u github.com/asg017/sqlite-vec/bindings/go` | [`sqlite-vec` with Go](https://alexgarcia.xyz/sqlite-vec/go.html) | [![Go Reference](https://pkg.go.dev/badge/github.com/asg017/sqlite-vec-go-bindings/cgo.svg)](https://pkg.go.dev/github.com/asg017/asg017/sqlite-vec-go-bindings/cgo) | | Rust | `cargo add sqlite-vec` | [`sqlite-vec` with Rust](https://alexgarcia.xyz/sqlite-vec/rust.html) | [![Crates.io](https://img.shields.io/crates/v/sqlite-vec?logo=rust)](https://crates.io/crates/sqlite-vec) | | Datasette | `datasette install datasette-sqlite-vec` | [`sqlite-vec` with Datasette](https://alexgarcia.xyz/sqlite-vec/datasette.html) | [![Datasette](https://img.shields.io/pypi/v/datasette-sqlite-vec.svg?color=B6B6D9&label=Datasette+plugin&logoColor=white&logo=python)](https://datasette.io/plugins/datasette-sqlite-vec) | | rqlite | `rqlited -extensions-path=sqlite-vec.tar.gz` | 
[`sqlite-vec` with rqlite](https://alexgarcia.xyz/sqlite-vec/rqlite.html) | [![rqlite](https://img.shields.io/badge/rqlite-sqlite_extensions-blue)](https://rqlite.io/docs/guides/extensions/) | | `sqlite-utils` | `sqlite-utils install sqlite-utils-sqlite-vec` | [`sqlite-vec` with sqlite-utils](https://alexgarcia.xyz/sqlite-vec/sqlite-utils.html) | [![sqlite-utils](https://img.shields.io/pypi/v/sqlite-utils-sqlite-vec.svg?color=B6B6D9&label=sqlite-utils+plugin&logoColor=white&logo=python)](https://datasette.io/plugins/datasette-sqlite-vec) | -| Github Release | | | ![GitHub tag (latest SemVer pre-release)](https://img.shields.io/github/v/tag/asg017/sqlite-vec?color=lightgrey&include_prereleases&label=Github+release&logo=github) | +### Installing from This Fork + +Install directly from GitHub to get the latest features from this community fork. + +#### Available Languages + +| Language | Install Latest (main branch) | Install Specific Version | +|----------|------------------------------|--------------------------| +| **Go** | `go get github.com/vlasky/sqlite-vec/bindings/go/cgo@main` | `go get github.com/vlasky/sqlite-vec/bindings/go/cgo@v0.2.4-alpha` | +| **Lua** | `luarocks install lsqlite3` then copy [`sqlite_vec.lua`](bindings/lua/) to your project. 
See [Lua example](/examples/simple-lua/) | Download [`sqlite_vec.lua` at v0.2.4-alpha](https://github.com/vlasky/sqlite-vec/blob/v0.2.4-alpha/bindings/lua/sqlite_vec.lua) | +| **Python** | `pip install git+https://github.com/vlasky/sqlite-vec.git` | `pip install git+https://github.com/vlasky/sqlite-vec.git@v0.2.4-alpha` | +| **Rust** | `cargo add sqlite-vec --git https://github.com/vlasky/sqlite-vec` | `cargo add sqlite-vec --git https://github.com/vlasky/sqlite-vec --tag v0.2.4-alpha` | +| **Node.js** | `npm install vlasky/sqlite-vec` | `npm install vlasky/sqlite-vec#v0.2.4-alpha` | +| **Ruby** | `gem 'sqlite-vec', git: 'https://github.com/vlasky/sqlite-vec'` | `gem 'sqlite-vec', git: 'https://github.com/vlasky/sqlite-vec', tag: 'v0.2.4-alpha'` | + +**Python Note:** Requires Python built with loadable extension support (`--enable-loadable-sqlite-extensions`). If you encounter an error about extension support not being available: +- Use `uv` to create virtual environments (automatically uses system Python which typically has extension support) +- Or use system Python instead of pyenv/custom builds +- Or rebuild your Python with `./configure --enable-loadable-sqlite-extensions` + +**Available version tags:** See [Releases](https://github.com/vlasky/sqlite-vec/releases) + +#### Build from Source + +For direct C usage or other languages: + +```bash +git clone https://github.com/vlasky/sqlite-vec.git +cd sqlite-vec +./scripts/vendor.sh # Download vendored dependencies +make loadable # Builds dist/vec0.so (or .dylib/.dll) +``` + +#### Not Yet Available + +- Pre-built binaries via GitHub Releases +- Package registry publications (PyPI, npm, RubyGems, crates.io) +- Datasette/sqlite-utils plugins + +For these, use the original packages until this fork's CI/CD is configured. + +See the [original documentation](https://alexgarcia.xyz/sqlite-vec/installation.html) for detailed usage information. 
+ +## What's New + +See [CHANGELOG.md](CHANGELOG.md) for a complete list of improvements, bug fixes, and merged upstream PRs. + +## Basic Usage + +**Vector types:** `sqlite-vec` supports three vector types with different trade-offs: + +```sql +-- Float vectors (32-bit floating point, most common) +CREATE VIRTUAL TABLE vec_floats USING vec0(embedding float[384]); + +-- Int8 vectors (8-bit integers, smaller memory footprint) +CREATE VIRTUAL TABLE vec_int8 USING vec0(embedding int8[384]); + +-- Binary vectors (1 bit per dimension, maximum compression) +CREATE VIRTUAL TABLE vec_binary USING vec0(embedding bit[384]); +``` -## Sample usage +**Usage example:** ```sql .load ./vec0 @@ -70,33 +137,178 @@ create virtual table vec_examples using vec0( -- vectors can be provided as JSON or in a compact binary format insert into vec_examples(rowid, sample_embedding) values - (1, '[-0.200, 0.250, 0.341, -0.211, 0.645, 0.935, -0.316, -0.924]'), - (2, '[0.443, -0.501, 0.355, -0.771, 0.707, -0.708, -0.185, 0.362]'), - (3, '[0.716, -0.927, 0.134, 0.052, -0.669, 0.793, -0.634, -0.162]'), - (4, '[-0.710, 0.330, 0.656, 0.041, -0.990, 0.726, 0.385, -0.958]'); + (1, '[0.279, -0.95, -0.45, -0.554, 0.473, 0.353, 0.784, -0.826]'), + (2, '[-0.156, -0.94, -0.563, 0.011, -0.947, -0.602, 0.3, 0.09]'), + (3, '[-0.559, 0.179, 0.619, -0.987, 0.612, 0.396, -0.319, -0.689]'), + (4, '[0.914, -0.327, -0.815, -0.807, 0.695, 0.207, 0.614, 0.459]'), + (5, '[0.072, 0.946, -0.243, 0.104, 0.659, 0.237, 0.723, 0.155]'), + (6, '[0.409, -0.908, -0.544, -0.421, -0.84, -0.534, -0.798, -0.444]'), + (7, '[0.271, -0.27, -0.26, -0.581, -0.466, 0.873, 0.296, 0.218]'), + (8, '[-0.658, 0.458, -0.673, -0.241, 0.979, 0.28, 0.114, 0.369]'), + (9, '[0.686, 0.552, -0.542, -0.936, -0.369, -0.465, -0.578, 0.886]'), + (10, '[0.753, -0.371, 0.311, -0.209, 0.829, -0.082, -0.47, -0.507]'), + (11, '[0.123, -0.475, 0.169, 0.796, -0.201, -0.561, 0.995, 0.019]'), + (12, '[-0.818, -0.906, -0.781, 0.255, 0.584, -0.156, -0.873, 
-0.237]'), + (13, '[0.992, 0.058, 0.942, 0.722, -0.977, 0.441, 0.363, 0.074]'), + (14, '[-0.466, 0.282, -0.777, -0.13, -0.093, 0.908, 0.752, -0.473]'), + (15, '[0.001, -0.643, 0.825, 0.741, -0.403, 0.278, 0.218, -0.694]'), + (16, '[0.525, 0.079, 0.557, 0.061, -0.999, -0.352, -0.961, 0.858]'), + (17, '[0.757, 0.663, -0.385, -0.884, 0.756, 0.894, -0.829, -0.028]'), + (18, '[-0.862, 0.521, 0.532, -0.743, -0.049, 0.1, -0.47, 0.745]'), + (19, '[-0.154, -0.576, 0.079, 0.46, -0.598, -0.377, 0.99, 0.3]'), + (20, '[-0.124, 0.035, -0.758, -0.551, -0.324, 0.177, -0.54, -0.56]'); --- KNN style query +-- Find 3 nearest neighbors using LIMIT select rowid, distance from vec_examples -where sample_embedding match '[0.890, 0.544, 0.825, 0.961, 0.358, 0.0196, 0.521, 0.175]' +where sample_embedding match '[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]' order by distance -limit 2; +limit 3; +/* +┌───────┬──────────────────┐ +│ rowid │ distance │ +├───────┼──────────────────┤ +│ 5 │ 1.16368770599365 │ +│ 13 │ 1.75137972831726 │ +│ 11 │ 1.83941268920898 │ +└───────┴──────────────────┘ +*/ +``` + +**How vector search works:** The `MATCH` operator finds vectors similar to your query vector. In the example above, `sample_embedding MATCH '[0.5, ...]'` searches for vectors closest to `[0.5, ...]` and returns them ordered by distance (smallest = most similar). + +**Note:** All vector similarity queries require `LIMIT` or `k = ?` (where k is the number of nearest neighbors to return). This prevents accidentally returning too many results on large datasets, since finding all vectors within a distance threshold requires calculating distance to every vector in the table. 
+ +## Advanced Usage + +This fork adds several powerful features for production use: + +### Distance Constraints for KNN Queries + +Filter results by distance thresholds using `>`, `>=`, `<`, `<=` operators on the `distance` column: + +```sql +-- KNN query with distance constraint +-- Requests k=10 neighbors, but only returns those with distance < 1.5 +select rowid, distance +from vec_examples +where sample_embedding match '[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]' + and k = 10 + and distance < 1.5 +order by distance; +/* +┌───────┬──────────────────┐ +│ rowid │ distance │ +├───────┼──────────────────┤ +│ 5 │ 1.16368770599365 │ +└───────┴──────────────────┘ +*/ + +-- KNN query with range constraint: find vectors in a specific distance range +select rowid, distance +from vec_examples +where sample_embedding match '[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]' + and k = 20 + and distance between 1.5 and 2.0 +order by distance; +/* +┌───────┬──────────────────┐ +│ rowid │ distance │ +├───────┼──────────────────┤ +│ 13 │ 1.75137972831726 │ +│ 11 │ 1.83941268920898 │ +│ 7 │ 1.89339029788971 │ +│ 8 │ 1.92658650875092 │ +│ 10 │ 1.93983662128448 │ +└───────┴──────────────────┘ +*/ +``` + +### Cursor-based Pagination + +Instead of using `OFFSET` (which is slow for large datasets), you can use the last result's distance value as a 'cursor' to fetch the next page. This is more efficient because you're filtering directly rather than skipping rows. 
+
+```sql
+-- First page: get initial results
+select rowid, distance
+from vec_examples
+where sample_embedding match '[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]'
+  and k = 3
+order by distance;
+/*
+┌───────┬──────────────────┐
+│ rowid │     distance     │
+├───────┼──────────────────┤
+│ 5     │ 1.16368770599365 │
+│ 13    │ 1.75137972831726 │
+│ 11    │ 1.83941268920898 │
+└───────┴──────────────────┘
+*/
+
+-- Next page: use last distance as cursor (distance > 1.83941268920898)
+select rowid, distance
+from vec_examples
+where sample_embedding match '[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]'
+  and k = 3
+  and distance > 1.83941268920898
+order by distance;
 /*
 ┌───────┬──────────────────┐
 │ rowid │     distance     │
 ├───────┼──────────────────┤
-│ 2     │ 2.38687372207642 │
-│ 1     │ 2.38978505134583 │
+│ 7     │ 1.89339029788971 │
+│ 8     │ 1.92658650875092 │
+│ 10    │ 1.93983662128448 │
 └───────┴──────────────────┘
 */
 ```
+### Space Reclamation with Optimize
+
+`optimize` compacts vec shadow tables. To shrink the database file:
+
+```sql
+-- Before creating vec tables: enable autovacuum and apply it (recommended)
+PRAGMA auto_vacuum = FULL;  -- or INCREMENTAL
+VACUUM;  -- activates the setting
+
+-- Use WAL for better concurrency
+PRAGMA journal_mode = WAL;
+```
+
+After deletes, reclaim space:
+
+**Important**: Both `optimize` and `VACUUM` require exclusive database access.
+Close all other connections before running these commands, or run them on
+application startup before spawning additional connections.
+
+```sql
+-- Compact shadow tables
+INSERT INTO vec_examples(vec_examples) VALUES('optimize');
+
+-- Flush WAL
+PRAGMA wal_checkpoint(TRUNCATE);
+
+-- Reclaim freed pages (if using auto_vacuum=INCREMENTAL)
+PRAGMA incremental_vacuum;
+
+-- If you did NOT enable autovacuum, run VACUUM (after checkpoint) to shrink the file.
+-- With autovacuum on, VACUUM is optional.
+VACUUM; +``` + +`VACUUM` should not corrupt vec tables; a checkpoint first is recommended when +using WAL so the rewrite starts from a clean state. + ## Sponsors -Development of `sqlite-vec` is supported by multiple generous sponsors! Mozilla +> [!NOTE] +> The sponsors listed below support the original [`asg017/sqlite-vec`](https://github.com/asg017/sqlite-vec) project by Alex Garcia, not this community fork. + +Development of the original `sqlite-vec` is supported by multiple generous sponsors! Mozilla is the main sponsor through the new Builders project.

diff --git a/VERSION b/VERSION index d9d7c74b..8282b6d5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.7-alpha.2 \ No newline at end of file +0.2.4-alpha diff --git a/bindings/go/cgo/go.mod b/bindings/go/cgo/go.mod new file mode 100644 index 00000000..f1be5ca8 --- /dev/null +++ b/bindings/go/cgo/go.mod @@ -0,0 +1,5 @@ +module github.com/vlasky/sqlite-vec/bindings/go/cgo + +go 1.21 + +require github.com/mattn/go-sqlite3 v1.14.24 diff --git a/bindings/go/cgo/sqlite_vec.go b/bindings/go/cgo/sqlite_vec.go new file mode 100644 index 00000000..932f27be --- /dev/null +++ b/bindings/go/cgo/sqlite_vec.go @@ -0,0 +1,37 @@ +package sqlite_vec + +// #cgo CFLAGS: -DSQLITE_CORE +// #cgo LDFLAGS: -lm +// #include "../../../sqlite-vec.c" +// #include +import "C" +import ( + "encoding/binary" + "math" +) + +// Auto registers sqlite-vec to be automatically loaded on all new SQLite connections. +// Call this function before opening any database connections. +func Auto() { + C.sqlite3_auto_extension((*[0]byte)(C.sqlite3_vec_init)) +} + +// SerializeFloat32 converts a float32 slice into the compact binary format +// that sqlite-vec expects for vector data. +func SerializeFloat32(vector []float32) ([]byte, error) { + buf := make([]byte, len(vector)*4) + for i, v := range vector { + binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v)) + } + return buf, nil +} + +// SerializeInt8 converts an int8 slice into the compact binary format +// that sqlite-vec expects for int8 vector data. 
+func SerializeInt8(vector []int8) ([]byte, error) { + buf := make([]byte, len(vector)) + for i, v := range vector { + buf[i] = byte(v) + } + return buf, nil +} diff --git a/bindings/lua/sqlite_vec.lua b/bindings/lua/sqlite_vec.lua new file mode 100644 index 00000000..99d82e70 --- /dev/null +++ b/bindings/lua/sqlite_vec.lua @@ -0,0 +1,201 @@ +-- sqlite_vec.lua Lua 5.1 compatible version with JSON fallback +local sqlite3 = require("lsqlite3") + +local M = {} + +-- Function to load extension +function M.load(db) + local possible_paths = { + -- vec0 naming (this fork) + "../../dist/vec0.so", -- Linux + "../../dist/vec0.dll", -- Windows + "../../dist/vec0.dylib", -- macOS + "./dist/vec0.so", + "./dist/vec0.dll", + "./dist/vec0.dylib", + "../dist/vec0.so", + "../dist/vec0.dll", + "../dist/vec0.dylib", + "vec0", + -- sqlite-vec naming (upstream) + "../../sqlite-vec.so", + "../../sqlite-vec.dll", + "../../sqlite-vec.dylib", + "./sqlite-vec.so", + "./sqlite-vec.dll", + "./sqlite-vec.dylib", + "../sqlite-vec.so", + "../sqlite-vec.dll", + "../sqlite-vec.dylib", + "sqlite-vec", + } + + local entry_point = "sqlite3_vec_init" + + if db.enable_load_extension then + db:enable_load_extension(true) + for _, path in ipairs(possible_paths) do + local ok, result = pcall(function() + return db:load_extension(path, entry_point) + end) + -- lsqlite3 load_extension returns true on success + if ok and result then + db:enable_load_extension(false) + return true + end + end + db:enable_load_extension(false) + error("Failed to load extension from all paths") + else + for _, path in ipairs(possible_paths) do + local ok, result = pcall(function() + return db:load_extension(path, entry_point) + end) + -- lsqlite3 load_extension returns true on success + if ok and result then + return true + end + end + error("Failed to load extension from all paths") + end +end + +-- Lua 5.1 compatible float to binary conversion function (IEEE 754 single precision, little-endian) +local function 
float_to_bytes(f) + -- Handle special cases: NaN, Inf, -Inf, -0.0 + if f ~= f then + -- NaN: exponent=255, mantissa!=0, sign=0 (quiet NaN) + return string.char(0, 0, 192, 127) + elseif f == math.huge then + -- +Inf: exponent=255, mantissa=0, sign=0 + return string.char(0, 0, 128, 127) + elseif f == -math.huge then + -- -Inf: exponent=255, mantissa=0, sign=1 + return string.char(0, 0, 128, 255) + elseif f == 0 then + -- Check for -0.0 vs +0.0 + if 1/f == -math.huge then + -- -0.0: sign=1, exponent=0, mantissa=0 + return string.char(0, 0, 0, 128) + else + -- +0.0 + return string.char(0, 0, 0, 0) + end + end + + local sign = 0 + if f < 0 then + sign = 1 + f = -f + end + + local mantissa, exponent = math.frexp(f) + -- math.frexp returns mantissa in [0.5, 1), we need [1, 2) for IEEE 754 + exponent = exponent - 1 + + local is_subnormal = exponent < -126 + if is_subnormal then + -- Subnormal number: exponent field is 0, mantissa is denormalized + -- Formula: mantissa_stored = value * 2^149 = m * 2^(e + 149) + -- Since exponent = e - 1, we need: m * 2^(exponent + 1 + 149) = m * 2^(exponent + 150) + -- After multiplying by 2^23 later: m * 2^(exponent + 150) becomes the stored mantissa + -- Simplified: mantissa = m * 2^(exponent + 127) before the 2^23 scaling + mantissa = mantissa * 2^(exponent + 127) + exponent = 0 + else + -- Normal number: remove implicit leading 1 + -- frexp returns mantissa in [0.5, 1), convert to [0, 1) for IEEE 754 + mantissa = (mantissa - 0.5) * 2 + exponent = exponent + 127 + end + + -- Round half to even (banker's rounding) for IEEE 754 compliance + local scaled = mantissa * 2^23 + local floor_val = math.floor(scaled) + local frac = scaled - floor_val + -- Use epsilon comparison for 0.5 to handle floating-point precision issues + local is_half = math.abs(frac - 0.5) < 1e-9 + if frac > 0.5 + 1e-9 or (is_half and floor_val % 2 == 1) then + mantissa = floor_val + 1 + else + mantissa = floor_val + end + + -- Handle mantissa overflow from rounding 
(mantissa >= 2^23) + if mantissa >= 2^23 then + if is_subnormal then + -- Subnormal rounded up to smallest normal + mantissa = 0 + exponent = 1 + else + -- Normal number: carry into exponent + mantissa = 0 + exponent = exponent + 1 + end + end + + -- Handle exponent overflow -> Infinity + if exponent >= 255 then + -- Return ±Infinity + if sign == 1 then + return string.char(0, 0, 128, 255) -- -Inf + else + return string.char(0, 0, 128, 127) -- +Inf + end + end + + -- Encode as little-endian IEEE 754 single precision + local bytes = {} + bytes[1] = mantissa % 256 + mantissa = math.floor(mantissa / 256) + bytes[2] = mantissa % 256 + mantissa = math.floor(mantissa / 256) + bytes[3] = (mantissa % 128) + (exponent % 2) * 128 + exponent = math.floor(exponent / 2) + bytes[4] = exponent + sign * 128 + + return string.char(bytes[1], bytes[2], bytes[3], bytes[4]) +end + +-- Helper function: serialize float vector to binary format (little-endian IEEE 754) +function M.serialize_f32(vector) + local buffer = {} + + if string.pack then + -- Use ""] +description = "FFI bindings to the sqlite-vec SQLite extension" +homepage = "https://github.com/vlasky/sqlite-vec" +repository = "https://github.com/vlasky/sqlite-vec" +keywords = ["sqlite", "sqlite-extension"] +license = "MIT/Apache-2.0" + + +[dependencies] + +[build-dependencies] +cc = "1.0" + +[dev-dependencies] +rusqlite = "0.31.0" diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..79bfb4d8 --- /dev/null +++ b/build.rs @@ -0,0 +1,6 @@ +fn main() { + cc::Build::new() + .file("sqlite-vec.c") + .define("SQLITE_CORE", None) + .compile("sqlite_vec0"); +} diff --git a/examples/simple-lua/.gitignore b/examples/simple-lua/.gitignore new file mode 100644 index 00000000..0ea42ee0 --- /dev/null +++ b/examples/simple-lua/.gitignore @@ -0,0 +1,7 @@ +# Lua bytecode +*.luac + +# SQLite databases +*.db +*.sqlite +*.sqlite3 diff --git a/examples/simple-lua/README.md b/examples/simple-lua/README.md new file mode 100644 index 
00000000..50358a10 --- /dev/null +++ b/examples/simple-lua/README.md @@ -0,0 +1,83 @@ +# SQLite-Vec Simple Lua Example + +This example demonstrates how to use sqlite-vec with Lua and the lsqlite3 binding. + +## Prerequisites + +1. **Lua 5.1+** - The example is compatible with Lua 5.1 and later +2. **lsqlite3** - Lua SQLite3 binding +3. **sqlite-vec extension** - Built for your platform + +## Installation + +### Install lsqlite3 + +Using LuaRocks: +```bash +luarocks install lsqlite3 +``` + +Or on Ubuntu/Debian: +```bash +apt install lua-sql-sqlite3 +``` + +### Build sqlite-vec + +From the repository root: +```bash +make loadable +``` + +This creates `dist/vec0.so` (Linux), `dist/vec0.dylib` (macOS), or `dist/vec0.dll` (Windows). + +## Running the Example + +From this directory: +```bash +lua demo.lua +``` + +Or using the run script: +```bash +./run.sh +``` + +## Expected Output + +``` +=== SQLite-Vec Simple Lua Example === +sqlite_version=3.x.x, vec_version=v0.x.x +sqlite-vec extension loaded successfully +Inserting vector data... +Inserted 5 vectors +Executing KNN query... +Results (closest to [0.3, 0.3, 0.3, 0.3]): + rowid=3 distance=0.000000 + rowid=2 distance=0.200000 + rowid=4 distance=0.200000 + +Testing binary serialization... 
+ Binary round-trip: rowid=1 distance=0.000000 + +Demo completed successfully +``` + +## Using the Binding in Your Project + +```lua +local sqlite3 = require("lsqlite3") +local sqlite_vec = require("sqlite_vec") + +local db = sqlite3.open_memory() + +-- Option 1: Auto-detect extension path +sqlite_vec.load(db) + +-- Option 2: Explicit path +sqlite_vec.load(db, "/path/to/vec0.so") + +-- Serialize vectors +local json_vec = sqlite_vec.serialize_json({1.0, 2.0, 3.0}) -- "[1.0,2.0,3.0]" +local binary_vec = sqlite_vec.serialize_f32({1.0, 2.0, 3.0}) -- 12 bytes +``` diff --git a/examples/simple-lua/demo.lua b/examples/simple-lua/demo.lua new file mode 100644 index 00000000..2d47c2f1 --- /dev/null +++ b/examples/simple-lua/demo.lua @@ -0,0 +1,106 @@ +#!/usr/bin/env lua + +-- Simple Lua example demonstrating sqlite-vec usage +-- This example shows how to create vector tables, insert data, and perform KNN queries + +local sqlite3 = require("lsqlite3") + +-- Add bindings directory to package path +package.path = package.path .. 
";../../bindings/lua/?.lua" +local sqlite_vec = require("sqlite_vec") + +local function main() + print("=== SQLite-Vec Simple Lua Example ===") + + -- Create in-memory database + local db = sqlite3.open_memory() + if not db then + error("Failed to create database") + end + + -- Load sqlite-vec extension + sqlite_vec.load(db) + + -- Check versions (also verifies extension loaded) + for row in db:nrows("SELECT sqlite_version() as sv, vec_version() as vv") do + print(string.format("sqlite_version=%s, vec_version=%s", row.sv, row.vv)) + print("sqlite-vec extension loaded successfully") + end + + -- Test data + local items = { + {1, {0.1, 0.1, 0.1, 0.1}}, + {2, {0.2, 0.2, 0.2, 0.2}}, + {3, {0.3, 0.3, 0.3, 0.3}}, + {4, {0.4, 0.4, 0.4, 0.4}}, + {5, {0.5, 0.5, 0.5, 0.5}}, + } + local query = {0.3, 0.3, 0.3, 0.3} + + -- Create virtual table + local result = db:exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])") + if result ~= sqlite3.OK then + error("Failed to create virtual table: " .. db:errmsg()) + end + + -- Insert data using JSON format + print("Inserting vector data...") + db:exec("BEGIN") + + for _, item in ipairs(items) do + local rowid = item[1] + local vector_json = sqlite_vec.serialize_json(item[2]) + local sql = string.format("INSERT INTO vec_items(rowid, embedding) VALUES (%d, '%s')", + rowid, vector_json) + result = db:exec(sql) + if result ~= sqlite3.OK then + error("Failed to insert item: " .. 
db:errmsg()) + end + end + + db:exec("COMMIT") + print(string.format("Inserted %d vectors", #items)) + + -- Perform KNN query + print("Executing KNN query...") + local query_json = sqlite_vec.serialize_json(query) + + local sql = string.format([[ + SELECT rowid, distance + FROM vec_items + WHERE embedding MATCH '%s' + ORDER BY distance + LIMIT 3 + ]], query_json) + + print("Results (closest to [0.3, 0.3, 0.3, 0.3]):") + for row in db:nrows(sql) do + print(string.format(" rowid=%d distance=%.6f", row.rowid, row.distance)) + end + + -- Demonstrate binary serialization + print("\nTesting binary serialization...") + db:exec("CREATE VIRTUAL TABLE vec_binary USING vec0(embedding float[4])") + + local stmt = db:prepare("INSERT INTO vec_binary(rowid, embedding) VALUES (1, ?)") + stmt:bind_blob(1, sqlite_vec.serialize_f32({1.0, 2.0, 3.0, 4.0})) + stmt:step() + stmt:finalize() + + stmt = db:prepare("SELECT rowid, distance FROM vec_binary WHERE embedding MATCH ? LIMIT 1") + stmt:bind_blob(1, sqlite_vec.serialize_f32({1.0, 2.0, 3.0, 4.0})) + for row in stmt:nrows() do + print(string.format(" Binary round-trip: rowid=%d distance=%.6f", row.rowid, row.distance)) + end + stmt:finalize() + + db:close() + print("\nDemo completed successfully") +end + +-- Run with error handling +local success, err = pcall(main) +if not success then + print("Error: " .. tostring(err)) + os.exit(1) +end diff --git a/examples/simple-lua/run.sh b/examples/simple-lua/run.sh new file mode 100755 index 00000000..aecf7850 --- /dev/null +++ b/examples/simple-lua/run.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Run script for the Lua example + +set -e + +echo "=== SQLite-Vec Lua Example Runner ===" + +# Check if Lua is available +if ! command -v lua &> /dev/null; then + echo "Error: Lua is not installed or not in PATH" + exit 1 +fi + +# Check if lsqlite3 is available +if ! 
lua -e "require('lsqlite3')" 2>/dev/null; then + echo "Error: lsqlite3 module is not installed" + echo "Install with: luarocks install lsqlite3" + exit 1 +fi + +# Check if sqlite-vec extension exists +if [ ! -f "../../dist/vec0.so" ] && [ ! -f "../../dist/vec0.dylib" ] && [ ! -f "../../dist/vec0.dll" ]; then + echo "Error: sqlite-vec extension not found in ../../dist/" + echo "Build with: cd ../.. && make loadable" + exit 1 +fi + +# Run the demo +lua demo.lua diff --git a/extconf.rb b/extconf.rb new file mode 100644 index 00000000..302b01cd --- /dev/null +++ b/extconf.rb @@ -0,0 +1,16 @@ +require 'mkmf' + +# Run vendor script to get dependencies +system('./scripts/vendor.sh') or abort "Failed to run vendor.sh" + +# Build the loadable extension +system('make loadable') or abort "Failed to build extension" + +# Create a dummy Makefile since we already built with make loadable +File.open("Makefile", "w") do |f| + f.puts "install:" + f.puts "\tmkdir -p $(DESTDIR)$(sitearchdir)" + f.puts "\tcp dist/vec0.so $(DESTDIR)$(sitearchdir)/vec0.so 2>/dev/null || cp dist/vec0.dylib $(DESTDIR)$(sitearchdir)/vec0.dylib 2>/dev/null || cp dist/vec0.dll $(DESTDIR)$(sitearchdir)/vec0.dll 2>/dev/null || true" + f.puts "clean:" + f.puts "\t@true" +end diff --git a/index.cjs b/index.cjs new file mode 100644 index 00000000..f5f96d4d --- /dev/null +++ b/index.cjs @@ -0,0 +1,31 @@ +const { join } = require("node:path"); +const { arch, platform } = require("node:process"); +const { statSync } = require("node:fs"); + +const ENTRYPOINT_BASE_NAME = "vec0"; + +function extensionSuffix(platform) { + if (platform === "win32") return "dll"; + if (platform === "darwin") return "dylib"; + return "so"; +} + +function getLoadablePath() { + const loadablePath = join( + __dirname, + "dist", + `${ENTRYPOINT_BASE_NAME}.${extensionSuffix(platform)}` + ); + + if (!statSync(loadablePath, { throwIfNoEntry: false })) { + throw new Error(`Loadable extension for sqlite-vec not found at ${loadablePath}. 
Was the extension built? Run: make loadable`); + } + + return loadablePath; +} + +function load(db) { + db.loadExtension(getLoadablePath()); +} + +module.exports = { getLoadablePath, load }; diff --git a/index.d.ts b/index.d.ts new file mode 100644 index 00000000..1867dc8a --- /dev/null +++ b/index.d.ts @@ -0,0 +1,13 @@ +/** + * Returns the full path to the sqlite-vec loadable extension bundled with this package + */ +export declare function getLoadablePath(): string; + +interface Db { + loadExtension(file: string, entrypoint?: string | undefined): void; +} + +/** + * Load the sqlite-vec extension into a SQLite database connection + */ +export declare function load(db: Db): void; diff --git a/index.mjs b/index.mjs new file mode 100644 index 00000000..141435d7 --- /dev/null +++ b/index.mjs @@ -0,0 +1,32 @@ +import { join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { arch, platform } from "node:process"; +import { statSync } from "node:fs"; + +const ENTRYPOINT_BASE_NAME = "vec0"; + +function extensionSuffix(platform) { + if (platform === "win32") return "dll"; + if (platform === "darwin") return "dylib"; + return "so"; +} + +function getLoadablePath() { + const loadablePath = join( + fileURLToPath(new URL(".", import.meta.url)), + "dist", + `${ENTRYPOINT_BASE_NAME}.${extensionSuffix(platform)}` + ); + + if (!statSync(loadablePath, { throwIfNoEntry: false })) { + throw new Error(`Loadable extension for sqlite-vec not found at ${loadablePath}. Was the extension built? 
Run: make loadable`); + } + + return loadablePath; +} + +function load(db) { + db.loadExtension(getLoadablePath()); +} + +export { getLoadablePath, load }; diff --git a/lib/sqlite_vec.rb b/lib/sqlite_vec.rb new file mode 100644 index 00000000..7bdf281a --- /dev/null +++ b/lib/sqlite_vec.rb @@ -0,0 +1,15 @@ + +module SqliteVec + class Error < StandardError; end + + # Read version from VERSION file + VERSION = File.read(File.expand_path('../VERSION', __dir__)).strip + + def self.loadable_path + File.expand_path('../dist/vec0', __dir__) + end + + def self.load(db) + db.load_extension(self.loadable_path) + end +end diff --git a/package.json b/package.json new file mode 100644 index 00000000..be22e736 --- /dev/null +++ b/package.json @@ -0,0 +1,62 @@ +{ + "name": "sqlite-vec", + "version": "0.2.4-alpha", + "description": "A vector search SQLite extension that runs anywhere", + "main": "./index.cjs", + "module": "./index.mjs", + "types": "./index.d.ts", + "exports": { + ".": { + "require": "./index.cjs", + "import": "./index.mjs", + "types": "./index.d.ts" + } + }, + "scripts": { + "install": "bash -c './scripts/vendor.sh && make loadable'", + "test": "echo \"Error: no test specified\" && exit 1" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/vlasky/sqlite-vec.git" + }, + "keywords": [ + "sqlite", + "vector", + "search", + "embedding", + "similarity" + ], + "author": "Alex Garcia ", + "contributors": [ + "Vlad Lasky" + ], + "license": "(MIT OR Apache-2.0)", + "bugs": { + "url": "https://github.com/vlasky/sqlite-vec/issues" + }, + "homepage": "https://github.com/vlasky/sqlite-vec#readme", + "engines": { + "node": ">=14.0.0" + }, + "os": [ + "darwin", + "linux", + "win32" + ], + "files": [ + "index.cjs", + "index.mjs", + "index.d.ts", + "dist/vec0.so", + "dist/vec0.dylib", + "dist/vec0.dll", + "sqlite-vec.c", + "sqlite-vec.h", + "sqlite-vec.h.tmpl", + "VERSION", + "vendor/", + "scripts/vendor.sh", + "Makefile" + ] +} diff --git a/pyproject.toml 
b/pyproject.toml new file mode 100644 index 00000000..e236f600 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,42 @@ +[build-system] +requires = ["setuptools>=45", "wheel", "setuptools-scm"] +build-backend = "setuptools.build_meta" + +[project] +name = "sqlite-vec" +version = "0.2.4a0" +description = "A vector search SQLite extension that runs anywhere" +readme = "README.md" +authors = [ + {name = "Alex Garcia", email = "alex@alex.garcia"}, +] +maintainers = [ + {name = "Vlad Lasky"}, +] +license = {text = "MIT OR Apache-2.0"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +requires-python = ">=3.8" + +[project.urls] +Homepage = "https://github.com/vlasky/sqlite-vec" +Documentation = "https://alexgarcia.xyz/sqlite-vec" +Repository = "https://github.com/vlasky/sqlite-vec" +"Original Repository" = "https://github.com/asg017/sqlite-vec" +Changelog = "https://github.com/vlasky/sqlite-vec/blob/main/CHANGELOG.md" + +[tool.setuptools] +packages = ["sqlite_vec"] + +[tool.setuptools.package-data] +sqlite_vec = ["*.so", "*.dylib", "*.dll"] diff --git a/scripts/vendor.sh b/scripts/vendor.sh index 0706aa5e..1b434317 100755 --- a/scripts/vendor.sh +++ b/scripts/vendor.sh @@ -1,7 +1,6 @@ -#!/bin/bash +#!/usr/bin/env bash mkdir -p vendor curl -o sqlite-amalgamation.zip https://www.sqlite.org/2024/sqlite-amalgamation-3450300.zip -unzip -d unzip sqlite-amalgamation.zip mv sqlite-amalgamation-3450300/* vendor/ rmdir sqlite-amalgamation-3450300 diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..470ab7b8 --- /dev/null +++ b/setup.py @@ -0,0 +1,90 @@ +""" +Setup script 
for sqlite-vec +Builds the C extension and bundles Python helper functions +""" +import os +import subprocess +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext +from setuptools.command.build_py import build_py + +# Create sqlite_vec package directory with placeholder __init__.py +# This will be overwritten during build with the real content +os.makedirs('sqlite_vec', exist_ok=True) +if not os.path.exists('sqlite_vec/__init__.py'): + with open('sqlite_vec/__init__.py', 'w') as f: + f.write('# Placeholder - will be generated during build\n') + +class MakeBuild(build_ext): + """Custom build that uses Make to compile the extension""" + + def run(self): + import shutil + + # Run vendor script to get dependencies + subprocess.check_call(['./scripts/vendor.sh'], shell=True) + + # Build loadable extension + subprocess.check_call(['make', 'loadable']) + + # Determine extension suffix + ext_suffix = '.so' + if os.name == 'nt': + ext_suffix = '.dll' + elif os.uname().sysname == 'Darwin': + ext_suffix = '.dylib' + + # Read VERSION file + version = open('VERSION').read().strip() + + # Read helper functions + helper_code = open('bindings/python/extra_init.py').read() + + # Create a Python package that matches upstream API + with open('sqlite_vec/__init__.py', 'w') as f: + f.write(f''' +from os import path +import sqlite3 + +__version__ = "{version}" +__version_info__ = tuple(__version__.split(".")) + +def loadable_path(): + """ Returns the full path to the sqlite-vec loadable SQLite extension bundled with this package """ + + loadable_path = path.join(path.dirname(__file__), "vec0") + return path.normpath(loadable_path) + +def load(conn: sqlite3.Connection) -> None: + """ Load the sqlite-vec SQLite extension into the given database connection. 
""" + + conn.load_extension(loadable_path()) + +''') + f.write(helper_code) + + # Copy built extension to package directory + src = os.path.join('dist', f'vec0{ext_suffix}') + dst = os.path.join('sqlite_vec', f'vec0{ext_suffix}') + shutil.copy2(src, dst) + + def get_outputs(self): + # Return the path to the built extension + ext_suffix = '.so' + if os.name == 'nt': + ext_suffix = '.dll' + elif os.uname().sysname == 'Darwin': + ext_suffix = '.dylib' + return [os.path.join('sqlite_vec', f'vec0{ext_suffix}')] + +class CustomBuildPy(build_py): + """Ensure build_ext runs before build_py to generate files""" + + def run(self): + self.run_command('build_ext') + super().run() + +setup( + cmdclass={'build_py': CustomBuildPy, 'build_ext': MakeBuild}, + ext_modules=[Extension('vec0', sources=[])], +) diff --git a/site/features/knn.md b/site/features/knn.md index 70b69abb..0ede29de 100644 --- a/site/features/knn.md +++ b/site/features/knn.md @@ -7,7 +7,7 @@ Currently there are two ways to to perform KNN queries with `sqlite-vec`: With `vec0` virtual tables and "manually" with regular tables. The `vec0` virtual table is faster and more compact, but is less flexible and requires `JOIN`s back to your source tables. -The "manual" method is more flexible and +The "manual" method is more flexible and allows for more granular queries, but may be slower and use more space. diff --git a/site/guides/matryoshka.md b/site/guides/matryoshka.md index 7e02add3..12f8de67 100644 --- a/site/guides/matryoshka.md +++ b/site/guides/matryoshka.md @@ -1,7 +1,7 @@ # Matryoshka (Adaptive-Length) Embeddings Matryoshka embeddings are a new class of embedding models introduced in the -TODO-YYY paper [_TODO title_](https://arxiv.org/abs/2205.13147). They allow one +26 May 2022 paper titled [Matryoshka Representation Learning](https://arxiv.org/abs/2205.13147). They allow one to truncate excess dimensions in large vector, without sacrificing much quality. 
Let's say your embedding model generate 1024-dimensional vectors. If you have 1 @@ -16,7 +16,7 @@ Matryoshka embeddings, on the other hand, _can_ be truncated, without losing muc quality. Using [`mixedbread.ai`](#TODO) `mxbai-embed-large-v1` model, they claim that -They are called "Matryoshka" embeddings because ... TODO +They are called "Matryoshka" embeddings after the "Matryoshka dolls", also known as "Russian nesting dolls", which are a set of wooden dolls of decreasing size that are placed inside one another. In a similar way, Matryoshka embedding can store more important information in earlier dimensions, and less important information in later dimensions. See more about Matryoshka embeddings at [Hugging Face](https://huggingface.co/blog/matryoshka) ## Matryoshka Embeddings with `sqlite-vec` diff --git a/site/using/ruby.md b/site/using/ruby.md index 4c6b7c0e..a8707a0a 100644 --- a/site/using/ruby.md +++ b/site/using/ruby.md @@ -35,6 +35,6 @@ If your embeddings are provided as a list of numbers, use `.pack("f*")` to conve ```ruby embedding = [0.1, 0.2, 0.3, 0.4] -result = db.execute("SELECT vec_length(?)", [query.pack("f*")]]) +result = db.execute("SELECT vec_length(?)", [embedding.pack("f*")]) puts result.first.first # 4 ``` diff --git a/sqlite-vec.c b/sqlite-vec.c index 3cc802f0..6afe07b7 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -61,18 +61,6 @@ SQLITE_EXTENSION_INIT1 #define LONGDOUBLE_TYPE long double #endif -#ifndef _WIN32 -#ifndef __EMSCRIPTEN__ -#ifndef __COSMOPOLITAN__ -#ifndef __wasi__ -typedef u_int8_t uint8_t; -typedef u_int16_t uint16_t; -typedef u_int64_t uint64_t; -#endif -#endif -#endif -#endif - typedef int8_t i8; typedef uint8_t u8; typedef int16_t i16; @@ -112,6 +100,95 @@ typedef size_t usize; #define countof(x) (sizeof(x) / sizeof((x)[0])) #define min(a, b) (((a) <= (b)) ? 
(a) : (b)) +// Locale-independent strtod implementation for parsing JSON floats +// Fixes issue #241: strtod is locale-dependent and breaks with non-C locales +// +// This custom parser always uses '.' as decimal separator regardless of locale. +// Simpler and more portable than strtod_l, with no thread-safety issues. +static double strtod_c(const char *str, char **endptr) { + const char *p = str; + double result = 0.0; + int sign = 1; + int has_digits = 0; + + // Skip leading whitespace + while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') { + p++; + } + + // Handle optional sign + if (*p == '-') { + sign = -1; + p++; + } else if (*p == '+') { + p++; + } + + // Parse integer part + while (*p >= '0' && *p <= '9') { + result = result * 10.0 + (*p - '0'); + p++; + has_digits = 1; + } + + // Parse fractional part + if (*p == '.') { + double fraction = 0.0; + double divisor = 1.0; + p++; + + while (*p >= '0' && *p <= '9') { + fraction = fraction * 10.0 + (*p - '0'); + divisor *= 10.0; + p++; + has_digits = 1; + } + + result += fraction / divisor; + } + + // Parse exponent + if ((*p == 'e' || *p == 'E') && has_digits) { + int exp_sign = 1; + int exponent = 0; + p++; + + if (*p == '-') { + exp_sign = -1; + p++; + } else if (*p == '+') { + p++; + } + + while (*p >= '0' && *p <= '9') { + exponent = exponent * 10 + (*p - '0'); + p++; + } + + // Apply exponent using pow() for accuracy + if (exponent > 0) { + double exp_mult = pow(10.0, (double)exponent); + if (exp_sign == 1) { + result *= exp_mult; + } else { + result /= exp_mult; + } + } + } + + // Set end pointer + if (endptr) { + *endptr = (char *)(has_digits ? 
p : str); + } + + // Check for overflow/underflow + if (result == HUGE_VAL || result == -HUGE_VAL) { + errno = ERANGE; + } + + return sign * result; +} + enum VectorElementType { // clang-format off SQLITE_VEC_ELEMENT_TYPE_FLOAT32 = 223 + 0, @@ -460,6 +537,58 @@ static double distance_l1_f32(const void *a, const void *b, const void *d) { return l1_f32(a, b, d); } +// https://github.com/facebookresearch/faiss/blob/77e2e79cd0a680adc343b9840dd865da724c579e/faiss/utils/hamming_distance/common.h#L34 +static u8 hamdist_table[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +static f32 distance_cosine_bit_u64(u64 *a, u64 *b, size_t n) { + f32 dot = 0; + f32 aMag = 0; + f32 bMag = 0; + + for (size_t i = 0; i < n; i++) { + dot += __builtin_popcountl(a[i] & b[i]); + aMag += __builtin_popcountl(a[i]); + bMag += __builtin_popcountl(b[i]); + } + + return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); +} + +static f32 distance_cosine_bit_u8(u8 *a, u8 *b, size_t n) { + f32 dot = 0; + f32 aMag = 0; + f32 bMag = 0; + + for (size_t i = 0; i < n; i++) { + dot += hamdist_table[a[i] & b[i]]; + aMag += hamdist_table[a[i]]; + bMag += hamdist_table[b[i]]; + } + + return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); +} + +static f32 distance_cosine_bit(const void 
*pA, const void *pB, + const void *pD) { + size_t dim = *((size_t *)pD); + + if ((dim % 64) == 0) { + return distance_cosine_bit_u64((u64 *)pA, (u64 *)pB, dim / 8 / CHAR_BIT); + } + return distance_cosine_bit_u8((u8 *)pA, (u8 *)pB, dim / CHAR_BIT); +} + static f32 distance_cosine_float(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { f32 *pVect1 = (f32 *)pVect1v; @@ -497,20 +626,6 @@ static f32 distance_cosine_int8(const void *pA, const void *pB, return 1 - (dot / (sqrt(aMag) * sqrt(bMag))); } -// https://github.com/facebookresearch/faiss/blob/77e2e79cd0a680adc343b9840dd865da724c579e/faiss/utils/hamming_distance/common.h#L34 -static u8 hamdist_table[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; - static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { @@ -524,7 +639,7 @@ static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { // From // https://github.com/ngtcp2/ngtcp2/blob/b64f1e77b5e0d880b93d31f474147fae4a1d17cc/lib/ngtcp2_ringbuf.c, // line 34-43 -static unsigned int __builtin_popcountl(unsigned int x) { +static unsigned int __builtin_popcountl(u64 x) { unsigned int c = 0; for (; x; ++c) { x &= x - 1; @@ -533,7 +648,13 @@ static unsigned int 
__builtin_popcountl(unsigned int x) { } #else #include +#ifdef _WIN64 #define __builtin_popcountl __popcnt64 +#else +static unsigned int __builtin_popcountl(u64 n) { + return __popcnt((u32)n) + __popcnt((u32)(n >> 32)); +} +#endif #endif #endif @@ -751,7 +872,7 @@ static int fvec_from_value(sqlite3_value *value, f32 **vector, char *endptr; errno = 0; - double result = strtod(ptr, &endptr); + double result = strtod_c(ptr, &endptr); if ((errno != 0 && result == 0) // some interval error? || (errno == ERANGE && (result == HUGE_VAL || result == -HUGE_VAL)) // too big / smalls @@ -1016,7 +1137,7 @@ int ensure_vector_match(sqlite3_value *aValue, sqlite3_value *bValue, void **a, if (rc != SQLITE_OK) { *outError = sqlite3_mprintf("Error reading 2nd vector: %s", error); sqlite3_free(error); - aCleanup(a); + aCleanup(*a); return SQLITE_ERROR; } @@ -1167,9 +1288,8 @@ static void vec_distance_cosine(sqlite3_context *context, int argc, switch (elementType) { case SQLITE_VEC_ELEMENT_TYPE_BIT: { - sqlite3_result_error( - context, "Cannot calculate cosine distance between two bitvectors.", - -1); + f32 result = distance_cosine_bit(a, b, &dimensions); + sqlite3_result_double(context, result); goto finish; } case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { @@ -3376,6 +3496,7 @@ static sqlite3_module vec_npy_eachModule = { #define VEC0_COLUMN_USERN_START 1 #define VEC0_COLUMN_OFFSET_DISTANCE 1 #define VEC0_COLUMN_OFFSET_K 2 +#define VEC0_COLUMN_OFFSET_TABLE_NAME 3 #define VEC0_SHADOW_INFO_NAME "\"%w\".\"%w_info\"" @@ -3416,7 +3537,7 @@ static sqlite3_module vec_npy_eachModule = { /// 1) schema, 2) original vtab table name #define VEC0_SHADOW_VECTOR_N_CREATE \ "CREATE TABLE " VEC0_SHADOW_VECTOR_N_NAME "(" \ - "rowid PRIMARY KEY," \ + "rowid INTEGER PRIMARY KEY," \ "vectors BLOB NOT NULL" \ ");" @@ -3645,6 +3766,17 @@ int vec0_column_k_idx(vec0_vtab *p) { VEC0_COLUMN_OFFSET_K; } +/** + * @brief Returns the index of the table_name hidden column for the given vec0 table. 
+ * + * @param p vec0 table + * @return int + */ +int vec0_column_table_name_idx(vec0_vtab *p) { + return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + + VEC0_COLUMN_OFFSET_TABLE_NAME; +} + /** * Returns 1 if the given column-based index is a valid vector column, * 0 otherwise. @@ -4862,6 +4994,9 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } + const char *schemaName = argv[1]; + const char *tableName = argv[2]; + sqlite3_str *createStr = sqlite3_str_new(NULL); sqlite3_str_appendall(createStr, "CREATE TABLE x("); if (pkColumnName) { @@ -4903,7 +5038,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } } - sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); + sqlite3_str_appendall(createStr, " distance hidden, k hidden, "); + sqlite3_str_appendf(createStr, "%s hidden) ", tableName); if (pkColumnName) { sqlite3_str_appendall(createStr, "without rowid "); } @@ -4920,9 +5056,6 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } - const char *schemaName = argv[1]; - const char *tableName = argv[2]; - pNew->db = db; pNew->pkIsText = pkColumnType == SQLITE_TEXT; pNew->schemaName = sqlite3_mprintf("%s", schemaName); @@ -5094,7 +5227,7 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } for (int i = 0; i < pNew->numMetadataColumns; i++) { - char *zSql = sqlite3_mprintf("CREATE TABLE " VEC0_SHADOW_METADATA_N_NAME "(rowid PRIMARY KEY, data BLOB NOT NULL);", + char *zSql = sqlite3_mprintf("CREATE TABLE " VEC0_SHADOW_METADATA_N_NAME "(rowid INTEGER PRIMARY KEY, data BLOB NOT NULL);", pNew->schemaName, pNew->tableName, i); if (!zSql) { goto error; @@ -5305,11 +5438,21 @@ static int vec0Close(sqlite3_vtab_cursor *cur) { typedef enum { // If any values are updated, please update the ARCHITECTURE.md docs accordingly! 
+ // ~~~ KNN QUERIES ~~~ // VEC0_IDXSTR_KIND_KNN_MATCH = '{', VEC0_IDXSTR_KIND_KNN_K = '}', VEC0_IDXSTR_KIND_KNN_ROWID_IN = '[', + // argv[i] is a constraint on a PARTITION KEY column in a KNN query + // VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT = ']', + + // argv[i] is a constraint on the distance column in a KNN query + VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT = '*', + + // ~~~ POINT QUERIES ~~~ // VEC0_IDXSTR_KIND_POINT_ID = '!', + + // ~~~ ??? ~~~ // VEC0_IDXSTR_KIND_METADATA_CONSTRAINT = '&', } vec0_idxstr_kind; @@ -5318,11 +5461,22 @@ typedef enum { // If any values are updated, please update the ARCHITECTURE.md docs accordingly! + // Equality constraint on a PARTITION KEY column, ex `user_id = 123` VEC0_PARTITION_OPERATOR_EQ = 'a', + + // "Greater than" constraint on a PARTITION KEY column, ex `year > 2024` VEC0_PARTITION_OPERATOR_GT = 'b', + + // "Less than or equal to" constraint on a PARTITION KEY column, ex `year <= 2024` VEC0_PARTITION_OPERATOR_LE = 'c', + + // "Less than" constraint on a PARTITION KEY column, ex `year < 2024` VEC0_PARTITION_OPERATOR_LT = 'd', + + // "Greater than or equal to" constraint on a PARTITION KEY column, ex `year >= 2024` VEC0_PARTITION_OPERATOR_GE = 'e', + + // "Not equal to" constraint on a PARTITION KEY column, ex `year != 2024` VEC0_PARTITION_OPERATOR_NE = 'f', } vec0_partition_operator; typedef enum { @@ -5333,8 +5487,23 @@ typedef enum { VEC0_METADATA_OPERATOR_EQ = 'a', VEC0_METADATA_OPERATOR_GE = 'e', VEC0_METADATA_OPERATOR_NE = 'f', VEC0_METADATA_OPERATOR_IN = 'g', + VEC0_METADATA_OPERATOR_LIKE = 'h', + VEC0_METADATA_OPERATOR_GLOB = 'i', + VEC0_METADATA_OPERATOR_IS = 'j', + VEC0_METADATA_OPERATOR_ISNOT = 'k', + VEC0_METADATA_OPERATOR_ISNULL = 'l', + VEC0_METADATA_OPERATOR_ISNOTNULL = 'm', } vec0_metadata_operator; + +typedef enum { + + VEC0_DISTANCE_CONSTRAINT_GT = 'a', + VEC0_DISTANCE_CONSTRAINT_GE = 'b', + VEC0_DISTANCE_CONSTRAINT_LT = 'c', + VEC0_DISTANCE_CONSTRAINT_LE = 'd', +} vec0_distance_constraint_operator; + static int
vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { vec0_vtab *p = (vec0_vtab *)pVTab; /** @@ -5494,6 +5663,7 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { } #endif + // find any PARTITION KEY column constraints for (int i = 0; i < pIdxInfo->nConstraint; i++) { if (!pIdxInfo->aConstraint[i].usable) continue; @@ -5548,6 +5718,7 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { } + // find any metadata column constraints for (int i = 0; i < pIdxInfo->nConstraint; i++) { if (!pIdxInfo->aConstraint[i].usable) continue; @@ -5615,22 +5786,64 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { value = VEC0_METADATA_OPERATOR_NE; break; } + case SQLITE_INDEX_CONSTRAINT_LIKE: { + value = VEC0_METADATA_OPERATOR_LIKE; + break; + } + case SQLITE_INDEX_CONSTRAINT_GLOB: { + value = VEC0_METADATA_OPERATOR_GLOB; + break; + } + case SQLITE_INDEX_CONSTRAINT_IS: { + value = VEC0_METADATA_OPERATOR_IS; + break; + } + case SQLITE_INDEX_CONSTRAINT_ISNOT: { + value = VEC0_METADATA_OPERATOR_ISNOT; + break; + } + case SQLITE_INDEX_CONSTRAINT_ISNULL: { + value = VEC0_METADATA_OPERATOR_ISNULL; + break; + } + case SQLITE_INDEX_CONSTRAINT_ISNOTNULL: { + value = VEC0_METADATA_OPERATOR_ISNOTNULL; + break; + } default: { // IMP: V16511_00582 rc = SQLITE_ERROR; vtab_set_error(pVTab, "An illegal WHERE constraint was provided on a vec0 metadata column in a KNN query. " - "Only one of EQUALS, GREATER_THAN, LESS_THAN_OR_EQUAL, LESS_THAN, GREATER_THAN_OR_EQUAL, NOT_EQUALS is allowed." + "Only one of EQUALS, GREATER_THAN, LESS_THAN_OR_EQUAL, LESS_THAN, GREATER_THAN_OR_EQUAL, NOT_EQUALS, LIKE, GLOB, IS, IS NOT, IS NULL, IS NOT NULL is allowed." 
); goto done; } } if(p->metadata_columns[metadata_idx].kind == VEC0_METADATA_COLUMN_KIND_BOOLEAN) { - if(!(value == VEC0_METADATA_OPERATOR_EQ || value == VEC0_METADATA_OPERATOR_NE)) { + if(!(value == VEC0_METADATA_OPERATOR_EQ || value == VEC0_METADATA_OPERATOR_NE || + value == VEC0_METADATA_OPERATOR_IS || value == VEC0_METADATA_OPERATOR_ISNOT || + value == VEC0_METADATA_OPERATOR_ISNULL || value == VEC0_METADATA_OPERATOR_ISNOTNULL)) { // IMP: V10145_26984 rc = SQLITE_ERROR; - vtab_set_error(pVTab, "ONLY EQUALS (=) or NOT_EQUALS (!=) operators are allowed on boolean metadata columns."); + vtab_set_error(pVTab, "ONLY EQUALS (=), NOT_EQUALS (!=), IS, IS NOT, IS NULL, or IS NOT NULL operators are allowed on boolean metadata columns."); + goto done; + } + } + + if(value == VEC0_METADATA_OPERATOR_LIKE) { + if(p->metadata_columns[metadata_idx].kind != VEC0_METADATA_COLUMN_KIND_TEXT) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "LIKE operator is only allowed on TEXT metadata columns."); + goto done; + } + } + + if(value == VEC0_METADATA_OPERATOR_GLOB) { + if(p->metadata_columns[metadata_idx].kind != VEC0_METADATA_COLUMN_KIND_TEXT) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "GLOB operator is only allowed on TEXT metadata columns."); goto done; } } @@ -5644,6 +5857,58 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { } + // find any distance column constraints + for (int i = 0; i < pIdxInfo->nConstraint; i++) { + if (!pIdxInfo->aConstraint[i].usable) + continue; + + int iColumn = pIdxInfo->aConstraint[i].iColumn; + int op = pIdxInfo->aConstraint[i].op; + if(op == SQLITE_INDEX_CONSTRAINT_LIMIT || op == SQLITE_INDEX_CONSTRAINT_OFFSET) { + continue; + } + if(vec0_column_distance_idx(p) != iColumn) { + continue; + } + + char value = 0; + switch(op) { + case SQLITE_INDEX_CONSTRAINT_GT: { + value = VEC0_DISTANCE_CONSTRAINT_GT; + break; + } + case SQLITE_INDEX_CONSTRAINT_GE: { + value = VEC0_DISTANCE_CONSTRAINT_GE; + break; + } + case 
SQLITE_INDEX_CONSTRAINT_LT: { + value = VEC0_DISTANCE_CONSTRAINT_LT; + break; + } + case SQLITE_INDEX_CONSTRAINT_LE: { + value = VEC0_DISTANCE_CONSTRAINT_LE; + break; + } + default: { + // IMP TODO + rc = SQLITE_ERROR; + vtab_set_error( + pVTab, + "Illegal WHERE constraint on distance column in a KNN query. " + "Only one of GT, GE, LT, LE constraints are allowed." + ); + goto done; + } + } + + pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++; + pIdxInfo->aConstraintUsage[i].omit = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT); + sqlite3_str_appendchar(idxStr, 1, value); + sqlite3_str_appendchar(idxStr, 1, '_'); + sqlite3_str_appendchar(idxStr, 1, '_'); + } + pIdxInfo->idxNum = iMatchVectorTerm; @@ -5672,7 +5937,6 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { } pIdxInfo->needToFreeIdxStr = 1; - rc = SQLITE_OK; done: @@ -5962,6 +6226,41 @@ struct Vec0MetadataInTextEntry { char * zString; }; +// Helper function to detect if a LIKE pattern is prefix-only (e.g., 'abc%') +// Returns 1 if the pattern ends with '%' and has no wildcards in the middle +// Returns 0 otherwise +static int vec0_is_prefix_only_like_pattern(const char *pattern, int n) { + if (n == 0) return 0; + + // Must end with '%' + if (pattern[n - 1] != '%') return 0; + + // Check for wildcards in the prefix (before the trailing '%') + for (int i = 0; i < n - 1; i++) { + if (pattern[i] == '%' || pattern[i] == '_') { + return 0; + } + } + + return 1; +} + +static int vec0_is_prefix_only_glob_pattern(const char *pattern, int n) { + if (n == 0) return 0; + + // Must end with '*' + if (pattern[n - 1] != '*') return 0; + + // Check for wildcards in the prefix (before the trailing '*') + for (int i = 0; i < n - 1; i++) { + if (pattern[i] == '*' || pattern[i] == '?') { + return 0; + } + } + + return 1; +} + int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void * buffer, int size, vec0_metadata_operator op, u8* b, int 
metadata_idx, int chunk_rowid, struct Array * aMetadataIn, int argv_idx) { int rc; @@ -5980,7 +6279,7 @@ int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void * return rc; } assert(sqlite3_blob_bytes(rowidsBlob) % sizeof(i64) == 0); - assert((sqlite3_blob_bytes(rowidsBlob) / sizeof(i64)) == size); + assert((size_t)(sqlite3_blob_bytes(rowidsBlob) / sizeof(i64)) == (size_t)size); rowids = sqlite3_malloc(sqlite3_blob_bytes(rowidsBlob)); if(!rowids) { @@ -5995,6 +6294,13 @@ int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void * } sqlite3_blob_close(rowidsBlob); + // Map IS/ISNOT to EQ/NE (they behave identically for text) + if(op == VEC0_METADATA_OPERATOR_IS) { + op = VEC0_METADATA_OPERATOR_EQ; + } else if(op == VEC0_METADATA_OPERATOR_ISNOT) { + op = VEC0_METADATA_OPERATOR_NE; + } + switch(op) { int nPrefix; char * sPrefix; @@ -6200,7 +6506,7 @@ int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void * } case VEC0_METADATA_OPERATOR_IN: { - size_t metadataInIdx = -1; + int metadataInIdx = -1; for(size_t i = 0; i < aMetadataIn->length; i++) { struct Vec0MetadataIn * metadataIn = &(((struct Vec0MetadataIn *) aMetadataIn->z)[i]); if(metadataIn->argv_idx == argv_idx) { @@ -6260,60 +6566,252 @@ int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void * break; } - } - rc = SQLITE_OK; + case VEC0_METADATA_OPERATOR_LIKE: { + int is_prefix_only = vec0_is_prefix_only_like_pattern(sTarget, nTarget); - done: - sqlite3_finalize(stmt); - sqlite3_free(rowids); - return rc; + if (is_prefix_only) { + // Fast path: prefix-only pattern (e.g., 'abc%') + // Can use the 12-byte cache for optimization + int nPattern = nTarget - 1; // Exclude trailing '%' -} + for(int i = 0; i < size; i++) { + view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; + nPrefix = ((int*) view)[0]; + sPrefix = (char *) &view[4]; -/** - * @brief Fill in bitmap of chunk values, whether or not the values match a 
metadata constraint - * - * @param p vec0_vtab - * @param metadata_idx index of the metatadata column to perfrom constraints on - * @param value sqlite3_value of the constraints value - * @param blob sqlite3_blob that is already opened on the metdata column's shadow chunk table - * @param chunk_rowid rowid of the chunk to calculate on - * @param b pre-allocated and zero'd out bitmap to write results to - * @param size size of the chunk - * @return int SQLITE_OK on success, error code otherwise - */ -int vec0_set_metadata_filter_bitmap( - vec0_vtab *p, - int metadata_idx, - vec0_metadata_operator op, - sqlite3_value * value, - sqlite3_blob * blob, - i64 chunk_rowid, - u8* b, - int size, - struct Array * aMetadataIn, int argv_idx) { - // TODO: shouldn't this skip in-valid entries from the chunk's validity bitmap? + // String must be at least as long as the pattern prefix + if(nPrefix < nPattern) { + bitmap_set(b, i, 0); + continue; + } - int rc; - rc = sqlite3_blob_reopen(blob, chunk_rowid); - if(rc != SQLITE_OK) { - return rc; - } + // Compare pattern prefix against cached prefix (case-insensitive) + int cmpLen = min(nPattern, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH); + int cmpPrefix = sqlite3_strnicmp(sPrefix, sTarget, cmpLen); - vec0_metadata_column_kind kind = p->metadata_columns[metadata_idx].kind; - int szMatch = 0; - int blobSize = sqlite3_blob_bytes(blob); - switch(kind) { - case VEC0_METADATA_COLUMN_KIND_BOOLEAN: { - szMatch = blobSize == size / CHAR_BIT; - break; - } - case VEC0_METADATA_COLUMN_KIND_INTEGER: { - szMatch = blobSize == size * sizeof(i64); - break; - } - case VEC0_METADATA_COLUMN_KIND_FLOAT: { - szMatch = blobSize == size * sizeof(double); + // For short strings (fits in cache), prefix comparison is enough + if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) { + bitmap_set(b, i, cmpPrefix == 0); + continue; + } + + // For long strings, if cached prefix doesn't match, reject early + if(cmpPrefix != 0) { + bitmap_set(b, i, 0); + continue; + } + + // 
If pattern fits in cache, it matches + if(nPattern <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) { + bitmap_set(b, i, 1); + continue; + } + + // Pattern is longer than cache, need to check full string + rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull); + if(rc != SQLITE_OK) { + goto done; + } + if(nPrefix != nFull) { + rc = SQLITE_ERROR; + goto done; + } + + // Check if full string starts with pattern prefix (case-insensitive) + bitmap_set(b, i, sqlite3_strnicmp(sFull, sTarget, nPattern) == 0); + } + } else { + // Slow path: complex pattern (e.g., '%abc', 'a%b', etc.) + // Must fetch and check full string for each row + for(int i = 0; i < size; i++) { + view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; + nPrefix = ((int*) view)[0]; + sPrefix = (char *) &view[4]; + + // For short strings, use cached value directly + if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) { + // sqlite3_strlike returns 0 on match, non-zero otherwise + bitmap_set(b, i, sqlite3_strlike(sTarget, sPrefix, 0) == 0); + continue; + } + + // For long strings, fetch full value + rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull); + if(rc != SQLITE_OK) { + goto done; + } + if(nPrefix != nFull) { + rc = SQLITE_ERROR; + goto done; + } + + // Use SQLite's LIKE implementation + bitmap_set(b, i, sqlite3_strlike(sTarget, sFull, 0) == 0); + } + } + break; + } + + case VEC0_METADATA_OPERATOR_GLOB: { + int is_prefix_only = vec0_is_prefix_only_glob_pattern(sTarget, nTarget); + + if (is_prefix_only) { + // Fast path: prefix-only pattern (e.g., 'abc*') + // Can use the 12-byte cache for optimization + int nPattern = nTarget - 1; // Exclude trailing '*' + + for(int i = 0; i < size; i++) { + view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; + nPrefix = ((int*) view)[0]; + sPrefix = (char *) &view[4]; + + // String must be at least as long as the pattern prefix + if(nPrefix < nPattern) { + bitmap_set(b, i, 
0); + continue; + } + + // Compare pattern prefix against cached prefix (case-sensitive for GLOB) + int cmpLen = min(nPattern, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH); + int cmpPrefix = strncmp(sPrefix, sTarget, cmpLen); + + // For short strings (fits in cache), prefix comparison is enough + if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) { + bitmap_set(b, i, cmpPrefix == 0); + continue; + } + + // For long strings, if cached prefix doesn't match, reject early + if(cmpPrefix != 0) { + bitmap_set(b, i, 0); + continue; + } + + // If pattern fits in cache, it matches + if(nPattern <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) { + bitmap_set(b, i, 1); + continue; + } + + // Pattern is longer than cache, need to check full string + rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull); + if(rc != SQLITE_OK) { + goto done; + } + if(nPrefix != nFull) { + rc = SQLITE_ERROR; + goto done; + } + + // Check if full string starts with pattern prefix (case-sensitive) + bitmap_set(b, i, strncmp(sFull, sTarget, nPattern) == 0); + } + } else { + // Slow path: complex pattern (e.g., '*abc', 'a*b', etc.) 
+ // Must fetch and check full string for each row + for(int i = 0; i < size; i++) { + view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; + nPrefix = ((int*) view)[0]; + sPrefix = (char *) &view[4]; + + // For short strings, use cached value directly + if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) { + // sqlite3_strglob returns 0 on match, non-zero otherwise + bitmap_set(b, i, sqlite3_strglob(sTarget, sPrefix) == 0); + continue; + } + + // For long strings, fetch full value + rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull); + if(rc != SQLITE_OK) { + goto done; + } + if(nPrefix != nFull) { + rc = SQLITE_ERROR; + goto done; + } + + // Use SQLite's GLOB implementation + bitmap_set(b, i, sqlite3_strglob(sTarget, sFull) == 0); + } + } + break; + } + + case VEC0_METADATA_OPERATOR_IS: + case VEC0_METADATA_OPERATOR_ISNOT: { + // Should never be reached - IS/ISNOT are mapped to EQ/NE before the switch + break; + } + + case VEC0_METADATA_OPERATOR_ISNULL: { + // IS NULL always returns false (metadata columns don't support NULL) + // All bits stay 0 (already initialized) + break; + } + + case VEC0_METADATA_OPERATOR_ISNOTNULL: { + // IS NOT NULL always returns true (metadata columns don't support NULL) + for(int i = 0; i < size; i++) { + bitmap_set(b, i, 1); + } + break; + } + + } + rc = SQLITE_OK; + + done: + sqlite3_finalize(stmt); + sqlite3_free(rowids); + return rc; + +} + +/** + * @brief Fill in bitmap of chunk values, whether or not the values match a metadata constraint + * + * @param p vec0_vtab + * @param metadata_idx index of the metatadata column to perfrom constraints on + * @param value sqlite3_value of the constraints value + * @param blob sqlite3_blob that is already opened on the metdata column's shadow chunk table + * @param chunk_rowid rowid of the chunk to calculate on + * @param b pre-allocated and zero'd out bitmap to write results to + * @param size size of the chunk + * @return int SQLITE_OK 
on success, error code otherwise + */ +int vec0_set_metadata_filter_bitmap( + vec0_vtab *p, + int metadata_idx, + vec0_metadata_operator op, + sqlite3_value * value, + sqlite3_blob * blob, + i64 chunk_rowid, + u8* b, + int size, + struct Array * aMetadataIn, int argv_idx) { + // TODO: shouldn't this skip in-valid entries from the chunk's validity bitmap? + + int rc; + rc = sqlite3_blob_reopen(blob, chunk_rowid); + if(rc != SQLITE_OK) { + return rc; + } + + vec0_metadata_column_kind kind = p->metadata_columns[metadata_idx].kind; + int szMatch = 0; + int blobSize = sqlite3_blob_bytes(blob); + switch(kind) { + case VEC0_METADATA_COLUMN_KIND_BOOLEAN: { + szMatch = blobSize == size / CHAR_BIT; + break; + } + case VEC0_METADATA_COLUMN_KIND_INTEGER: { + szMatch = blobSize == (int)(size * sizeof(i64)); + break; + } + case VEC0_METADATA_COLUMN_KIND_FLOAT: { + szMatch = blobSize == (int)(size * sizeof(double)); break; } case VEC0_METADATA_COLUMN_KIND_TEXT: { @@ -6335,11 +6833,41 @@ int vec0_set_metadata_filter_bitmap( switch(kind) { case VEC0_METADATA_COLUMN_KIND_BOOLEAN: { int target = sqlite3_value_int(value); - if( (target && op == VEC0_METADATA_OPERATOR_EQ) || (!target && op == VEC0_METADATA_OPERATOR_NE)) { - for(int i = 0; i < size; i++) { bitmap_set(b, i, bitmap_get((u8*) buffer, i)); } - } - else { - for(int i = 0; i < size; i++) { bitmap_set(b, i, !bitmap_get((u8*) buffer, i)); } + switch(op) { + case VEC0_METADATA_OPERATOR_EQ: + case VEC0_METADATA_OPERATOR_IS: { + // EQ and IS behave identically for booleans + if(target) { + for(int i = 0; i < size; i++) { bitmap_set(b, i, bitmap_get((u8*) buffer, i)); } + } else { + for(int i = 0; i < size; i++) { bitmap_set(b, i, !bitmap_get((u8*) buffer, i)); } + } + break; + } + case VEC0_METADATA_OPERATOR_NE: + case VEC0_METADATA_OPERATOR_ISNOT: { + // NE and IS NOT behave identically for booleans + if(target) { + for(int i = 0; i < size; i++) { bitmap_set(b, i, !bitmap_get((u8*) buffer, i)); } + } else { + for(int i = 0; i < 
size; i++) { bitmap_set(b, i, bitmap_get((u8*) buffer, i)); } + } + break; + } + case VEC0_METADATA_OPERATOR_ISNULL: { + // IS NULL always returns false (metadata columns don't support NULL) + for(int i = 0; i < size; i++) { bitmap_set(b, i, 0); } + break; + } + case VEC0_METADATA_OPERATOR_ISNOTNULL: { + // IS NOT NULL always returns true (metadata columns don't support NULL) + for(int i = 0; i < size; i++) { bitmap_set(b, i, 1); } + break; + } + default: { + // Should not reach here if xBestIndex validation works correctly + break; + } } break; } @@ -6397,6 +6925,34 @@ int vec0_set_metadata_filter_bitmap( } break; } + case VEC0_METADATA_OPERATOR_LIKE: { + // should never be reached (LIKE only applies to TEXT columns) + break; + } + case VEC0_METADATA_OPERATOR_GLOB: { + // should never be reached (GLOB only applies to TEXT columns) + break; + } + case VEC0_METADATA_OPERATOR_IS: { + // IS behaves like = for non-NULL values + for(int i = 0; i < size; i++) { bitmap_set(b, i, array[i] == target); } + break; + } + case VEC0_METADATA_OPERATOR_ISNOT: { + // IS NOT behaves like != for non-NULL values + for(int i = 0; i < size; i++) { bitmap_set(b, i, array[i] != target); } + break; + } + case VEC0_METADATA_OPERATOR_ISNULL: { + // IS NULL always returns false (metadata columns don't support NULL) + for(int i = 0; i < size; i++) { bitmap_set(b, i, 0); } + break; + } + case VEC0_METADATA_OPERATOR_ISNOTNULL: { + // IS NOT NULL always returns true (metadata columns don't support NULL) + for(int i = 0; i < size; i++) { bitmap_set(b, i, 1); } + break; + } } break; } @@ -6432,6 +6988,34 @@ int vec0_set_metadata_filter_bitmap( // should never be reached break; } + case VEC0_METADATA_OPERATOR_LIKE: { + // should never be reached (LIKE only applies to TEXT columns) + break; + } + case VEC0_METADATA_OPERATOR_GLOB: { + // should never be reached (GLOB only applies to TEXT columns) + break; + } + case VEC0_METADATA_OPERATOR_IS: { + // IS behaves like = for non-NULL values + for(int i = 
0; i < size; i++) { bitmap_set(b, i, array[i] == target); } + break; + } + case VEC0_METADATA_OPERATOR_ISNOT: { + // IS NOT behaves like != for non-NULL values + for(int i = 0; i < size; i++) { bitmap_set(b, i, array[i] != target); } + break; + } + case VEC0_METADATA_OPERATOR_ISNULL: { + // IS NULL always returns false (metadata columns don't support NULL) + for(int i = 0; i < size; i++) { bitmap_set(b, i, 0); } + break; + } + case VEC0_METADATA_OPERATOR_ISNOTNULL: { + // IS NOT NULL always returns true (metadata columns don't support NULL) + for(int i = 0; i < size; i++) { bitmap_set(b, i, 1); } + break; + } } break; } @@ -6560,12 +7144,15 @@ int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, int numValueEntries = (idxStrLength-1) / 4; assert(numValueEntries == argc); int hasMetadataFilters = 0; + int hasDistanceConstraints = 0; for(int i = 0; i < argc; i++) { int idx = 1 + (i * 4); char kind = idxStr[idx + 0]; if(kind == VEC0_IDXSTR_KIND_METADATA_CONSTRAINT) { hasMetadataFilters = 1; - break; + } + else if(kind == VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT) { + hasDistanceConstraints = 1; } } @@ -6599,7 +7186,7 @@ int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2); i64 rowidsSize = sqlite3_column_bytes(stmtChunks, 2); - if (rowidsSize != p->chunk_size * sizeof(i64)) { + if (rowidsSize != (i64)(p->chunk_size * sizeof(i64))) { // IMP: V02796_19635 vtab_set_error(&p->base, "rowids size doesn't match"); vtab_set_error( @@ -6693,7 +7280,7 @@ int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, continue; }; - f32 result; + f32 result = 0.0f; switch (vector_column->element_type) { case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: { const f32 *base_i = @@ -6752,6 +7339,58 @@ int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, chunk_distances[i] = result; } + if(hasDistanceConstraints) { + for(int i = 0; i < argc; i++) { + int idx = 1 + (i * 4); + 
char kind = idxStr[idx + 0]; + // Note: SQLite provides distance constraint values as f64 (double), but we + // cast to f32 (float) for comparison. This matches the precision of our + // internal distance calculations (which use f32) and avoids precision + // mismatches. May result in minor precision loss for very small differences. + f32 target = (f32) sqlite3_value_double(argv[i]); + + if(kind != VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT) { + continue; + } + vec0_distance_constraint_operator op = idxStr[idx + 1]; + + switch(op) { + case VEC0_DISTANCE_CONSTRAINT_GE: { + for(int j = 0; j < p->chunk_size; j++) { + if(bitmap_get(b, j) && !(chunk_distances[j] >= target)) { + bitmap_set(b, j, 0); + } + } + break; + } + case VEC0_DISTANCE_CONSTRAINT_GT: { + for(int j = 0; j < p->chunk_size; j++) { + if(bitmap_get(b, j) && !(chunk_distances[j] > target)) { + bitmap_set(b, j, 0); + } + } + break; + } + case VEC0_DISTANCE_CONSTRAINT_LE: { + for(int j = 0; j < p->chunk_size; j++) { + if(bitmap_get(b, j) && !(chunk_distances[j] <= target)) { + bitmap_set(b, j, 0); + } + } + break; + } + case VEC0_DISTANCE_CONSTRAINT_LT: { + for(int j = 0; j < p->chunk_size; j++) { + if(bitmap_get(b, j) && !(chunk_distances[j] < target)) { + bitmap_set(b, j, 0); + } + } + break; + } + } + } + } + int used1; min_idx(chunk_distances, p->chunk_size, b, chunk_topk_idxs, min(k, p->chunk_size), bTaken, &used1); @@ -6803,7 +7442,7 @@ int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, const char *idxStr, int argc, sqlite3_value **argv) { - assert(argc == (strlen(idxStr)-1) / 4); + assert(argc == (int)((strlen(idxStr)-1) / 4)); int rc; struct vec0_query_knn_data *knn_data; @@ -7797,8 +8436,8 @@ static int vec0_write_vector_to_vector_blob(sqlite3_blob *blobVectors, i64 chunk_offset, const void *bVector, size_t dimensions, enum VectorElementType element_type) { - int n; - int offset; + int n = 0; + int offset = 0; switch 
(element_type) { case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: @@ -7813,6 +8452,8 @@ vec0_write_vector_to_vector_blob(sqlite3_blob *blobVectors, i64 chunk_offset, n = dimensions / CHAR_BIT; offset = chunk_offset * dimensions / CHAR_BIT; break; + default: + return SQLITE_ERROR; } return sqlite3_blob_write(blobVectors, bVector, n, offset); @@ -8230,6 +8871,13 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, goto cleanup; } + // Cannot insert a value in the hidden "table_name" column + if (sqlite3_value_type(argv[2 + vec0_column_table_name_idx(p)]) != SQLITE_NULL) { + vtab_set_error(pVTab, "A value was provided for the hidden \"table_name\" column."); + rc = SQLITE_ERROR; + goto cleanup; + } + // Step #1: Insert/get a rowid for this row, from the _rowids table. rc = vec0Update_InsertRowidStep(p, argv[2 + VEC0_COLUMN_ID], &rowid); if (rc != SQLITE_OK) { @@ -8434,6 +9082,101 @@ int vec0Update_Delete_DeleteRowids(vec0_vtab *p, i64 rowid) { return rc; } +// Clear the rowid slot in v_chunks.rowids for the given chunk/offset +int vec0Update_Delete_ClearRowid(vec0_vtab *p, i64 chunk_id, i64 chunk_offset) { + int rc; + sqlite3_blob *blobChunksRowids = NULL; + + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids", + chunk_id, 1, &blobChunksRowids); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "could not open rowids blob for %s.%s.%lld", + p->schemaName, p->shadowChunksName, chunk_id); + return SQLITE_ERROR; + } + + i64 expected = p->chunk_size * sizeof(i64); + i64 actual = sqlite3_blob_bytes(blobChunksRowids); + if (expected != actual) { + vtab_set_error(&p->base, + VEC_INTERAL_ERROR + "rowids blob size mismatch on %s.%s.%lld. 
Expected %lld, actual %lld", + p->schemaName, p->shadowChunksName, chunk_id, expected, actual); + sqlite3_blob_close(blobChunksRowids); + return SQLITE_ERROR; + } + + i64 zero = 0; + rc = sqlite3_blob_write(blobChunksRowids, &zero, sizeof(i64), + chunk_offset * sizeof(i64)); + int brc = sqlite3_blob_close(blobChunksRowids); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "could not write rowids blob on %s.%s.%lld", + p->schemaName, p->shadowChunksName, chunk_id); + return rc; + } + if (brc != SQLITE_OK) { + vtab_set_error(&p->base, + "could not close rowids blob on %s.%s.%lld", + p->schemaName, p->shadowChunksName, chunk_id); + return brc; + } + return SQLITE_OK; +} + +// Clear the vector bytes for each vector column at the given chunk/offset +int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id, i64 chunk_offset) { + for (int i = 0; i < p->numVectorColumns; i++) { + int rc; + sqlite3_blob *blobVectors = NULL; + + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i], + "vectors", chunk_id, 1, &blobVectors); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "Could not open vectors blob for %s.%s.%lld", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id); + return rc; + } + + i64 expected = p->chunk_size * vector_column_byte_size(p->vector_columns[i]); + i64 actual = sqlite3_blob_bytes(blobVectors); + if (expected != actual) { + vtab_set_error(&p->base, + VEC_INTERAL_ERROR + "vector blob size mismatch on %s.%s.%lld. 
Expected %lld, actual %lld", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id, expected, actual); + sqlite3_blob_close(blobVectors); + return SQLITE_ERROR; + } + + size_t nbytes = vector_column_byte_size(p->vector_columns[i]); + void *zeros = sqlite3_malloc(nbytes); + if (!zeros) { + sqlite3_blob_close(blobVectors); + return SQLITE_NOMEM; + } + memset(zeros, 0, nbytes); + rc = vec0_write_vector_to_vector_blob(blobVectors, chunk_offset, zeros, + p->vector_columns[i].dimensions, + p->vector_columns[i].element_type); + sqlite3_free(zeros); + + int brc = sqlite3_blob_close(blobVectors); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "Could not write to vectors blob for %s.%s.%lld", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id); + return rc; + } + if (brc != SQLITE_OK) { + vtab_set_error(&p->base, + "Could not commit blob transaction for vectors blob for %s.%s.%lld", + p->schemaName, p->shadowVectorChunksNames[i], chunk_id); + return brc; + } + } + return SQLITE_OK; +} + int vec0Update_Delete_DeleteAux(vec0_vtab *p, i64 rowid) { int rc; sqlite3_stmt *stmt = NULL; @@ -8574,9 +9317,17 @@ int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) { // 3. zero out rowid in chunks.rowids // https://github.com/asg017/sqlite-vec/issues/54 + rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } // 4. zero out any data in vector chunks tables // https://github.com/asg017/sqlite-vec/issues/54 + rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset); + if (rc != SQLITE_OK) { + return rc; + } // 5. 
delete from _rowids table rc = vec0Update_Delete_DeleteRowids(p, rowid); @@ -8808,8 +9559,331 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { return SQLITE_OK; } +int vec0Update_SpecialInsert_OptimizeCopyMetadata(vec0_vtab *p, int metadata_column_idx, i64 src_chunk_id, i64 src_chunk_offset, i64 dst_chunk_id, i64 dst_chunk_offset) { + int rc; + struct Vec0MetadataColumnDefinition * metadata_column = &p->metadata_columns[metadata_column_idx]; + vec0_metadata_column_kind kind = metadata_column->kind; + + sqlite3_blob *srcBlob, *dstBlob; + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowMetadataChunksNames[metadata_column_idx], "data", src_chunk_id, 0, &srcBlob); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "Failed to open %s blob", p->shadowMetadataChunksNames[metadata_column_idx]); + return rc; + } + rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowMetadataChunksNames[metadata_column_idx], "data", dst_chunk_id, 1, &dstBlob); + if (rc != SQLITE_OK) { + vtab_set_error(&p->base, "Failed to open %s blob", p->shadowMetadataChunksNames[metadata_column_idx]); + sqlite3_blob_close(srcBlob); + return rc; + } + switch (kind) { + case VEC0_METADATA_COLUMN_KIND_BOOLEAN: { + u8 srcBlock, dstBlock; + rc = sqlite3_blob_read(srcBlob, &srcBlock, sizeof(u8), (int) (src_chunk_offset / CHAR_BIT)); + if (rc != SQLITE_OK) { + goto done; + } + int value = (srcBlock >> (src_chunk_offset % CHAR_BIT)) & 1; + + rc = sqlite3_blob_read(dstBlob, &dstBlock, sizeof(u8), (int) (dst_chunk_offset / CHAR_BIT)); + if (rc != SQLITE_OK) { + goto done; + } + if (value) { + dstBlock |= 1 << (dst_chunk_offset % CHAR_BIT); + } else { + dstBlock &= ~(1 << (dst_chunk_offset % CHAR_BIT)); + } + rc = sqlite3_blob_write(dstBlob, &dstBlock, sizeof(u8), dst_chunk_offset / CHAR_BIT); + if (rc != SQLITE_OK) { + goto done; + } + break; + } + case VEC0_METADATA_COLUMN_KIND_INTEGER: { + i64 value; + rc = sqlite3_blob_read(srcBlob, &value, sizeof(i64), src_chunk_offset * 
sizeof(i64)); + if (rc != SQLITE_OK) { + goto done; + } + rc = sqlite3_blob_write(dstBlob, &value, sizeof(i64), dst_chunk_offset * sizeof(i64)); + if (rc != SQLITE_OK) { + goto done; + } + break; + } + case VEC0_METADATA_COLUMN_KIND_FLOAT: { + double value; + rc = sqlite3_blob_read(srcBlob, &value, sizeof(double), src_chunk_offset * sizeof(double)); + if (rc != SQLITE_OK) { + goto done; + } + rc = sqlite3_blob_write(dstBlob, &value, sizeof(double), dst_chunk_offset * sizeof(double)); + if (rc != SQLITE_OK) { + goto done; + } + break; + } + case VEC0_METADATA_COLUMN_KIND_TEXT: { + u8 view[VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH]; + rc = sqlite3_blob_read(srcBlob, view, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH, src_chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH); + if (rc != SQLITE_OK) { + goto done; + } + rc = sqlite3_blob_write(dstBlob, view, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH, dst_chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH); + if (rc != SQLITE_OK) { + goto done; + } + break; + } + } +done: + rc = sqlite3_blob_close(srcBlob); + if (rc == SQLITE_OK) { + rc = sqlite3_blob_close(dstBlob); + } + + return rc; +} + +int vec0Update_SpecialInsert_Optimize(vec0_vtab *p) { + sqlite3_stmt *stmt = NULL, *partition_key_stmt = NULL; + int rc; + const char *zSql; + i64 prev_max_chunk_rowid = -1; + sqlite3_value *partitionKeyValues[VEC0_MAX_PARTITION_COLUMNS]; + + // 1) get the current maximum chunk_id + zSql = sqlite3_mprintf("SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME, p->schemaName, p->tableName); + if (!zSql) { + rc = SQLITE_NOMEM; + goto done; + } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK)) { + rc = SQLITE_ERROR; + goto done; + } + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW || sqlite3_column_type(stmt, 0) == SQLITE_NULL) { + if (rc == SQLITE_ROW) { + // no chunks to clear + rc = SQLITE_OK; + } else { + rc = SQLITE_ERROR; + } + goto cleanup; + } + prev_max_chunk_rowid = 
sqlite3_column_int64(stmt, 0); + if (sqlite3_step(stmt) != SQLITE_DONE) { + rc = SQLITE_ERROR; + goto cleanup; + } + sqlite3_finalize(stmt); + + // 2) for each row get the chunk_id for its partition key (if any), if the chunk_id is less than + // the previous maximum chunk_id, a new chunk needs to be created + zSql = sqlite3_mprintf("SELECT rowid, chunk_id, chunk_offset FROM " VEC0_SHADOW_ROWIDS_NAME, + p->schemaName, p->tableName); + if (!zSql) { + rc = SQLITE_NOMEM; + goto done; + } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL); + sqlite3_free((void *)zSql); + if (rc != SQLITE_OK) { + goto done; + } + + if (p->numPartitionColumns > 0) { + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendall(s, "SELECT "); + for (int i = 0; i < p->numPartitionColumns; i++) { + if (i == 0) sqlite3_str_appendf(s, "partition%02d", i); + else sqlite3_str_appendf(s, ", partition%02d", i); + } + sqlite3_str_appendf(s, " FROM " VEC0_SHADOW_CHUNKS_NAME, p->schemaName, p->tableName); + sqlite3_str_appendall(s, " WHERE chunk_id = ?"); + zSql = sqlite3_str_finish(s); + if (!zSql) { + rc = SQLITE_NOMEM; + goto cleanup; + } + rc = sqlite3_prepare_v2(p->db, zSql, -1, &partition_key_stmt, NULL); + sqlite3_free((void *)zSql); + if (rc != SQLITE_OK) { + goto cleanup; + } + } + + i64 rowid, chunk_id, chunk_offset; + i64 new_chunk_id, new_chunk_offset; + sqlite3_blob *blobChunksValidity = NULL; + const unsigned char *bufferChunksValidity = NULL; + void *vectorDatas[VEC0_MAX_VECTOR_COLUMNS]; + while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) { + rowid = sqlite3_column_int64(stmt, 0); + chunk_id = sqlite3_column_int64(stmt, 1); + chunk_offset = sqlite3_column_int64(stmt, 2); + + // get the partition key for a row + if (p->numPartitionColumns > 0) { + sqlite3_reset(partition_key_stmt); + sqlite3_clear_bindings(partition_key_stmt); + sqlite3_bind_int64(partition_key_stmt, 1, chunk_id); + if (sqlite3_step(partition_key_stmt) != SQLITE_ROW) { + goto cleanup; + } + + for (int i = 0; i 
< p->numPartitionColumns; i++) { + partitionKeyValues[i] = sqlite3_column_value(partition_key_stmt, i); + } + } + + // get the latest chunk_id for a partition key + rc = vec0_get_latest_chunk_rowid(p, &new_chunk_id, partitionKeyValues); + if (rc != SQLITE_OK) { + goto cleanup; + } + + // create a new chunk if the latest chunk_id for a partition key is less than the previous maximum chunk_id + if (new_chunk_id <= prev_max_chunk_rowid) { + rc = vec0_new_chunk(p, partitionKeyValues, NULL); + if (rc != SQLITE_OK) { + goto cleanup; + } + } + // get the vector data from all vector columns of a row + for (int i = 0; i < p->numVectorColumns; i++) { + rc = vec0_get_vector_data(p, rowid, i, &vectorDatas[i], NULL); + if (rc != SQLITE_OK) { + goto cleanup; + } + } + + // find a valid slot in the new chunk + rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues, &new_chunk_id, &new_chunk_offset, &blobChunksValidity, &bufferChunksValidity); + if (rc != SQLITE_OK) { + goto cleanup; + } + + // write vector datas to the valid slot + rc = vec0Update_InsertWriteFinalStep(p, new_chunk_id, new_chunk_offset, rowid, vectorDatas, blobChunksValidity, bufferChunksValidity); + if (rc != SQLITE_OK) { + goto cleanup; + } + sqlite3_free((void *)bufferChunksValidity); + if (sqlite3_blob_close(blobChunksValidity) != SQLITE_OK) { + rc = SQLITE_ERROR; + vtab_set_error(&p->base, + VEC_INTERAL_ERROR "unknown error, blobChunksValidity could " + "not be closed, please file an issue"); + goto cleanup; + } + + // copy metadata from previous chunk to new chunk + for (int i = 0; i < p->numMetadataColumns; i++) { + rc = vec0Update_SpecialInsert_OptimizeCopyMetadata(p, i, chunk_id, chunk_offset, new_chunk_id, new_chunk_offset); + if (rc != SQLITE_OK) { + goto cleanup; + } + } + + if (p->numPartitionColumns > 0 && sqlite3_step(partition_key_stmt) != SQLITE_DONE) { + rc = SQLITE_ERROR; + goto cleanup; + } + } + if (rc != SQLITE_DONE) { + goto cleanup; + } + sqlite3_finalize(partition_key_stmt); + 
sqlite3_finalize(stmt); + partition_key_stmt = NULL; + stmt = NULL; + + // 3) clean up old chunks + zSql = sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE chunk_id <= ?", + p->schemaName, p->tableName); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if (rc != SQLITE_OK) { + goto cleanup; + } + sqlite3_bind_int64(stmt, 1, prev_max_chunk_rowid); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto cleanup; + } + sqlite3_finalize(stmt); + + // 4) clean up old vector chunks + for (int i = 0; i < p->numVectorColumns; i++) { + zSql = sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_VECTOR_N_NAME " WHERE rowid <= ?", + p->schemaName, p->tableName, i); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if (rc != SQLITE_OK) { + goto cleanup; + } + sqlite3_bind_int64(stmt, 1, prev_max_chunk_rowid); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto cleanup; + } + sqlite3_finalize(stmt); + } + + // 5) clean up old metadata chunks + for (int i = 0; i < p->numMetadataColumns; i++) { + zSql = sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_METADATA_N_NAME " WHERE rowid <= ?", + p->schemaName, p->tableName, i); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if (rc != SQLITE_OK) { + goto cleanup; + } + sqlite3_bind_int64(stmt, 1, prev_max_chunk_rowid); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + goto cleanup; + } + sqlite3_finalize(stmt); + } + + stmt = NULL; + rc = SQLITE_OK; + +cleanup: + sqlite3_finalize(partition_key_stmt); + sqlite3_finalize(stmt); +done: + return rc; +} + +int vec0Update_SpecialInsert(sqlite3_vtab *pVTab, sqlite3_value *pVal) { + vec0_vtab *p = (vec0_vtab *)pVTab; + + const char *cmd = (const char *)sqlite3_value_text(pVal); + int n_bytes = sqlite3_value_bytes(pVal); + + if (!cmd) { + return SQLITE_NOMEM; + } + if 
(n_bytes == 8 && sqlite3_strnicmp(cmd, "optimize", 8) == 0) { + return vec0Update_SpecialInsert_Optimize(p); + } + return SQLITE_ERROR; +} + static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, sqlite_int64 *pRowid) { + // Special insert + if (argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL && + sqlite3_value_type(argv[2 + vec0_column_table_name_idx((vec0_vtab*) pVTab)]) != SQLITE_NULL) { + return vec0Update_SpecialInsert(pVTab, argv[2 + vec0_column_table_name_idx((vec0_vtab*) pVTab)]); + } // DELETE operation if (argc == 1 && sqlite3_value_type(argv[0]) != SQLITE_NULL) { return vec0Update_Delete(pVTab, argv[0]); @@ -8915,6 +9989,111 @@ static int vec0Rollback(sqlite3_vtab *pVTab) { return SQLITE_OK; } +static int vec0Rename(sqlite3_vtab *pVTab, const char *zName) { + vec0_vtab *p = (vec0_vtab *)pVTab; + sqlite3_stmt *stmt; + int rc; + const char *zSql; + + vec0_free_resources(p); + + zSql = sqlite3_mprintf("ALTER TABLE " VEC0_SHADOW_CHUNKS_NAME " RENAME TO \"%w_chunks\"", + p->schemaName, p->tableName, zName); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "could not rename chunks shadow table"); + goto done; + } + sqlite3_finalize(stmt); + + zSql = sqlite3_mprintf("ALTER TABLE " VEC0_SHADOW_INFO_NAME " RENAME TO \"%w_info\"", p->schemaName, + p->tableName, zName); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "could not rename info shadow table"); + goto done; + } + sqlite3_finalize(stmt); + + zSql = sqlite3_mprintf("ALTER TABLE " VEC0_SHADOW_ROWIDS_NAME " RENAME TO \"%w_rowids\"", p->schemaName, + p->tableName, zName); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || 
(sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "could not rename rowids shadow table"); + goto done; + } + sqlite3_finalize(stmt); + + for (int i = 0; i < p->numVectorColumns; i++) { + char *newShadowVectorChunksName = sqlite3_mprintf("%s_vector_chunks%02d", zName, i); + if (!newShadowVectorChunksName) { + return SQLITE_NOMEM; + } + zSql = sqlite3_mprintf("ALTER TABLE \"%w\".\"%w\" RENAME TO \"%w\"", p->schemaName, + p->shadowVectorChunksNames[i], newShadowVectorChunksName); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "could not rename vector_chunks shadow table"); + goto done; + } + sqlite3_finalize(stmt); + } + + if(p->numAuxiliaryColumns > 0) { + zSql = sqlite3_mprintf("ALTER TABLE " VEC0_SHADOW_AUXILIARY_NAME " RENAME TO \"%w_auxiliary\"", + p->schemaName, p->tableName, zName); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "could not rename auxiliary shadow table"); + goto done; + } + sqlite3_finalize(stmt); + } + + for (int i = 0; i < p->numMetadataColumns; i++) { + zSql = sqlite3_mprintf("ALTER TABLE " VEC0_SHADOW_METADATA_N_NAME " RENAME TO \"%w_metadatachunks%02d\"", + p->schemaName, p->tableName, i, zName, i); + rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "could not rename metadatachunks shadow table"); + goto done; + } + sqlite3_finalize(stmt); + + if(p->metadata_columns[i].kind == VEC0_METADATA_COLUMN_KIND_TEXT) { + zSql = sqlite3_mprintf("ALTER TABLE " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " RENAME TO \"%w_metadatatext%02d\"", + p->schemaName, p->tableName, i, zName, i); + rc = 
sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0); + sqlite3_free((void *)zSql); + if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { + rc = SQLITE_ERROR; + vtab_set_error(pVTab, "could not rename metadatatext shadow table"); + goto done; + } + sqlite3_finalize(stmt); + } + } + + stmt = NULL; + rc = SQLITE_OK; + +done: + sqlite3_finalize(stmt); + return rc; +} + static sqlite3_module vec0Module = { /* iVersion */ 3, /* xCreate */ vec0Create, @@ -8935,7 +10114,7 @@ static sqlite3_module vec0Module = { /* xCommit */ vec0Commit, /* xRollback */ vec0Rollback, /* xFindFunction */ 0, - /* xRename */ 0, // https://github.com/asg017/sqlite-vec/issues/43 + /* xRename */ vec0Rename, /* xSavepoint */ 0, /* xRelease */ 0, /* xRollbackTo */ 0, diff --git a/sqlite-vec.gemspec b/sqlite-vec.gemspec new file mode 100644 index 00000000..ff814f38 --- /dev/null +++ b/sqlite-vec.gemspec @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +Gem::Specification.new do |spec| + spec.name = "sqlite-vec" + spec.version = "0.2.4.alpha" + spec.authors = ["Alex Garcia", "Vlad Lasky"] + spec.email = ["alex@alex.garcia"] + + spec.summary = "A vector search SQLite extension that runs anywhere" + spec.description = "sqlite-vec is a SQLite extension for vector search, supporting float, int8, and binary vectors with minimal dependencies" + spec.homepage = "https://github.com/vlasky/sqlite-vec" + spec.licenses = ["MIT", "Apache-2.0"] + spec.required_ruby_version = ">= 2.6.0" + + spec.metadata["homepage_uri"] = spec.homepage + spec.metadata["source_code_uri"] = "https://github.com/vlasky/sqlite-vec" + spec.metadata["changelog_uri"] = "https://github.com/vlasky/sqlite-vec/blob/main/CHANGELOG.md" + + # Specify which files should be added to the gem when it is released. 
+ spec.files = Dir[ + "lib/**/*", + "sqlite-vec.c", + "sqlite-vec.h", + "sqlite-vec.h.tmpl", + "VERSION", + "vendor/**/*", + "scripts/vendor.sh", + "Makefile", + "extconf.rb", + "LICENSE*", + "README.md" + ] + + spec.require_paths = ["lib"] + + # Configure native extension build + spec.extensions = ["extconf.rb"] + + # Build the extension during gem install + spec.post_install_message = <<~MSG + sqlite-vec installed successfully! + + Load the extension in Ruby with: + db.enable_load_extension(true) + db.load_extension('vec0') + + See https://github.com/vlasky/sqlite-vec for documentation. + MSG +end diff --git a/sqlite-vec.h b/sqlite-vec.h new file mode 100644 index 00000000..98f39731 --- /dev/null +++ b/sqlite-vec.h @@ -0,0 +1,41 @@ +#ifndef SQLITE_VEC_H +#define SQLITE_VEC_H + +#ifndef SQLITE_CORE +#include "sqlite3ext.h" +#else +#include "sqlite3.h" +#endif + +#ifdef SQLITE_VEC_STATIC + #define SQLITE_VEC_API __attribute__((visibility("default"))) +#else + #ifdef _WIN32 + #define SQLITE_VEC_API __declspec(dllexport) + #else + #define SQLITE_VEC_API __attribute__((visibility("default"))) + #endif +#endif + +#define SQLITE_VEC_VERSION "v0.2.4-alpha" +// TODO rm +#define SQLITE_VEC_DATE "2026-01-04T19:18:13Z+1100" +#define SQLITE_VEC_SOURCE "c4ec0fc3a6254789d84cfa288313723fb6f2015d" + + +#define SQLITE_VEC_VERSION_MAJOR 0 +#define SQLITE_VEC_VERSION_MINOR 2 +#define SQLITE_VEC_VERSION_PATCH 4 + +#ifdef __cplusplus +extern "C" { +#endif + +SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg, + const sqlite3_api_routines *pApi); + +#ifdef __cplusplus +} /* end of the 'extern "C"' block */ +#endif + +#endif /* ifndef SQLITE_VEC_H */ diff --git a/sqlite-vec.h.tmpl b/sqlite-vec.h.tmpl index f49f62f6..702250e6 100644 --- a/sqlite-vec.h.tmpl +++ b/sqlite-vec.h.tmpl @@ -8,12 +8,12 @@ #endif #ifdef SQLITE_VEC_STATIC - #define SQLITE_VEC_API + #define SQLITE_VEC_API __attribute__((visibility("default"))) #else #ifdef _WIN32 #define SQLITE_VEC_API 
__declspec(dllexport) #else - #define SQLITE_VEC_API + #define SQLITE_VEC_API __attribute__((visibility("default"))) #endif #endif diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 00000000..2a048167 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,26 @@ +#[link(name = "sqlite_vec0")] +extern "C" { + pub fn sqlite3_vec_init(); +} + +#[cfg(test)] +mod tests { + use super::*; + + use rusqlite::{ffi::sqlite3_auto_extension, Connection}; + + #[test] + fn test_rusqlite_auto_extension() { + unsafe { + sqlite3_auto_extension(Some(std::mem::transmute(sqlite3_vec_init as *const ()))); + } + + let conn = Connection::open_in_memory().unwrap(); + + let result: String = conn + .query_row("select vec_version()", [], |x| x.get(0)) + .unwrap(); + + assert!(result.starts_with("v")); + } +} diff --git a/tests/__snapshots__/test-auxiliary.ambr b/tests/__snapshots__/test-auxiliary.ambr index bfe3d2c9..96fb1876 100644 --- a/tests/__snapshots__/test-auxiliary.ambr +++ b/tests/__snapshots__/test-auxiliary.ambr @@ -137,7 +137,7 @@ 'chunk_id': 1, 'size': 8, 'validity': b'\x06', - 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -163,7 +163,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'vectors': b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -333,13 +333,6 @@ 
'rootpage': 3, 'sql': None, }), - OrderedDict({ - 'type': 'index', - 'name': 'sqlite_autoindex_v_vector_chunks00_1', - 'tbl_name': 'v_vector_chunks00', - 'rootpage': 8, - 'sql': None, - }), OrderedDict({ 'type': 'table', 'name': 'sqlite_sequence', @@ -358,7 +351,7 @@ 'type': 'table', 'name': 'v_auxiliary', 'tbl_name': 'v_auxiliary', - 'rootpage': 9, + 'rootpage': 8, 'sql': 'CREATE TABLE "v_auxiliary"( rowid integer PRIMARY KEY , value00)', }), OrderedDict({ @@ -387,11 +380,97 @@ 'name': 'v_vector_chunks00', 'tbl_name': 'v_vector_chunks00', 'rootpage': 7, - 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)', + 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid INTEGER PRIMARY KEY,vectors BLOB NOT NULL)', }), ]), }) # --- +# name: test_renames + OrderedDict({ + 'sql': 'select rowid, * from v', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': b'\x00\x00\x80?', + 'name': 'alex', + }), + OrderedDict({ + 'rowid': 2, + 'vector': b'\x00\x00\x00@', + 'name': 'brian', + }), + OrderedDict({ + 'rowid': 3, + 'vector': b'\x00\x00@@', + 'name': 'craig', + }), + ]), + }) +# --- +# name: test_renames.1 + dict({ + 'v_auxiliary': OrderedDict({ + 'sql': 'select * from v_auxiliary', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'value00': 'alex', + }), + OrderedDict({ + 'rowid': 2, + 'value00': 'brian', + }), + OrderedDict({ + 'rowid': 3, + 'value00': 'craig', + }), + ]), + }), + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'validity': b'\x07', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 
1, + 'chunk_offset': 0, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 1, + }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 2, + }), + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- # name: test_types OrderedDict({ 'sql': 'select * from v', diff --git a/tests/__snapshots__/test-general.ambr b/tests/__snapshots__/test-general.ambr index 0eac460f..177b58ba 100644 --- a/tests/__snapshots__/test-general.ambr +++ b/tests/__snapshots__/test-general.ambr @@ -33,25 +33,11 @@ 'rootpage': 3, 'sql': None, }), - OrderedDict({ - 'type': 'index', - 'name': 'sqlite_autoindex_v_metadatachunks00_1', - 'tbl_name': 'v_metadatachunks00', - 'rootpage': 10, - 'sql': None, - }), OrderedDict({ 'type': 'index', 'name': 'sqlite_autoindex_v_metadatatext00_1', 'tbl_name': 'v_metadatatext00', - 'rootpage': 12, - 'sql': None, - }), - OrderedDict({ - 'type': 'index', - 'name': 'sqlite_autoindex_v_vector_chunks00_1', - 'tbl_name': 'v_vector_chunks00', - 'rootpage': 8, + 'rootpage': 10, 'sql': None, }), OrderedDict({ @@ -72,7 +58,7 @@ 'type': 'table', 'name': 'v_auxiliary', 'tbl_name': 'v_auxiliary', - 'rootpage': 13, + 'rootpage': 11, 'sql': 'CREATE TABLE "v_auxiliary"( rowid integer PRIMARY KEY , value00)', }), OrderedDict({ @@ -93,14 +79,14 @@ 'type': 'table', 'name': 'v_metadatachunks00', 'tbl_name': 'v_metadatachunks00', - 'rootpage': 9, - 'sql': 'CREATE TABLE "v_metadatachunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)', + 'rootpage': 8, + 'sql': 'CREATE TABLE "v_metadatachunks00"(rowid INTEGER PRIMARY KEY, data BLOB NOT NULL)', }), OrderedDict({ 'type': 'table', 'name': 'v_metadatatext00', 'tbl_name': 'v_metadatatext00', - 'rootpage': 11, + 'rootpage': 9, 'sql': 'CREATE 
TABLE "v_metadatatext00"(rowid PRIMARY KEY, data TEXT)', }), OrderedDict({ @@ -115,7 +101,7 @@ 'name': 'v_vector_chunks00', 'tbl_name': 'v_vector_chunks00', 'rootpage': 7, - 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)', + 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid INTEGER PRIMARY KEY,vectors BLOB NOT NULL)', }), ]), }) diff --git a/tests/__snapshots__/test-metadata.ambr b/tests/__snapshots__/test-metadata.ambr index 12212ff0..984f1e44 100644 --- a/tests/__snapshots__/test-metadata.ambr +++ b/tests/__snapshots__/test-metadata.ambr @@ -28,7 +28,7 @@ 'chunk_id': 1, 'size': 8, 'validity': b'\x02', - 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -89,7 +89,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00""""\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -264,7 +264,7 @@ 'chunk_id': 1, 'size': 8, 'validity': b'\x06', - 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'rowids': 
b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -335,7 +335,7 @@ 'rows': list([ OrderedDict({ 'rowid': 1, - 'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + 'vectors': b'\x00\x00\x00\x00""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', }), ]), }), @@ -381,6 +381,256 @@ 'message': 'Could not extract metadata value for column t at rowid 1', }) # --- +# name: test_glob[contains *o*] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob '*o*'", + 'rows': list([ + OrderedDict({ + 'rowid': 8, + 'name': 'yet_another_string', + 'distance': 0.12000000476837158, + }), + OrderedDict({ + 'rowid': 7, + 'name': 'this_is_another_long_one', + 'distance': 0.23000001907348633, + }), + OrderedDict({ + 'rowid': 6, + 'name': 'this_is_a_very_long_string_name', + 'distance': 0.3399999737739563, + }), + OrderedDict({ + 'rowid': 5, + 'name': 'carol', + 'distance': 0.44999998807907104, + }), + OrderedDict({ + 'rowid': 4, + 'name': 'bobby', + 'distance': 0.5600000023841858, + }), + OrderedDict({ + 'rowid': 3, + 'name': 'bob', + 'distance': 0.6699999570846558, + }), + ]), + }) +# --- +# name: test_glob[error: GLOB on integer column] + dict({ + 'error': 'OperationalError', + 'message': 'GLOB operator is only allowed on TEXT metadata columns.', + }) +# --- +# name: test_glob[match all *] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob '*'", + 'rows': list([ + OrderedDict({ + 'rowid': 9, + 'name': 'zebra', + 'distance': 0.009999990463256836, + }), + OrderedDict({ + 'rowid': 8, + 'name': 'yet_another_string', + 'distance': 0.12000000476837158, 
+ }), + OrderedDict({ + 'rowid': 7, + 'name': 'this_is_another_long_one', + 'distance': 0.23000001907348633, + }), + OrderedDict({ + 'rowid': 6, + 'name': 'this_is_a_very_long_string_name', + 'distance': 0.3399999737739563, + }), + OrderedDict({ + 'rowid': 5, + 'name': 'carol', + 'distance': 0.44999998807907104, + }), + OrderedDict({ + 'rowid': 4, + 'name': 'bobby', + 'distance': 0.5600000023841858, + }), + OrderedDict({ + 'rowid': 3, + 'name': 'bob', + 'distance': 0.6699999570846558, + }), + OrderedDict({ + 'rowid': 2, + 'name': 'alex', + 'distance': 0.7799999713897705, + }), + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + 'distance': 0.8899999856948853, + }), + ]), + }) +# --- +# name: test_glob[no matches nomatch*] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob 'nomatch*'", + 'rows': list([ + ]), + }) +# --- +# name: test_glob[prefix a*] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name glob 'a*'", + 'rows': list([ + OrderedDict({ + 'rowid': 2, + 'name': 'alex', + 'distance': 0.7799999713897705, + }), + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + 'distance': 0.8899999856948853, + }), + ]), + }) +# --- +# name: test_glob[prefix bob*] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name glob 'bob*'", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'name': 'bobby', + 'distance': 0.5600000023841858, + }), + OrderedDict({ + 'rowid': 3, + 'name': 'bob', + 'distance': 0.6699999570846558, + }), + ]), + }) +# --- +# name: test_glob[prefix this_* with long strings] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name glob 'this_*'", + 'rows': list([ + OrderedDict({ + 'rowid': 7, + 'name': 'this_is_another_long_one', + 'distance': 0.23000001907348633, + }), + OrderedDict({ + 'rowid': 6, + 'name': 'this_is_a_very_long_string_name', + 
'distance': 0.3399999737739563, + }), + ]), + }) +# --- +# name: test_glob[suffix *ice] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob '*ice'", + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + 'distance': 0.8899999856948853, + }), + ]), + }) +# --- +# name: test_glob[wildcard pattern a?e?] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob 'a?e?'", + 'rows': list([ + OrderedDict({ + 'rowid': 2, + 'name': 'alex', + 'distance': 0.7799999713897705, + }), + ]), + }) +# --- +# name: test_glob_boundary_conditions[boundary: case sensitive at 12 bytes] + dict({ + 'error': 'OperationalError', + 'message': 'Could not filter metadata fields', + }) +# --- +# name: test_glob_boundary_conditions[boundary: prefix pattern at boundary] + dict({ + 'error': 'OperationalError', + 'message': 'Could not filter metadata fields', + }) +# --- +# name: test_glob_case_sensitive[complex pattern case sensitive] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name glob '*APPLE*'", + 'rows': list([ + ]), + }) +# --- +# name: test_glob_case_sensitive[exact case match Apple*] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'Apple*'", + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'name': 'Apple', + }), + ]), + }) +# --- +# name: test_glob_case_sensitive[exact case match Cherry*] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'Cherry*'", + 'rows': list([ + OrderedDict({ + 'rowid': 3, + 'name': 'Cherry', + }), + ]), + }) +# --- +# name: test_glob_case_sensitive[lowercase pattern matches long lowercase data] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'elderberry*'", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'name': 
'elderberry_is_very_long_string', + }), + ]), + }) +# --- +# name: test_glob_case_sensitive[lowercase pattern should not match uppercase data] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'apple*'", + 'rows': list([ + ]), + }) +# --- +# name: test_glob_case_sensitive[uppercase pattern should not match long lowercase data] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'ELDERBERRY*'", + 'rows': list([ + ]), + }) +# --- +# name: test_glob_case_sensitive[uppercase pattern should not match mixed case] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'CHERRY*'", + 'rows': list([ + ]), + }) +# --- # name: test_idxstr OrderedDict({ 'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and is_favorited = true", @@ -549,95 +799,685 @@ ]), }) # --- -# name: test_idxstr[knn-constraint-text >=] +# name: test_idxstr[knn-constraint-text >=] + OrderedDict({ + 'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre >= NULL", + 'plan': list([ + dict({ + 'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Be_', + 'id': 2, + 'parent': 0, + }), + ]), + }) +# --- +# name: test_idxstr[knn-constraint-text >] + OrderedDict({ + 'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre > NULL", + 'plan': list([ + dict({ + 'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Bb_', + 'id': 2, + 'parent': 0, + }), + ]), + }) +# --- +# name: test_is_boolean_metadata[IS NOT NULL boolean] + OrderedDict({ + 'sql': "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is not null", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'is_hidden': 0, + }), + OrderedDict({ + 'rowid': 4, + 'is_hidden': 1, + }), + OrderedDict({ + 'rowid': 3, + 'is_hidden': 0, + }), + OrderedDict({ + 'rowid': 2, + 'is_hidden': 1, + }), + 
OrderedDict({ + 'rowid': 1, + 'is_hidden': 0, + }), + ]), + }) +# --- +# name: test_is_boolean_metadata[IS NULL boolean] + OrderedDict({ + 'sql': "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is null", + 'rows': list([ + ]), + }) +# --- +# name: test_is_boolean_metadata[is_hidden IS NOT false] + OrderedDict({ + 'sql': "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is not 0", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'is_hidden': 1, + }), + OrderedDict({ + 'rowid': 2, + 'is_hidden': 1, + }), + ]), + }) +# --- +# name: test_is_boolean_metadata[is_hidden IS false] + OrderedDict({ + 'sql': "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is 0", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'is_hidden': 0, + }), + OrderedDict({ + 'rowid': 3, + 'is_hidden': 0, + }), + OrderedDict({ + 'rowid': 1, + 'is_hidden': 0, + }), + ]), + }) +# --- +# name: test_is_boolean_metadata[is_hidden IS true] + OrderedDict({ + 'sql': "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is 1", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'is_hidden': 1, + }), + OrderedDict({ + 'rowid': 2, + 'is_hidden': 1, + }), + ]), + }) +# --- +# name: test_is_float_metadata[IS 2.5] + OrderedDict({ + 'sql': "select rowid, score from v where vector match '[1]' and k = 5 and score is 2.5", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'score': 2.5, + }), + OrderedDict({ + 'rowid': 2, + 'score': 2.5, + }), + ]), + }) +# --- +# name: test_is_float_metadata[IS NOT 2.5] + OrderedDict({ + 'sql': "select rowid, score from v where vector match '[1]' and k = 5 and score is not 2.5", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'score': 4.5, + }), + OrderedDict({ + 'rowid': 3, + 'score': 3.5, + }), + OrderedDict({ + 'rowid': 1, + 'score': 1.5, + }), + ]), + }) +# --- +# name: test_is_float_metadata[IS NOT NULL float] + OrderedDict({ + 'sql': "select rowid, score from v 
where vector match '[1]' and k = 5 and score is not null", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'score': 4.5, + }), + OrderedDict({ + 'rowid': 4, + 'score': 2.5, + }), + OrderedDict({ + 'rowid': 3, + 'score': 3.5, + }), + OrderedDict({ + 'rowid': 2, + 'score': 2.5, + }), + OrderedDict({ + 'rowid': 1, + 'score': 1.5, + }), + ]), + }) +# --- +# name: test_is_float_metadata[IS NULL float] + OrderedDict({ + 'sql': "select rowid, score from v where vector match '[1]' and k = 5 and score is null", + 'rows': list([ + ]), + }) +# --- +# name: test_is_integer_metadata[IS 20] + OrderedDict({ + 'sql': "select rowid, age from v where vector match '[1]' and k = 5 and age is 20", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'age': 20, + }), + OrderedDict({ + 'rowid': 2, + 'age': 20, + }), + ]), + }) +# --- +# name: test_is_integer_metadata[IS NOT 20] + OrderedDict({ + 'sql': "select rowid, age from v where vector match '[1]' and k = 5 and age is not 20", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'age': 40, + }), + OrderedDict({ + 'rowid': 3, + 'age': 30, + }), + OrderedDict({ + 'rowid': 1, + 'age': 10, + }), + ]), + }) +# --- +# name: test_is_integer_metadata[IS NOT NULL] + OrderedDict({ + 'sql': "select rowid, age from v where vector match '[1]' and k = 5 and age is not null", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'age': 40, + }), + OrderedDict({ + 'rowid': 4, + 'age': 20, + }), + OrderedDict({ + 'rowid': 3, + 'age': 30, + }), + OrderedDict({ + 'rowid': 2, + 'age': 20, + }), + OrderedDict({ + 'rowid': 1, + 'age': 10, + }), + ]), + }) +# --- +# name: test_is_integer_metadata[IS NULL] + OrderedDict({ + 'sql': "select rowid, age from v where vector match '[1]' and k = 5 and age is null", + 'rows': list([ + ]), + }) +# --- +# name: test_is_text_metadata[IS NOT NULL text] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name is not null", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'name': 'david', + 
}), + OrderedDict({ + 'rowid': 4, + 'name': 'bob', + }), + OrderedDict({ + 'rowid': 3, + 'name': 'carol', + }), + OrderedDict({ + 'rowid': 2, + 'name': 'bob', + }), + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + }), + ]), + }) +# --- +# name: test_is_text_metadata[IS NOT bob] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name is not 'bob'", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'name': 'david', + }), + OrderedDict({ + 'rowid': 3, + 'name': 'carol', + }), + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + }), + ]), + }) +# --- +# name: test_is_text_metadata[IS NULL text] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name is null", + 'rows': list([ + ]), + }) +# --- +# name: test_is_text_metadata[IS bob] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name is 'bob'", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'name': 'bob', + }), + OrderedDict({ + 'rowid': 2, + 'name': 'bob', + }), + ]), + }) +# --- +# name: test_is_with_long_text[IS NOT long string] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name is not 'this_is_a_very_long_string_name'", + 'rows': list([ + OrderedDict({ + 'rowid': 5, + 'name': 'yet_another_long_one', + }), + OrderedDict({ + 'rowid': 3, + 'name': 'short', + }), + OrderedDict({ + 'rowid': 2, + 'name': 'another_long_string', + }), + ]), + }) +# --- +# name: test_is_with_long_text[IS long string] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name is 'this_is_a_very_long_string_name'", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'name': 'this_is_a_very_long_string_name', + }), + OrderedDict({ + 'rowid': 1, + 'name': 'this_is_a_very_long_string_name', + }), + ]), + }) +# --- +# name: test_knn.1 + OrderedDict({ + 'sql': "select *, distance from v where vector match '[5]' and k = 3 and name like 'a%'", + 
'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': b'\x00\x00\x80?', + 'name': 'alex', + 'distance': 4.0, + }), + ]), + }) +# --- +# name: test_knn[sqlite_master] + OrderedDict({ + 'sql': "select * from sqlite_master where type = 'table' order by name", + 'rows': list([ + OrderedDict({ + 'type': 'table', + 'name': 'sqlite_sequence', + 'tbl_name': 'sqlite_sequence', + 'rootpage': 5, + 'sql': 'CREATE TABLE sqlite_sequence(name,seq)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v', + 'tbl_name': 'v', + 'rootpage': 0, + 'sql': 'CREATE VIRTUAL TABLE v using vec0(vector float[1], name text, chunk_size=8)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_chunks', + 'tbl_name': 'v_chunks', + 'rootpage': 4, + 'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_info', + 'tbl_name': 'v_info', + 'rootpage': 2, + 'sql': 'CREATE TABLE "v_info" (key text primary key, value any)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_metadatachunks00', + 'tbl_name': 'v_metadatachunks00', + 'rootpage': 8, + 'sql': 'CREATE TABLE "v_metadatachunks00"(rowid INTEGER PRIMARY KEY, data BLOB NOT NULL)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_metadatatext00', + 'tbl_name': 'v_metadatatext00', + 'rootpage': 9, + 'sql': 'CREATE TABLE "v_metadatatext00"(rowid PRIMARY KEY, data TEXT)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_rowids', + 'tbl_name': 'v_rowids', + 'rootpage': 6, + 'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)', + }), + OrderedDict({ + 'type': 'table', + 'name': 'v_vector_chunks00', + 'tbl_name': 'v_vector_chunks00', + 'rootpage': 7, + 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid INTEGER PRIMARY KEY,vectors BLOB NOT NULL)', + }), + ]), + }) +# --- +# name: test_like[contains %o%] + OrderedDict({ + 'sql': "select rowid, 
name, distance from v where vector match '[1]' and k = 9 and name like '%o%'", + 'rows': list([ + OrderedDict({ + 'rowid': 8, + 'name': 'yet_another_string', + 'distance': 0.12000000476837158, + }), + OrderedDict({ + 'rowid': 7, + 'name': 'this_is_another_long_one', + 'distance': 0.23000001907348633, + }), + OrderedDict({ + 'rowid': 6, + 'name': 'this_is_a_very_long_string_name', + 'distance': 0.3399999737739563, + }), + OrderedDict({ + 'rowid': 5, + 'name': 'carol', + 'distance': 0.44999998807907104, + }), + OrderedDict({ + 'rowid': 4, + 'name': 'bobby', + 'distance': 0.5600000023841858, + }), + OrderedDict({ + 'rowid': 3, + 'name': 'bob', + 'distance': 0.6699999570846558, + }), + ]), + }) +# --- +# name: test_like[error: LIKE on integer column] + dict({ + 'error': 'OperationalError', + 'message': 'LIKE operator is only allowed on TEXT metadata columns.', + }) +# --- +# name: test_like[match all %] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like '%'", + 'rows': list([ + OrderedDict({ + 'rowid': 9, + 'name': 'zebra', + 'distance': 0.009999990463256836, + }), + OrderedDict({ + 'rowid': 8, + 'name': 'yet_another_string', + 'distance': 0.12000000476837158, + }), + OrderedDict({ + 'rowid': 7, + 'name': 'this_is_another_long_one', + 'distance': 0.23000001907348633, + }), + OrderedDict({ + 'rowid': 6, + 'name': 'this_is_a_very_long_string_name', + 'distance': 0.3399999737739563, + }), + OrderedDict({ + 'rowid': 5, + 'name': 'carol', + 'distance': 0.44999998807907104, + }), + OrderedDict({ + 'rowid': 4, + 'name': 'bobby', + 'distance': 0.5600000023841858, + }), + OrderedDict({ + 'rowid': 3, + 'name': 'bob', + 'distance': 0.6699999570846558, + }), + OrderedDict({ + 'rowid': 2, + 'name': 'alex', + 'distance': 0.7799999713897705, + }), + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + 'distance': 0.8899999856948853, + }), + ]), + }) +# --- +# name: test_like[no matches nomatch%] + OrderedDict({ + 'sql': "select 
rowid, name, distance from v where vector match '[1]' and k = 9 and name like 'nomatch%'", + 'rows': list([ + ]), + }) +# --- +# name: test_like[prefix a%] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name like 'a%'", + 'rows': list([ + OrderedDict({ + 'rowid': 2, + 'name': 'alex', + 'distance': 0.7799999713897705, + }), + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + 'distance': 0.8899999856948853, + }), + ]), + }) +# --- +# name: test_like[prefix bob%] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name like 'bob%'", + 'rows': list([ + OrderedDict({ + 'rowid': 4, + 'name': 'bobby', + 'distance': 0.5600000023841858, + }), + OrderedDict({ + 'rowid': 3, + 'name': 'bob', + 'distance': 0.6699999570846558, + }), + ]), + }) +# --- +# name: test_like[prefix this_% with long strings] + OrderedDict({ + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name like 'this_%'", + 'rows': list([ + OrderedDict({ + 'rowid': 7, + 'name': 'this_is_another_long_one', + 'distance': 0.23000001907348633, + }), + OrderedDict({ + 'rowid': 6, + 'name': 'this_is_a_very_long_string_name', + 'distance': 0.3399999737739563, + }), + ]), + }) +# --- +# name: test_like[suffix %ice] OrderedDict({ - 'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre >= NULL", - 'plan': list([ - dict({ - 'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Be_', - 'id': 2, - 'parent': 0, + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like '%ice'", + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'name': 'alice', + 'distance': 0.8899999856948853, }), ]), }) # --- -# name: test_idxstr[knn-constraint-text >] +# name: test_like[wildcard pattern a_e_] OrderedDict({ - 'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre > NULL", - 'plan': list([ - dict({ - 
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Bb_', - 'id': 2, - 'parent': 0, + 'sql': "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like 'a_e_'", + 'rows': list([ + OrderedDict({ + 'rowid': 2, + 'name': 'alex', + 'distance': 0.7799999713897705, }), ]), }) # --- -# name: test_knn.1 +# name: test_like_boundary_conditions[12-byte boundary: exact match] dict({ 'error': 'OperationalError', - 'message': 'An illegal WHERE constraint was provided on a vec0 metadata column in a KNN query. Only one of EQUALS, GREATER_THAN, LESS_THAN_OR_EQUAL, LESS_THAN, GREATER_THAN_OR_EQUAL, NOT_EQUALS is allowed.', + 'message': 'Could not filter metadata fields', }) # --- -# name: test_knn[sqlite_master] +# name: test_like_boundary_conditions[12-byte boundary: prefix matches both 12 and 13 byte strings] + dict({ + 'error': 'OperationalError', + 'message': 'Could not filter metadata fields', + }) +# --- +# name: test_like_boundary_conditions[13-byte boundary: 12-byte pattern] + dict({ + 'error': 'OperationalError', + 'message': 'Could not filter metadata fields', + }) +# --- +# name: test_like_boundary_conditions[boundary: case insensitive at 12 bytes] + dict({ + 'error': 'OperationalError', + 'message': 'Could not filter metadata fields', + }) +# --- +# name: test_like_boundary_conditions[boundary: short pattern on mixed length strings] + dict({ + 'error': 'OperationalError', + 'message': 'Could not filter metadata fields', + }) +# --- +# name: test_like_case_insensitive[complex pattern case insensitive] OrderedDict({ - 'sql': "select * from sqlite_master where type = 'table' order by name", + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name like '%APPLE%'", 'rows': list([ OrderedDict({ - 'type': 'table', - 'name': 'sqlite_sequence', - 'tbl_name': 'sqlite_sequence', - 'rootpage': 5, - 'sql': 'CREATE TABLE sqlite_sequence(name,seq)', - }), - OrderedDict({ - 'type': 'table', - 'name': 'v', - 'tbl_name': 'v', - 
'rootpage': 0, - 'sql': 'CREATE VIRTUAL TABLE v using vec0(vector float[1], name text, chunk_size=8)', - }), - OrderedDict({ - 'type': 'table', - 'name': 'v_chunks', - 'tbl_name': 'v_chunks', - 'rootpage': 4, - 'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)', - }), - OrderedDict({ - 'type': 'table', - 'name': 'v_info', - 'tbl_name': 'v_info', - 'rootpage': 2, - 'sql': 'CREATE TABLE "v_info" (key text primary key, value any)', + 'rowid': 1, + 'name': 'Apple', }), + ]), + }) +# --- +# name: test_like_case_insensitive[lowercase pattern matches uppercase data] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name like 'apple%'", + 'rows': list([ OrderedDict({ - 'type': 'table', - 'name': 'v_metadatachunks00', - 'tbl_name': 'v_metadatachunks00', - 'rootpage': 9, - 'sql': 'CREATE TABLE "v_metadatachunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)', + 'rowid': 1, + 'name': 'Apple', }), + ]), + }) +# --- +# name: test_like_case_insensitive[mixed case pattern matches uppercase data] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name like 'DuRiAn%'", + 'rows': list([ OrderedDict({ - 'type': 'table', - 'name': 'v_metadatatext00', - 'tbl_name': 'v_metadatatext00', - 'rootpage': 11, - 'sql': 'CREATE TABLE "v_metadatatext00"(rowid PRIMARY KEY, data TEXT)', + 'rowid': 4, + 'name': 'DURIAN_IS_LONG', }), + ]), + }) +# --- +# name: test_like_case_insensitive[uppercase pattern matches long lowercase data] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name like 'ELDERBERRY%'", + 'rows': list([ OrderedDict({ - 'type': 'table', - 'name': 'v_rowids', - 'tbl_name': 'v_rowids', - 'rootpage': 6, - 'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)', + 'rowid': 5, + 'name': 'elderberry_is_very_long_string', 
}), + ]), + }) +# --- +# name: test_like_case_insensitive[uppercase pattern matches mixed case data] + OrderedDict({ + 'sql': "select rowid, name from v where vector match '[1]' and k = 5 and name like 'CHERRY%'", + 'rows': list([ OrderedDict({ - 'type': 'table', - 'name': 'v_vector_chunks00', - 'tbl_name': 'v_vector_chunks00', - 'rootpage': 7, - 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)', + 'rowid': 3, + 'name': 'Cherry', }), ]), }) @@ -1877,35 +2717,35 @@ 'type': 'table', 'name': 'v_metadatachunks00', 'tbl_name': 'v_metadatachunks00', - 'rootpage': 9, - 'sql': 'CREATE TABLE "v_metadatachunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)', + 'rootpage': 8, + 'sql': 'CREATE TABLE "v_metadatachunks00"(rowid INTEGER PRIMARY KEY, data BLOB NOT NULL)', }), OrderedDict({ 'type': 'table', 'name': 'v_metadatachunks01', 'tbl_name': 'v_metadatachunks01', - 'rootpage': 11, - 'sql': 'CREATE TABLE "v_metadatachunks01"(rowid PRIMARY KEY, data BLOB NOT NULL)', + 'rootpage': 9, + 'sql': 'CREATE TABLE "v_metadatachunks01"(rowid INTEGER PRIMARY KEY, data BLOB NOT NULL)', }), OrderedDict({ 'type': 'table', 'name': 'v_metadatachunks02', 'tbl_name': 'v_metadatachunks02', - 'rootpage': 13, - 'sql': 'CREATE TABLE "v_metadatachunks02"(rowid PRIMARY KEY, data BLOB NOT NULL)', + 'rootpage': 10, + 'sql': 'CREATE TABLE "v_metadatachunks02"(rowid INTEGER PRIMARY KEY, data BLOB NOT NULL)', }), OrderedDict({ 'type': 'table', 'name': 'v_metadatachunks03', 'tbl_name': 'v_metadatachunks03', - 'rootpage': 15, - 'sql': 'CREATE TABLE "v_metadatachunks03"(rowid PRIMARY KEY, data BLOB NOT NULL)', + 'rootpage': 11, + 'sql': 'CREATE TABLE "v_metadatachunks03"(rowid INTEGER PRIMARY KEY, data BLOB NOT NULL)', }), OrderedDict({ 'type': 'table', 'name': 'v_metadatatext03', 'tbl_name': 'v_metadatatext03', - 'rootpage': 17, + 'rootpage': 12, 'sql': 'CREATE TABLE "v_metadatatext03"(rowid PRIMARY KEY, data TEXT)', }), OrderedDict({ @@ -1920,11 +2760,155 @@ 'name': 
'v_vector_chunks00', 'tbl_name': 'v_vector_chunks00', 'rootpage': 7, - 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)', + 'sql': 'CREATE TABLE "v_vector_chunks00"(rowid INTEGER PRIMARY KEY,vectors BLOB NOT NULL)', + }), + ]), + }) +# --- +# name: test_renames + OrderedDict({ + 'sql': 'insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)', + 'rows': list([ + ]), + }) +# --- +# name: test_renames.1 + OrderedDict({ + 'sql': 'insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)', + 'rows': list([ + ]), + }) +# --- +# name: test_renames.2 + OrderedDict({ + 'sql': 'insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)', + 'rows': list([ + ]), + }) +# --- +# name: test_renames.3 + OrderedDict({ + 'sql': 'select * from v', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vector': b'\x11\x11\x11\x11', + 'b': 1, + 'n': 1, + 'f': 1.1, + 't': 'test1', + }), + OrderedDict({ + 'rowid': 2, + 'vector': b'""""', + 'b': 1, + 'n': 2, + 'f': 2.2, + 't': 'test2', + }), + OrderedDict({ + 'rowid': 3, + 'vector': b'3333', + 'b': 1, + 'n': 3, + 'f': 3.3, + 't': '1234567890123', }), ]), }) # --- +# name: test_renames.4 + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'validity': b'\x07', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_metadatachunks00': OrderedDict({ + 'sql': 'select * from v_metadatachunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'data': b'\x07', + }), + ]), + }), + 'v_metadatachunks01': OrderedDict({ + 'sql': 'select * from v_metadatachunks01', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'data': 
b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_metadatachunks02': OrderedDict({ + 'sql': 'select * from v_metadatachunks02', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'data': b'\x9a\x99\x99\x99\x99\x99\xf1?\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_metadatachunks03': OrderedDict({ + 'sql': 'select * from v_metadatachunks03', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'data': b'\x05\x00\x00\x00test1\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_metadatatext03': OrderedDict({ + 'sql': 'select * from v_metadatatext03', + 'rows': list([ + OrderedDict({ + 'rowid': 3, + 'data': '1234567890123', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 1, + }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 2, + }), + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': 
b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- # name: test_stress dict({ 'vec_movies_auxiliary': OrderedDict({ @@ -2690,7 +3674,7 @@ # name: test_stress[bool-other-op] dict({ 'error': 'OperationalError', - 'message': 'ONLY EQUALS (=) or NOT_EQUALS (!=) operators are allowed on boolean metadata columns.', + 'message': 'ONLY EQUALS (=), NOT_EQUALS (!=), IS, IS NOT, IS NULL, or IS NOT NULL operators are allowed on boolean metadata columns.', }) # --- # name: test_text_knn diff --git a/tests/test-auxiliary.py b/tests/test-auxiliary.py index d1f5f568..dc475739 100644 --- a/tests/test-auxiliary.py +++ b/tests/test-auxiliary.py @@ -108,6 +108,24 @@ def test_deletes(db, snapshot): assert vec0_shadow_table_contents(db, "v") == snapshot() +def test_renames(db, snapshot): + db.execute( + "create virtual table v using vec0(vector float[1], +name text, chunk_size=8)" + ) + db.executemany( + "insert into v(vector, name) values (?, ?)", + [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")], + ) + assert exec(db, "select rowid, * from v") == snapshot() + assert vec0_shadow_table_contents(db, "v") == snapshot() + + res = exec(db, "select rowid, * from v") + db.execute( + "alter table v rename to v1" + ) + assert exec(db, "select rowid, * from v1")["rows"] == res["rows"] + + def test_knn(db, snapshot): db.execute("create virtual table v using vec0(vector float[1], +name text)") db.executemany( @@ -126,6 +144,26 @@ def test_knn(db, snapshot): ) == snapshot(name="illegal KNN w/ aux") +def test_vacuum(db, snapshot): + db.execute( + "create virtual table v using vec0(vector float[1], +name text)" + ) + db.executemany( + "insert into v(vector, name) values (?, ?)", + [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")], + ) + + exec(db, "delete from v where 1 = 1") + prev_page_count = exec(db, "pragma page_count")["rows"][0]["page_count"] + + db.execute("insert into v(v) values ('optimize')") 
+ db.commit() + db.execute("vacuum") + + cur_page_count = exec(db, "pragma page_count")["rows"][0]["page_count"] + assert cur_page_count < prev_page_count + + def exec(db, sql, parameters=[]): try: rows = db.execute(sql, parameters).fetchall() diff --git a/tests/test-delete-clears-bytes.py b/tests/test-delete-clears-bytes.py new file mode 100644 index 00000000..41ff37f9 --- /dev/null +++ b/tests/test-delete-clears-bytes.py @@ -0,0 +1,106 @@ +import os + + +def test_delete_clears_rowid_and_vectors(): + try: + import pysqlite3 as sqlite3 # uses bundled modern SQLite with extension loading + except ImportError: # fallback if not available + import sqlite3 + + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + if hasattr(db, "enable_load_extension"): + db.enable_load_extension(True) + ext = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "dist", "vec0")) + try: + # Explicit entrypoint to avoid relying on default name + db.load_extension(ext, "sqlite3_vec_init") + except Exception: + # Some loaders accept missing suffix path without explicit entrypoint + db.load_extension(ext) + + # One vector column with 1 dimension (4 bytes per vector), chunk_size=8 + db.execute("create virtual table v using vec0(vector float[1], chunk_size=8)") + + # Insert two rows with distinct raw vector bytes + db.execute( + "insert into v(rowid, vector) values (?, ?)", + [1, b"\x11\x11\x11\x11"], + ) + db.execute( + "insert into v(rowid, vector) values (?, ?)", + [2, b"\x22\x22\x22\x22"], + ) + + # Sanity check pre-delete: validity has first two bits set (0b00000011) + row = db.execute("select validity, rowids from v_chunks").fetchone() + assert row is not None + assert row[0] == b"\x03" + + # Delete rowid=1 + db.execute("delete from v where rowid = 1") + + # After delete, validity should only have bit 1 set (0b00000010) + row = db.execute("select validity, rowids from v_chunks").fetchone() + assert row[0] == b"\x02" + + # Rowids BLOB: first 8 bytes (slot 0) must be 
zero; second (slot 1) must be rowid=2 + rowids = row[1] + assert isinstance(rowids, (bytes, bytearray)) + assert len(rowids) == 8 * 8 # chunk_size * sizeof(i64) + assert rowids[0:8] == b"\x00" * 8 + assert rowids[8:16] == b"\x02\x00\x00\x00\x00\x00\x00\x00" + + # Vectors BLOB for the first (and only) vector column + vectors_row = db.execute("select vectors from v_vector_chunks00").fetchone() + vectors = vectors_row[0] + # chunk_size (8) * 4 bytes per float32 = 32 bytes + assert len(vectors) == 32 + # Slot 0 cleared to zeros, slot 1 left as inserted (0x22 0x22 0x22 0x22) + assert vectors[0:4] == b"\x00\x00\x00\x00" + assert vectors[4:8] == b"\x22\x22\x22\x22" + + +def test_vacuum_shrinks_file(tmp_path): + try: + import pysqlite3 as sqlite3 + except ImportError: + import sqlite3 + + db_path = tmp_path / "vacuum_vec.db" + + con = sqlite3.connect(str(db_path)) + con.row_factory = sqlite3.Row + if hasattr(con, "enable_load_extension"): + con.enable_load_extension(True) + ext = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "dist", "vec0")) + try: + con.load_extension(ext) + except Exception: + # Some platforms require the full filename or default entrypoint; fallback already tried + con.load_extension(ext) + + # Use a larger chunk_size to inflate file size more clearly + con.execute("create virtual table v using vec0(vector float[1], chunk_size=4096)") + + # Insert a decent number of rows to grow the DB + N = 10000 + con.executemany( + "insert into v(rowid, vector) values(?, ?)", + ((i, b"\x11\x11\x11\x11") for i in range(1, N + 1)), + ) + con.commit() + + size_after_insert = os.stat(db_path).st_size + assert size_after_insert > 0 + + # Drop the table to free its pages, then VACUUM to rewrite/shrink the file + con.execute("drop table v") + con.commit() + con.execute("VACUUM") + con.close() + + size_after_vacuum = os.stat(db_path).st_size + + # File should shrink after dropping the table and VACUUM + assert size_after_vacuum < size_after_insert diff --git 
a/tests/test-distance-constraints.py b/tests/test-distance-constraints.py new file mode 100644 index 00000000..e5f037c4 --- /dev/null +++ b/tests/test-distance-constraints.py @@ -0,0 +1,393 @@ +import sqlite3 +import struct +import pytest + + +def _int8(list): + """Helper to pack int8 vectors""" + return struct.pack("%sb" % len(list), *list) + + +def bitmap(bitstring): + """Helper to create bit vectors from binary string""" + return bytes([int(bitstring, 2)]) + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db + + +def test_distance_gt_basic(db): + """Test distance > X constraint for basic pagination""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[3])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [ + (1, "[1.0, 0.0, 0.0]"), + (2, "[2.0, 0.0, 0.0]"), + (3, "[3.0, 0.0, 0.0]"), + (4, "[4.0, 0.0, 0.0]"), + (5, "[5.0, 0.0, 0.0]"), + ], + ) + + # First page: k=2 + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0, 0.0, 0.0]' AND k = 2 ORDER BY distance" + ).fetchall() + assert len(result) == 2 + assert result[0]["rowid"] == 1 + assert result[1]["rowid"] == 2 + last_distance = result[1]["distance"] + + # Second page: distance > last_distance, k=2 + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0, 0.0, 0.0]' AND k = 2 AND distance > ? 
ORDER BY distance", + [last_distance], + ).fetchall() + assert len(result) == 2 + assert result[0]["rowid"] == 3 + assert result[1]["rowid"] == 4 + + +def test_distance_ge_basic(db): + """Test distance >= X constraint""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[2])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [ + (1, "[1.0, 0.0]"), + (2, "[2.0, 0.0]"), + (3, "[3.0, 0.0]"), + ], + ) + + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0, 0.0]' AND k = 10 AND distance >= 2.0 ORDER BY distance" + ).fetchall() + # Should get rowid 2 (distance=2.0) and rowid 3 (distance=3.0) + assert len(result) == 2 + assert result[0]["rowid"] == 2 + assert result[1]["rowid"] == 3 + + +def test_distance_lt_basic(db): + """Test distance < X constraint for range queries""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[2])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [ + (1, "[1.0, 0.0]"), + (2, "[2.0, 0.0]"), + (3, "[3.0, 0.0]"), + (4, "[4.0, 0.0]"), + ], + ) + + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0, 0.0]' AND k = 10 AND distance < 3.0 ORDER BY distance" + ).fetchall() + # Should get rowid 1 and 2 only + assert len(result) == 2 + assert result[0]["rowid"] == 1 + assert result[1]["rowid"] == 2 + + +def test_distance_le_basic(db): + """Test distance <= X constraint""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[2])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [ + (1, "[1.0, 0.0]"), + (2, "[2.0, 0.0]"), + (3, "[3.0, 0.0]"), + (4, "[4.0, 0.0]"), + ], + ) + + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0, 0.0]' AND k = 10 AND distance <= 2.0 ORDER BY distance" + ).fetchall() + # Should get rowid 1 and 2 + assert len(result) == 2 + assert result[0]["rowid"] == 1 + assert result[1]["rowid"] == 2 + + +def test_distance_range_query(db): + 
"""Test range query with both lower and upper bounds""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[1])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [(i, f"[{float(i)}]") for i in range(1, 11)], + ) + + # Get vectors with distance between 3.0 and 6.0 (inclusive on both ends) + result = db.execute( + """SELECT rowid, distance FROM v + WHERE embedding MATCH '[0.0]' + AND k = 20 + AND distance >= 3.0 + AND distance <= 6.0 + ORDER BY distance""" + ).fetchall() + + # Should get rowids 3, 4, 5, 6 (distances 3.0, 4.0, 5.0, 6.0) + assert len(result) == 4 + assert [r["rowid"] for r in result] == [3, 4, 5, 6] + + +def test_distance_with_partition_keys(db): + """Test distance constraints work with partition keys""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(category TEXT partition key, embedding float[2])") + db.executemany( + "INSERT INTO v(rowid, category, embedding) VALUES (?, ?, ?)", + [ + (1, "A", "[1.0, 0.0]"), + (2, "A", "[2.0, 0.0]"), + (3, "A", "[3.0, 0.0]"), + (4, "B", "[1.0, 0.0]"), + (5, "B", "[2.0, 0.0]"), + ], + ) + + # Query only category A with distance filter + result = db.execute( + """SELECT rowid, distance FROM v + WHERE embedding MATCH '[0.0, 0.0]' + AND category = 'A' + AND k = 10 + AND distance > 1.0 + ORDER BY distance""" + ).fetchall() + + # Should only get category A items with distance > 1.0 + assert len(result) == 2 + assert result[0]["rowid"] == 2 + assert result[1]["rowid"] == 3 + + +def test_distance_with_metadata(db): + """Test distance constraints work with metadata columns""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[2], label TEXT)") + db.executemany( + "INSERT INTO v(rowid, embedding, label) VALUES (?, ?, ?)", + [ + (1, "[1.0, 0.0]", "important"), + (2, "[2.0, 0.0]", "important"), + (3, "[3.0, 0.0]", "spam"), + (4, "[4.0, 0.0]", "important"), + ], + ) + + # Query with both metadata filter and distance constraint + result = db.execute( + """SELECT rowid, distance FROM 
v + WHERE embedding MATCH '[0.0, 0.0]' + AND label = 'important' + AND k = 10 + AND distance >= 2.0 + ORDER BY distance""" + ).fetchall() + + # Should get rowid 2 and 4 (both important, distance >= 2.0) + assert len(result) == 2 + assert result[0]["rowid"] == 2 + assert result[1]["rowid"] == 4 + + +def test_distance_empty_result(db): + """Test distance constraint that filters out all results""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[2])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [ + (1, "[1.0, 0.0]"), + (2, "[2.0, 0.0]"), + ], + ) + + # Distance constraint that excludes everything + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0, 0.0]' AND k = 10 AND distance > 100.0" + ).fetchall() + + assert len(result) == 0 + + +def test_distance_pagination_multi_page(db): + """Test multi-page pagination scenario""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[1])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [(i, f"[{float(i)}]") for i in range(1, 21)], # 20 vectors + ) + + page_size = 5 + all_results = [] + last_distance = None + + # Paginate through all results + for page in range(4): # 4 pages of 5 items each + if last_distance is None: + query = "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0]' AND k = ? ORDER BY distance" + result = db.execute(query, [page_size]).fetchall() + else: + query = "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0]' AND k = ? AND distance > ? 
ORDER BY distance" + result = db.execute(query, [page_size, last_distance]).fetchall() + + assert len(result) == page_size + all_results.extend(result) + last_distance = result[-1]["distance"] + + # Verify we got all 20 items in order + assert len(all_results) == 20 + assert [r["rowid"] for r in all_results] == list(range(1, 21)) + + +def test_distance_binary_vectors(db): + """Test distance constraints with binary vectors""" + # Use 32 bits = 4 bytes to satisfy alignment requirements + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding bit[32])") + # Use vec_bit() constructor to properly type the vectors + db.execute("INSERT INTO v(rowid, embedding) VALUES (1, vec_bit(?))", [b"\x00\x00\x00\x00"]) + db.execute("INSERT INTO v(rowid, embedding) VALUES (2, vec_bit(?))", [b"\x01\x00\x00\x00"]) + db.execute("INSERT INTO v(rowid, embedding) VALUES (3, vec_bit(?))", [b"\x03\x00\x00\x00"]) + db.execute("INSERT INTO v(rowid, embedding) VALUES (4, vec_bit(?))", [b"\x0F\x00\x00\x00"]) + + # Use vec_bit() directly in the MATCH clause to preserve type + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH vec_bit(?) 
AND k = 10 AND distance > 0.0 ORDER BY distance", + [b"\x00\x00\x00\x00"], + ).fetchall() + + # Should exclude exact match (rowid 1, distance 0.0) + assert len(result) == 3 + assert result[0]["rowid"] == 2 + + +def test_distance_int8_vectors(db): + """Test distance constraints with int8 vectors""" + # Use 4 elements to match 4-byte alignment + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding int8[4])") + # Use vec_int8() constructor to properly type the vectors + db.execute("INSERT INTO v(rowid, embedding) VALUES (1, vec_int8(?))", [_int8([1, 0, 0, 0])]) + db.execute("INSERT INTO v(rowid, embedding) VALUES (2, vec_int8(?))", [_int8([2, 0, 0, 0])]) + db.execute("INSERT INTO v(rowid, embedding) VALUES (3, vec_int8(?))", [_int8([3, 0, 0, 0])]) + db.execute("INSERT INTO v(rowid, embedding) VALUES (4, vec_int8(?))", [_int8([4, 0, 0, 0])]) + + # Use vec_int8() directly in the MATCH clause to preserve type + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH vec_int8(?) 
AND k = 10 AND distance <= 2.0 ORDER BY distance", + [_int8([0, 0, 0, 0])], + ).fetchall() + + # Distances will be 1.0, 2.0, 3.0, 4.0 - filter to <= 2.0 + assert len(result) == 2 + assert result[0]["rowid"] == 1 + assert result[1]["rowid"] == 2 + + +def test_distance_equal_distances_caveat(db): + """Test behavior with equal distances (documents the limitation)""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[2])") + + # Create vectors with same distance from query point + # All at distance 1.0 from [0, 0] + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [ + (1, "[1.0, 0.0]"), # distance 1.0 + (2, "[0.0, 1.0]"), # distance 1.0 + (3, "[-1.0, 0.0]"), # distance 1.0 + (4, "[0.0, -1.0]"), # distance 1.0 + (5, "[2.0, 0.0]"), # distance 2.0 + ], + ) + + # Query with distance > 1.0 may miss some vectors with distance == 1.0 + # This documents the expected behavior + result = db.execute( + "SELECT rowid, distance FROM v WHERE embedding MATCH '[0.0, 0.0]' AND k = 10 AND distance > 1.0 ORDER BY distance" + ).fetchall() + + # Should only get rowid 5 (distance 2.0) + assert len(result) == 1 + assert result[0]["rowid"] == 5 + + +def test_distance_with_auxiliary_columns(db): + """Test distance constraints work with auxiliary columns""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[2], +metadata TEXT)") + db.executemany( + "INSERT INTO v(rowid, embedding, metadata) VALUES (?, ?, ?)", + [ + (1, "[1.0, 0.0]", "doc1"), + (2, "[2.0, 0.0]", "doc2"), + (3, "[3.0, 0.0]", "doc3"), + ], + ) + + result = db.execute( + """SELECT rowid, distance, metadata FROM v + WHERE embedding MATCH '[0.0, 0.0]' + AND k = 10 + AND distance >= 2.0 + ORDER BY distance""" + ).fetchall() + + assert len(result) == 2 + assert result[0]["rowid"] == 2 + assert result[0]["metadata"] == "doc2" + assert result[1]["rowid"] == 3 + assert result[1]["metadata"] == "doc3" + + +def test_distance_precision_boundary(db): + """Test distance constraints with precise 
boundary values""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[1])") + + # Insert vectors with very precise distances + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [ + (1, "[0.1]"), + (2, "[0.2]"), + (3, "[0.3]"), + ], + ) + + # Test exact boundary + result = db.execute( + "SELECT rowid FROM v WHERE embedding MATCH '[0.0]' AND k = 10 AND distance >= 0.2 ORDER BY distance" + ).fetchall() + + # Should include 0.2 and 0.3 + assert len(result) == 2 + assert result[0]["rowid"] == 2 + + +def test_distance_k_interaction(db): + """Test that distance filter is applied during KNN search, k limits final results""" + db.execute("CREATE VIRTUAL TABLE v USING vec0(embedding float[1])") + db.executemany( + "INSERT INTO v(rowid, embedding) VALUES (?, ?)", + [(i, f"[{float(i)}]") for i in range(1, 11)], + ) + + # Distance filter is applied during search, k limits how many results we get back + result = db.execute( + "SELECT rowid FROM v WHERE embedding MATCH '[0.0]' AND k = 5 AND distance > 2.0 ORDER BY distance" + ).fetchall() + + # Distance > 2.0 filters to: 3,4,5,6,7,8,9,10 + # k=5 limits to first 5: 3,4,5,6,7 + assert len(result) == 5 + assert [r["rowid"] for r in result] == [3, 4, 5, 6, 7] diff --git a/tests/test-loadable.py b/tests/test-loadable.py index a8058c9e..0d518b82 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -423,6 +423,51 @@ def check(a, b, dtype=np.float32): check([1, 2, 3], [-9, -8, -7], dtype=np.int8) assert vec_distance_cosine("[1.1, 1.0]", "[1.2, 1.2]") == 0.001131898257881403 + vec_distance_cosine_bit = lambda *args: db.execute( + "select vec_distance_cosine(vec_bit(?), vec_bit(?))", args + ).fetchone()[0] + assert isclose( + vec_distance_cosine_bit(b"\xff", b"\x01"), + npy_cosine([1,1,1,1,1,1,1,1], [0,0,0,0,0,0,0,1]), + abs_tol=1e-6 + ) + assert isclose( + vec_distance_cosine_bit(b"\xab", b"\xab"), + npy_cosine([1,0,1,0,1,0,1,1], [1,0,1,0,1,0,1,1]), + abs_tol=1e-6 + ) + # test 64-bit + assert 
isclose( + vec_distance_cosine_bit(b"\xaa" * 8, b"\xff" * 8), + npy_cosine([1,0] * 32, [1] * 64), + abs_tol=1e-6 + ) + +def test_ensure_vector_match_cleanup_on_second_vector_error(): + """ + Test that ensure_vector_match properly cleans up the first vector + when the second vector fails to parse. + + This tests the fix for a bug where aCleanup(a) was called instead of + aCleanup(*a), passing the wrong pointer to the cleanup function. + + The bug only manifests when the first vector is parsed from JSON/TEXT + (which uses sqlite3_free as cleanup) rather than BLOB (which uses noop). + """ + # Valid first vector as JSON text - this causes memory allocation + # and sets cleanup to sqlite3_free + valid_vector_json = "[1.0, 2.0, 3.0, 4.0]" + + # Invalid second vector: 5 bytes, not divisible by 4 (sizeof float32) + # This will fail in fvec_from_value with "invalid float32 vector BLOB length" + invalid_vector = b"\x01\x02\x03\x04\x05" + + with pytest.raises(sqlite3.OperationalError, match=r"^Error reading 2nd vector: invalid float32 vector BLOB length\. Must be divisible by 4, found 5$"): + db.execute( + "select vec_distance_cosine(?, ?)", + [valid_vector_json, invalid_vector] + ).fetchone() + def test_vec_distance_hamming(): vec_distance_hamming = lambda *args: db.execute( @@ -951,6 +996,54 @@ def test_vec0_inserts(): db.execute("insert into txt_pk(txt_id, aaa) values ('b', '[2,2,2,2]')") +def test_vec0_locale_independent(): + """Test that JSON float parsing is locale-independent (issue #241)""" + import locale + + db = connect(EXT_PATH) + db.execute("create virtual table v using vec0(embedding float[3])") + + # Test with C locale first (baseline) + db.execute("insert into v(rowid, embedding) values (1, '[0.1, 0.2, 0.3]')") + + # Try to set a non-C locale that uses comma as decimal separator + # Common locales: fr_FR, de_DE, it_IT, es_ES, pt_BR, etc. 
+ test_locales = ['fr_FR.UTF-8', 'de_DE.UTF-8', 'it_IT.UTF-8', 'C.UTF-8'] + locale_set = False + original_locale = locale.setlocale(locale.LC_NUMERIC) + + for test_locale in test_locales: + try: + locale.setlocale(locale.LC_NUMERIC, test_locale) + locale_set = True + break + except locale.Error: + continue + + try: + # Even with non-C locale, JSON parsing should work (using dot as decimal separator) + # Before the fix, this would fail in French/German/etc locales + db.execute("insert into v(rowid, embedding) values (2, '[0.4, 0.5, 0.6]')") + + # Verify the data was inserted correctly + result = db.execute("select embedding from v where rowid = 2").fetchone() + expected = _f32([0.4, 0.5, 0.6]) + assert result[0] == expected, f"Expected {expected}, got {result[0]}" + + # Also verify with different decimal values + db.execute("insert into v(rowid, embedding) values (3, '[1.23, 4.56, 7.89]')") + result = db.execute("select embedding from v where rowid = 3").fetchone() + expected = _f32([1.23, 4.56, 7.89]) + assert result[0] == expected, f"Expected {expected}, got {result[0]}" + + finally: + # Restore original locale + locale.setlocale(locale.LC_NUMERIC, original_locale) + + # If we couldn't set a non-C locale, the test still passes (baseline check) + # but we didn't really test the locale-independence + + def test_vec0_insert_errors2(): db = connect(EXT_PATH) db.execute("create virtual table t1 using vec0(aaa float[4], chunk_size=8)") diff --git a/tests/test-metadata.py b/tests/test-metadata.py index 3c2e5423..eb64128b 100644 --- a/tests/test-metadata.py +++ b/tests/test-metadata.py @@ -264,6 +264,29 @@ def test_deletes(db, snapshot): assert vec0_shadow_table_contents(db, "v") == snapshot() +def test_renames(db, snapshot): + db.execute( + "create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)" + ) + INSERT = "insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)" + + assert exec(db, INSERT, [1, 
b"\x11\x11\x11\x11", 1, 1, 1.1, "test1"]) == snapshot() + assert exec(db, INSERT, [2, b"\x22\x22\x22\x22", 1, 2, 2.2, "test2"]) == snapshot() + assert ( + exec(db, INSERT, [3, b"\x33\x33\x33\x33", 1, 3, 3.3, "1234567890123"]) + == snapshot() + ) + + assert exec(db, "select * from v") == snapshot() + assert vec0_shadow_table_contents(db, "v") == snapshot() + + result = exec(db, "select * from v") + db.execute( + "alter table v rename to v1" + ) + assert exec(db, "select * from v1")["rows"] == result["rows"] + + def test_knn(db, snapshot): db.execute( "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" @@ -276,16 +299,795 @@ def test_knn(db, snapshot): [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")], ) - # EVIDENCE-OF: V16511_00582 catches "illegal" constraints on metadata columns + # LIKE is now supported on text metadata columns assert ( exec( db, - "select *, distance from v where vector match '[5]' and k = 3 and name like 'illegal'", + "select *, distance from v where vector match '[5]' and k = 3 and name like 'a%'", ) == snapshot() ) +def test_like(db, snapshot): + """Test LIKE operator on text metadata columns with various patterns""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data with both short (≤12 bytes) and long (>12 bytes) strings + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'alice'), + ('[.22]', 'alex'), + ('[.33]', 'bob'), + ('[.44]', 'bobby'), + ('[.55]', 'carol'), + ('[.66]', 'this_is_a_very_long_string_name'), + ('[.77]', 'this_is_another_long_one'), + ('[.88]', 'yet_another_string'), + ('[.99]', 'zebra'); + """ + ) + + # Test prefix-only patterns (fast path) + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 5 and name like 'a%'", + ) + == snapshot(name="prefix a%") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 5 and 
name like 'bob%'", + ) + == snapshot(name="prefix bob%") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 5 and name like 'this_%'", + ) + == snapshot(name="prefix this_% with long strings") + ) + + # Test complex patterns (slow path) + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like '%ice'", + ) + == snapshot(name="suffix %ice") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like '%o%'", + ) + == snapshot(name="contains %o%") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like 'a_e_'", + ) + == snapshot(name="wildcard pattern a_e_") + ) + + # Test edge cases + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like '%'", + ) + == snapshot(name="match all %") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name like 'nomatch%'", + ) + == snapshot(name="no matches nomatch%") + ) + + # Test LIKE on non-TEXT metadata should error + db.execute( + "create virtual table v2 using vec0(vector float[1], age int)" + ) + db.execute("insert into v2(vector, age) values ('[1]', 25)") + + assert ( + exec( + db, + "select * from v2 where vector match '[1]' and k = 1 and age like '2%'", + ) + == snapshot(name="error: LIKE on integer column") + ) + + +def test_like_case_insensitive(db, snapshot): + """Test LIKE operator is case-insensitive (SQLite default)""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data with mixed case + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'Apple'), + ('[.22]', 'BANANA'), + ('[.33]', 'Cherry'), + ('[.44]', 'DURIAN_IS_LONG'), + ('[.55]', 'elderberry_is_very_long_string'); + """ + ) + + # Test case 
insensitivity with prefix patterns (fast path) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'apple%'", + ) + == snapshot(name="lowercase pattern matches uppercase data") + ) + + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'CHERRY%'", + ) + == snapshot(name="uppercase pattern matches mixed case data") + ) + + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'DuRiAn%'", + ) + == snapshot(name="mixed case pattern matches uppercase data") + ) + + # Test case insensitivity with long strings + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'ELDERBERRY%'", + ) + == snapshot(name="uppercase pattern matches long lowercase data") + ) + + # Test case insensitivity with complex patterns (slow path) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like '%APPLE%'", + ) + == snapshot(name="complex pattern case insensitive") + ) + + +def test_like_boundary_conditions(db, snapshot): + """Test LIKE operator at 12-byte cache boundary""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data with specific lengths + # Exactly 12 bytes: fits in cache + # Exactly 13 bytes: first 12 bytes in cache, last byte requires full fetch + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'exactly_12ch'), + ('[.22]', 'exactly_13chr'), + ('[.33]', 'short'), + ('[.44]', 'this_is_14byte'), + ('[.55]', 'this_is_much_longer_than_12_bytes'); + """ + ) + + # Verify lengths + lengths = db.execute("select name, length(name) from v order by rowid").fetchall() + assert lengths[0][1] == 12, f"Expected 12 bytes, got {lengths[0][1]} for '{lengths[0][0]}'" + assert lengths[1][1] == 13, f"Expected 13 bytes, got {lengths[1][1]} for '{lengths[1][0]}'" + + # Test prefix 
matching at exactly 12 bytes + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'exactly_12%'", + ) + == snapshot(name="12-byte boundary: exact match") + ) + + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'exactly%'", + ) + == snapshot(name="12-byte boundary: prefix matches both 12 and 13 byte strings") + ) + + # Test pattern that is exactly 12 bytes (excluding %) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'exactly_13ch%'", + ) + == snapshot(name="13-byte boundary: 12-byte pattern") + ) + + # Test short pattern on long strings + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'this%'", + ) + == snapshot(name="boundary: short pattern on mixed length strings") + ) + + # Test case insensitivity at boundary + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name like 'EXACTLY_12%'", + ) + == snapshot(name="boundary: case insensitive at 12 bytes") + ) + + +def test_glob(db, snapshot): + """Test GLOB operator on text metadata columns with various patterns""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data with both short (≤12 bytes) and long (>12 bytes) strings + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'alice'), + ('[.22]', 'alex'), + ('[.33]', 'bob'), + ('[.44]', 'bobby'), + ('[.55]', 'carol'), + ('[.66]', 'this_is_a_very_long_string_name'), + ('[.77]', 'this_is_another_long_one'), + ('[.88]', 'yet_another_string'), + ('[.99]', 'zebra'); + """ + ) + + # Test prefix-only patterns (fast path) + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 5 and name glob 'a*'", + ) + == snapshot(name="prefix a*") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v 
where vector match '[1]' and k = 5 and name glob 'bob*'", + ) + == snapshot(name="prefix bob*") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 5 and name glob 'this_*'", + ) + == snapshot(name="prefix this_* with long strings") + ) + + # Test complex patterns (slow path) + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob '*ice'", + ) + == snapshot(name="suffix *ice") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob '*o*'", + ) + == snapshot(name="contains *o*") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob 'a?e?'", + ) + == snapshot(name="wildcard pattern a?e?") + ) + + # Test edge cases + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob '*'", + ) + == snapshot(name="match all *") + ) + + assert ( + exec( + db, + "select rowid, name, distance from v where vector match '[1]' and k = 9 and name glob 'nomatch*'", + ) + == snapshot(name="no matches nomatch*") + ) + + # Test GLOB on non-TEXT metadata should error + db.execute( + "create virtual table v2 using vec0(vector float[1], age int)" + ) + db.execute("insert into v2(vector, age) values ('[1]', 25)") + + assert ( + exec( + db, + "select * from v2 where vector match '[1]' and k = 1 and age glob '2*'", + ) + == snapshot(name="error: GLOB on integer column") + ) + + +def test_glob_case_sensitive(db, snapshot): + """Test GLOB operator is case-sensitive (unlike LIKE)""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data with mixed case + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'Apple'), + ('[.22]', 'BANANA'), + ('[.33]', 'Cherry'), + ('[.44]', 'DURIAN_IS_LONG'), + ('[.55]', 
'elderberry_is_very_long_string'); + """ + ) + + # Test case sensitivity with prefix patterns (fast path) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'apple*'", + ) + == snapshot(name="lowercase pattern should not match uppercase data") + ) + + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'Apple*'", + ) + == snapshot(name="exact case match Apple*") + ) + + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'CHERRY*'", + ) + == snapshot(name="uppercase pattern should not match mixed case") + ) + + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'Cherry*'", + ) + == snapshot(name="exact case match Cherry*") + ) + + # Test case sensitivity with long strings + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'ELDERBERRY*'", + ) + == snapshot(name="uppercase pattern should not match long lowercase data") + ) + + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'elderberry*'", + ) + == snapshot(name="lowercase pattern matches long lowercase data") + ) + + # Test case sensitivity with complex patterns (slow path) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob '*APPLE*'", + ) + == snapshot(name="complex pattern case sensitive") + ) + + +def test_glob_boundary_conditions(db, snapshot): + """Test GLOB operator at 12-byte cache boundary""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data with specific lengths + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'exactly_12ch'), + ('[.22]', 'exactly_13chr'), + ('[.33]', 'short'), + ('[.44]', 'this_is_14byte'), + ('[.55]', 'this_is_much_longer_than_12_bytes'); + """ + ) 
+ + # Test prefix pattern that fits in cache (fast path) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'exactly_*'", + ) + == snapshot(name="boundary: prefix pattern at boundary") + ) + + # Test that case sensitivity works at boundary + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name glob 'EXACTLY_*'", + ) + == snapshot(name="boundary: case sensitive at 12 bytes") + ) + + +def test_is_integer_metadata(db, snapshot): + """Test IS operator on integer metadata columns""" + db.execute( + "create virtual table v using vec0(vector float[1], age int, chunk_size=8)" + ) + + # Insert test data + db.execute( + """ + INSERT INTO v(vector, age) VALUES + ('[.11]', 10), + ('[.22]', 20), + ('[.33]', 30), + ('[.44]', 20), + ('[.55]', 40); + """ + ) + + # Test IS (should work like =) + assert ( + exec( + db, + "select rowid, age from v where vector match '[1]' and k = 5 and age is 20", + ) + == snapshot(name="IS 20") + ) + + # Test IS NOT (should work like !=) + assert ( + exec( + db, + "select rowid, age from v where vector match '[1]' and k = 5 and age is not 20", + ) + == snapshot(name="IS NOT 20") + ) + + # Test IS NULL (should return no rows - metadata doesn't support NULL) + assert ( + exec( + db, + "select rowid, age from v where vector match '[1]' and k = 5 and age is null", + ) + == snapshot(name="IS NULL") + ) + + # Test IS NOT NULL (should return all rows - metadata doesn't support NULL) + assert ( + exec( + db, + "select rowid, age from v where vector match '[1]' and k = 5 and age is not null", + ) + == snapshot(name="IS NOT NULL") + ) + + +def test_is_float_metadata(db, snapshot): + """Test IS operator on float metadata columns""" + db.execute( + "create virtual table v using vec0(vector float[1], score float, chunk_size=8)" + ) + + # Insert test data + db.execute( + """ + INSERT INTO v(vector, score) VALUES + ('[.11]', 1.5), + ('[.22]', 2.5), + ('[.33]', 3.5), + 
('[.44]', 2.5), + ('[.55]', 4.5); + """ + ) + + # Test IS (should work like =) + assert ( + exec( + db, + "select rowid, score from v where vector match '[1]' and k = 5 and score is 2.5", + ) + == snapshot(name="IS 2.5") + ) + + # Test IS NOT (should work like !=) + assert ( + exec( + db, + "select rowid, score from v where vector match '[1]' and k = 5 and score is not 2.5", + ) + == snapshot(name="IS NOT 2.5") + ) + + # Test IS NULL (should return no rows) + assert ( + exec( + db, + "select rowid, score from v where vector match '[1]' and k = 5 and score is null", + ) + == snapshot(name="IS NULL float") + ) + + # Test IS NOT NULL (should return all rows) + assert ( + exec( + db, + "select rowid, score from v where vector match '[1]' and k = 5 and score is not null", + ) + == snapshot(name="IS NOT NULL float") + ) + + +def test_is_text_metadata(db, snapshot): + """Test IS operator on text metadata columns""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'alice'), + ('[.22]', 'bob'), + ('[.33]', 'carol'), + ('[.44]', 'bob'), + ('[.55]', 'david'); + """ + ) + + # Test IS (should work like =) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name is 'bob'", + ) + == snapshot(name="IS bob") + ) + + # Test IS NOT (should work like !=) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name is not 'bob'", + ) + == snapshot(name="IS NOT bob") + ) + + # Test IS NULL (should return no rows) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name is null", + ) + == snapshot(name="IS NULL text") + ) + + # Test IS NOT NULL (should return all rows) + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name is not null", + ) + == snapshot(name="IS NOT NULL text") + ) + + 
+def test_is_boolean_metadata(db, snapshot): + """Test IS operator on boolean metadata columns (issue #190 use case)""" + db.execute( + "create virtual table v using vec0(vector float[1], is_hidden boolean, chunk_size=8)" + ) + + # Insert test data + db.execute( + """ + INSERT INTO v(vector, is_hidden) VALUES + ('[.11]', 0), + ('[.22]', 1), + ('[.33]', 0), + ('[.44]', 1), + ('[.55]', 0); + """ + ) + + # Test IS FALSE (the original use case from issue #190) + assert ( + exec( + db, + "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is 0", + ) + == snapshot(name="is_hidden IS false") + ) + + # Test IS TRUE + assert ( + exec( + db, + "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is 1", + ) + == snapshot(name="is_hidden IS true") + ) + + # Test IS NOT FALSE + assert ( + exec( + db, + "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is not 0", + ) + == snapshot(name="is_hidden IS NOT false") + ) + + # Test IS NULL (should return no rows) + assert ( + exec( + db, + "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is null", + ) + == snapshot(name="IS NULL boolean") + ) + + # Test IS NOT NULL (should return all rows) + assert ( + exec( + db, + "select rowid, is_hidden from v where vector match '[1]' and k = 5 and is_hidden is not null", + ) + == snapshot(name="IS NOT NULL boolean") + ) + + +def test_is_with_long_text(db, snapshot): + """Test IS operator with long text strings (>12 bytes)""" + db.execute( + "create virtual table v using vec0(vector float[1], name text, chunk_size=8)" + ) + + # Insert test data with long strings + db.execute( + """ + INSERT INTO v(vector, name) VALUES + ('[.11]', 'this_is_a_very_long_string_name'), + ('[.22]', 'another_long_string'), + ('[.33]', 'short'), + ('[.44]', 'this_is_a_very_long_string_name'), + ('[.55]', 'yet_another_long_one'); + """ + ) + + # Test IS with long string + assert ( + exec( + db, + "select 
rowid, name from v where vector match '[1]' and k = 5 and name is 'this_is_a_very_long_string_name'", + ) + == snapshot(name="IS long string") + ) + + # Test IS NOT with long string + assert ( + exec( + db, + "select rowid, name from v where vector match '[1]' and k = 5 and name is not 'this_is_a_very_long_string_name'", + ) + == snapshot(name="IS NOT long string") + ) + + +def test_is_equivalence_to_eq(db, snapshot): + """Verify IS behaves identically to = for non-NULL values""" + db.execute( + "create virtual table v using vec0(vector float[1], age int, name text, chunk_size=8)" + ) + + db.execute( + """ + INSERT INTO v(vector, age, name) VALUES + ('[.11]', 10, 'alice'), + ('[.22]', 20, 'bob'), + ('[.33]', 30, 'carol'); + """ + ) + + # IS should give same results as = + result_is = exec( + db, + "select rowid from v where vector match '[1]' and k = 5 and age is 20", + ) + result_eq = exec( + db, + "select rowid from v where vector match '[1]' and k = 5 and age = 20", + ) + assert result_is["rows"] == result_eq["rows"], "IS should behave like =" + + # IS NOT should give same results as != + result_isnot = exec( + db, + "select rowid from v where vector match '[1]' and k = 5 and name is not 'bob'", + ) + result_ne = exec( + db, + "select rowid from v where vector match '[1]' and k = 5 and name != 'bob'", + ) + assert result_isnot["rows"] == result_ne["rows"], "IS NOT should behave like !=" + + +def test_vacuum(db, snapshot): + db.execute( + "create virtual table v using vec0(vector float[1], name text)" + ) + db.executemany( + "insert into v(vector, name) values (?, ?)", + [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")], + ) + + exec(db, "delete from v where 1 = 1") + prev_page_count = exec(db, "pragma page_count")["rows"][0]["page_count"] + + db.execute("insert into v(v) values ('optimize')") + db.commit() + db.execute("vacuum") + + cur_page_count = exec(db, "pragma page_count")["rows"][0]["page_count"] + assert cur_page_count < prev_page_count + + 
SUPPORTS_VTAB_IN = sqlite3.sqlite_version_info[1] >= 38 diff --git a/tests/test-optimize-reclaims-space.py b/tests/test-optimize-reclaims-space.py new file mode 100644 index 00000000..9123d637 --- /dev/null +++ b/tests/test-optimize-reclaims-space.py @@ -0,0 +1,123 @@ +import os +import pytest + + +def load_vec_extension(db): + if not hasattr(db, "load_extension"): + pytest.skip("SQLite build does not support loading extensions") + if hasattr(db, "enable_load_extension"): + db.enable_load_extension(True) + ext = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "dist", "vec0")) + try: + # Explicit entrypoint to avoid relying on default name + db.load_extension(ext, "sqlite3_vec_init") + except Exception: + # Some loaders accept missing suffix path without explicit entrypoint + db.load_extension(ext) + + +def pragma_int(db, sql): + return db.execute(sql).fetchone()[0] + + +def test_optimize_reclaims_pages_with_autovacuum_incremental(tmp_path): + try: + import pysqlite3 as sqlite3 # uses bundled modern SQLite with extension loading + except ImportError: # fallback if not available + import sqlite3 + + db_path = tmp_path / "optimize_reclaim.db" + + db = sqlite3.connect(str(db_path)) + db.row_factory = sqlite3.Row + + # Enable autovacuum before creating vec tables; VACUUM is safe here because + # the database only has SQLite system tables at this point. 
+ db.execute("PRAGMA auto_vacuum = INCREMENTAL") + db.execute("VACUUM") + db.execute("PRAGMA journal_mode = WAL") + + load_vec_extension(db) + + # Use a modest chunk_size so we create several chunks and can reclaim them + db.execute("create virtual table v using vec0(vector float[1], chunk_size=64)") + + # Insert 256 rows (four chunks at chunk_size=64) + db.executemany( + "insert into v(rowid, vector) values(?, ?)", + ((i, b"\x11\x11\x11\x11") for i in range(1, 257)), + ) + db.commit() + chunk_rows_after_insert = pragma_int(db, "select count(*) from v_chunks") + + # Delete half the rows to create free space inside vec shadow tables + db.execute("delete from v where rowid > 128") + db.commit() + chunk_rows_after_delete = pragma_int(db, "select count(*) from v_chunks") + + # Compact vec shadow tables and reclaim file pages with autovacuum + db.execute("insert into v(v) values('optimize')") + db.commit() + db.execute("PRAGMA wal_checkpoint(TRUNCATE)") + db.execute("PRAGMA incremental_vacuum") + chunk_rows_after_optimize = pragma_int(db, "select count(*) from v_chunks") + + # Initially 256 rows at chunk_size 64 -> 4 chunk rows. After deleting half, + # optimize should compact to 2 chunk rows. 
+ assert chunk_rows_after_insert == 4 + assert chunk_rows_after_delete == 4 + assert chunk_rows_after_optimize == 2 + + +def test_optimize_then_vacuum_allows_future_writes(tmp_path): + try: + import pysqlite3 as sqlite3 # uses bundled modern SQLite with extension loading + except ImportError: + import sqlite3 + + db_path = tmp_path / "vacuum_safe.db" + + db = sqlite3.connect(str(db_path)) + db.row_factory = sqlite3.Row + load_vec_extension(db) + + db.execute("PRAGMA journal_mode = WAL") + db.execute("create virtual table v using vec0(vector float[1], chunk_size=8)") + + # 32 rows -> 4 chunks at chunk_size=8 + db.executemany( + "insert into v(rowid, vector) values(?, ?)", + ((i, b"\x11\x11\x11\x11") for i in range(1, 33)), + ) + db.commit() + + # Delete half, then compact + db.execute("delete from v where rowid > 16") + db.commit() + db.execute("insert into v(v) values('optimize')") + db.commit() + + # Checkpoint before VACUUM; capture size/page count + db.execute("PRAGMA wal_checkpoint(TRUNCATE)") + size_before_vacuum = db.execute("PRAGMA page_count").fetchone()[0] + disk_bytes_before = os.stat(db_path).st_size + + # VACUUM should preserve shadow table consistency + db.execute("VACUUM") + db.execute("PRAGMA journal_mode = WAL") + size_after_vacuum = db.execute("PRAGMA page_count").fetchone()[0] + disk_bytes_after = os.stat(db_path).st_size + + # Insert more rows after VACUUM; expect no blob-open failures + db.executemany( + "insert into v(rowid, vector) values(?, ?)", + ((i, b"\x22\x22\x22\x22") for i in range(17, 25)), + ) + db.commit() + + # Row count and chunk rows should be consistent (3 chunks cover 24 rows) + assert db.execute("select count(*) from v").fetchone()[0] == 24 + assert db.execute("select count(*) from v_chunks").fetchone()[0] == 3 + # File/page count should not grow; should shrink when pages are freed + assert size_after_vacuum <= size_before_vacuum + assert disk_bytes_after <= disk_bytes_before diff --git a/tests/test-partition-keys.py 
b/tests/test-partition-keys.py index fee35600..6e9042a6 100644 --- a/tests/test-partition-keys.py +++ b/tests/test-partition-keys.py @@ -74,6 +74,32 @@ def test_updates(db, snapshot): ) +def test_vacuum(db, snapshot): + db.execute( + "create virtual table v using vec0(p text partition key, a float[1])" + ) + + db.execute( + "insert into v(rowid, p, a) values (?, ?, ?)", [1, "a", b"\x11\x11\x11\x11"] + ) + db.execute( + "insert into v(rowid, p, a) values (?, ?, ?)", [2, "a", b"\x22\x22\x22\x22"] + ) + db.execute( + "insert into v(rowid, p, a) values (?, ?, ?)", [3, "a", b"\x33\x33\x33\x33"] + ) + + exec(db, "delete from v where 1 = 1") + prev_page_count = exec(db, "pragma page_count")["rows"][0]["page_count"] + + db.execute("insert into v(v) values ('optimize')") + db.commit() + db.execute("vacuum") + + cur_page_count = exec(db, "pragma page_count")["rows"][0]["page_count"] + assert cur_page_count < prev_page_count + + class Row: def __init__(self): pass