diff --git a/API.md b/API.md index b0045e8..5e3d5c9 100644 --- a/API.md +++ b/API.md @@ -311,6 +311,7 @@ Indexes caller-provided file content without reading from the filesystem. - With `preserve_duplicate_paths=1`, an empty `content` value and a trailing slash in `path` creates an explicit empty directory marker, for example `memory_add_content('dirname/', '')` - Directory markers are stored in `dbmem_content` with a trailing slash path, are shown as directories by `memory_list_files()`, and are not indexed for search - Directory marker paths cannot contain non-empty content and cannot conflict with a file path of the same name +- With `defer_embeddings=1`, content is stored without computing embeddings or FTS entries (no embedding model required); generate them later with `memory_embed_pending()` - Available even when compiled with `DBMEM_OMIT_IO` **Example:** @@ -433,11 +434,12 @@ Returns a JSON tree with the indexed directories and files stored in `dbmem_cont - Directory nodes are derived from indexed file paths and explicit directory markers - Path separators are normalized to `/` in the returned JSON - Sibling nodes are sorted with directories first, then files; each group is alphabetical +- File nodes include an `indexed` boolean: `false` while content is waiting for embedding generation (see `defer_embeddings` and `memory_embed_pending()`), `true` otherwise **Example:** ```sql SELECT memory_list_files(); --- {"root":"","children":[{"type":"directory","name":"docs","path":"docs","children":[{"type":"file","name":"readme.md","path":"docs/readme.md"}]}]} +-- {"root":"","children":[{"type":"directory","name":"docs","path":"docs","children":[{"type":"file","name":"readme.md","path":"docs/readme.md","indexed":true}]}]} ``` --- @@ -648,6 +650,60 @@ SELECT memory_reindex(); --- +#### `memory_embed_pending([limit INTEGER])` + +Generates embeddings and FTS entries for content stored without them (see the `defer_embeddings` option). 
+ +**Parameters:** +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `limit` | INTEGER | No | Maximum number of pending content rows to process in this call (must be positive). When omitted, all pending rows are processed | + +**Returns:** INTEGER - Number of pending content rows processed + +**Notes:** +- Requires an embedding model configured with `memory_set_model()` or loaded from persisted provider/model settings +- A content row is pending when it has a non-empty stored `value` and no `dbmem_vault` entries +- Each row is processed in its own SAVEPOINT transaction, so a row is either fully indexed or untouched and a failed or interrupted call can simply be retried. When the call is not wrapped in an outer transaction, each row is committed as it completes, so other connections can observe per-file progress while a batch is running +- Content whose parsing produces no chunks (e.g. whitespace-only text) is marked as processed so it is not retried +- Designed for background workers: call in a loop with a small `limit` and poll `memory_pending_count()` to report progress +- Returns 0 when nothing is pending + +**Example:** +```sql +-- store content instantly, without embeddings +SELECT memory_set_option('defer_embeddings', 1); +SELECT memory_add_content('docs/api.md', '# API\nUploaded from the dashboard.'); + +-- later, from a background process: embed in batches of 10 +SELECT memory_embed_pending(10); + +-- or process the whole backlog in one call +SELECT memory_embed_pending(); +``` + +--- + +#### `memory_pending_count()` + +Returns the number of content rows waiting for embedding generation.
+ +**Parameters:** None + +**Returns:** INTEGER - Number of pending content rows + +**Notes:** +- Counts rows with a non-empty stored `value` and no `dbmem_vault` entries +- Useful for progress reporting: capture the count before starting a `memory_embed_pending()` loop as `total`, then report `1 - pending/total` while the loop is running +- Empty files and directory markers are never counted as pending + +**Example:** +```sql +SELECT memory_pending_count(); +``` + +--- + ### `memory_search` A virtual table for performing hybrid semantic search. @@ -846,6 +902,7 @@ sqlite3_memory_register_provider(db, "my-engine", &provider); | `cache_max_entries` | INTEGER | 0 | Max cache entries (0 = no limit). When exceeded, oldest entries are evicted | | `search_oversample` | INTEGER | 0 | Search oversampling multiplier (0 = no oversampling). When set, retrieves N * multiplier candidates from each index before merging down to N final results | | `preserve_duplicate_paths` | INTEGER | 0 | Preserve distinct logical paths for identical or empty content. When enabled, `dbmem_content.hash` is path-scoped and identifies an entry rather than only the raw content | +| `defer_embeddings` | INTEGER | 0 | Store content without computing embeddings or FTS entries. Deferred content is invisible to search until processed with `memory_embed_pending()` or `memory_reindex()`. Requires `save_content=1` | --- diff --git a/README.md b/README.md index e899cd8..c4c5d8c 100644 --- a/README.md +++ b/README.md @@ -245,6 +245,25 @@ Directory markers are listed as directories, materialized as directories by `mem This makes all sync functions safe to call repeatedly - for example, on a cron schedule or at agent startup - with minimal overhead. +## Deferred Embeddings + +For interactive workflows (e.g.
a dashboard upload) where content should appear immediately and embeddings can be computed later by a background process, enable deferred mode: + +```sql +-- store content instantly: no embedding model needed, nothing is computed +SELECT memory_set_option('defer_embeddings', 1); +SELECT memory_add_content('docs/api.md', '# API\nUploaded from the dashboard.'); + +-- pending files are visible right away ("indexed":false in the JSON tree) +SELECT memory_list_files(); + +-- later, from a background worker: embed in batches and report progress +SELECT memory_embed_pending(10); -- returns rows processed in this batch +SELECT memory_pending_count(); -- rows still waiting +``` + +Deferred content is stored in `dbmem_content` but is invisible to `memory_search` until it is embedded. Each file is embedded in its own transaction, so a file is either fully indexed or still pending — an interrupted worker can simply be restarted, and other connections can watch progress while a batch runs. + ## Agent Memory Sync Multiple agents can share and merge knowledge without any coordination. Each agent works independently with its own local SQLite database, syncing through a shared [SQLiteCloud](https://sqlitecloud.io/) managed database when connectivity is available. 
diff --git a/src/sqlite-memory.c b/src/sqlite-memory.c index 3a47fff..a783f5a 100644 --- a/src/sqlite-memory.c +++ b/src/sqlite-memory.c @@ -65,6 +65,7 @@ SQLITE_EXTENSION_INIT1 #define DBMEM_SETTINGS_KEY_CACHE_MAX_ENTRIES "cache_max_entries" #define DBMEM_SETTINGS_KEY_SEARCH_OVERSAMPLE "search_oversample" #define DBMEM_SETTINGS_KEY_PRESERVE_DUP_PATHS "preserve_duplicate_paths" +#define DBMEM_SETTINGS_KEY_DEFER_EMBEDDINGS "defer_embeddings" #define DBMEM_SETTINGS_KEY_SCHEMA_VERSION "schema_version" #define DBMEM_SCHEMA_VERSION 4 @@ -128,6 +129,7 @@ struct dbmem_context { int cache_max_entries; // Max cache entries (0 = no limit) int search_oversample; // Search oversampling multiplier (0 = no oversampling) bool preserve_duplicate_paths; // Keep separate rows for distinct paths with identical content + bool defer_embeddings; // Store content without computing embeddings (use memory_embed_pending later) // Cache float *cache_buffer; // Reusable buffer for cache hits @@ -135,6 +137,7 @@ struct dbmem_context { // Runtime state int64_t counter; // Chunk counter during file processing + int64_t chunks_added; // Vault rows inserted while processing the current buffer uint64_t hash; // Hash of the current text const char *context; // Optional context string for current operation const char *path; // Portable relative file path (optional) @@ -151,7 +154,7 @@ typedef struct dbmem_json_buffer dbmem_json_buffer; static int dbmem_database_begin_transaction (sqlite3 *db); static int dbmem_database_commit_transaction (sqlite3 *db); static int dbmem_database_rollback_transaction (sqlite3 *db); -static int dbmem_json_append_tree_children (dbmem_json_buffer *json, dbmem_string_list *paths, int start, int end, size_t offset); +static int dbmem_json_append_tree_children (dbmem_json_buffer *json, dbmem_string_list *paths, const unsigned char *flags, int start, int end, size_t offset); static char *dbmem_path_normalized_copy (const char *path); static char 
*dbmem_path_unique_storage_copy (sqlite3 *db, const char *preferred_path, const char *source_path); static char *dbmem_path_directory_marker_storage_copy (const char *path); @@ -349,6 +352,12 @@ static int dbmem_settings_sync (dbmem_context *ctx, const char *key, sqlite3_val return 0; } + if (strcasecmp(key, DBMEM_SETTINGS_KEY_DEFER_EMBEDDINGS) == 0) { + int n = sqlite3_value_int(value); + ctx->defer_embeddings = (n > 0) ? 1 : 0; + return 0; + } + if (strcasecmp(key, DBMEM_SETTINGS_KEY_PROVIDER) == 0) { char *provider = dbmem_strdup((const char *)sqlite3_value_text(value)); if (provider) { @@ -1040,6 +1049,27 @@ static int dbmem_database_add_fts5 (dbmem_context *ctx, const char *text, size_t return rc; } +static int dbmem_database_add_vault_sentinel (dbmem_context *ctx) { + // a zero-length embedding marks content whose parsing produced no chunks as processed, + // so it is excluded from the pending predicate (sqlite-vector skips undersized blobs during scans) + static const char *sql = "INSERT INTO dbmem_vault (hash, seq, embedding, offset, length, n_tokens, truncated) VALUES (?1, 0, zeroblob(0), 0, 0, 0, 0);"; + + sqlite3_stmt *vm = NULL; + int rc = sqlite3_prepare_v2(ctx->db, sql, -1, &vm, NULL); + if (rc != SQLITE_OK) goto cleanup; + + rc = dbmem_bind_hash(vm, 1, ctx->hash); + if (rc != SQLITE_OK) goto cleanup; + + rc = sqlite3_step(vm); + if (rc == SQLITE_DONE) rc = SQLITE_OK; + +cleanup: + if (rc != SQLITE_OK) DEBUG_DBMEM_ALWAYS("Error in dbmem_database_add_vault_sentinel: %s", sqlite3_errmsg(ctx->db)); + if (vm) sqlite3_finalize(vm); + return rc; +} + static int dbmem_database_begin_transaction (sqlite3 *db) { return sqlite3_exec(db, "SAVEPOINT " DBMEM_SAVEPOINT_NAME ";", NULL, NULL, NULL); } @@ -2070,7 +2100,7 @@ static int dbmem_path_group_file_index (dbmem_string_list *paths, int start, int return -1; } -static int dbmem_json_append_file_node (dbmem_json_buffer *json, const char *path, size_t segment_start, size_t segment_end) { +static int 
dbmem_json_append_file_node (dbmem_json_buffer *json, const char *path, bool indexed, size_t segment_start, size_t segment_end) { int rc = dbmem_json_buffer_append(json, "{\"type\":\"file\",\"name\":"); if (rc != SQLITE_OK) return rc; rc = dbmem_json_buffer_append_escaped_len(json, path + segment_start, segment_end - segment_start); @@ -2079,10 +2109,12 @@ static int dbmem_json_append_file_node (dbmem_json_buffer *json, const char *pat if (rc != SQLITE_OK) return rc; rc = dbmem_json_buffer_append_escaped(json, path); if (rc != SQLITE_OK) return rc; + rc = dbmem_json_buffer_append(json, indexed ? ",\"indexed\":true" : ",\"indexed\":false"); + if (rc != SQLITE_OK) return rc; return dbmem_json_buffer_append_char(json, '}'); } -static int dbmem_json_append_directory_node (dbmem_json_buffer *json, dbmem_string_list *paths, int start, int end, size_t offset) { +static int dbmem_json_append_directory_node (dbmem_json_buffer *json, dbmem_string_list *paths, const unsigned char *flags, int start, int end, size_t offset) { const char *path = paths->items[start]; size_t segment_start = dbmem_path_segment_start(path, offset); size_t segment_end = dbmem_path_segment_end(path, segment_start); @@ -2097,12 +2129,12 @@ static int dbmem_json_append_directory_node (dbmem_json_buffer *json, dbmem_stri if (rc != SQLITE_OK) return rc; rc = dbmem_json_buffer_append(json, ",\"children\":"); if (rc != SQLITE_OK) return rc; - rc = dbmem_json_append_tree_children(json, paths, start, end, segment_end); + rc = dbmem_json_append_tree_children(json, paths, flags, start, end, segment_end); if (rc != SQLITE_OK) return rc; return dbmem_json_buffer_append_char(json, '}'); } -static int dbmem_json_append_tree_children (dbmem_json_buffer *json, dbmem_string_list *paths, int start, int end, size_t offset) { +static int dbmem_json_append_tree_children (dbmem_json_buffer *json, dbmem_string_list *paths, const unsigned char *flags, int start, int end, size_t offset) { int rc = 
dbmem_json_buffer_append_char(json, '['); if (rc != SQLITE_OK) return rc; @@ -2130,12 +2162,13 @@ static int dbmem_json_append_tree_children (dbmem_json_buffer *json, dbmem_strin first = false; if (emit_directory) { - rc = dbmem_json_append_directory_node(json, paths, i, group_end, offset); + rc = dbmem_json_append_directory_node(json, paths, flags, i, group_end, offset); } else { const char *file_path = paths->items[file_index]; + bool indexed = flags ? (flags[file_index] != 0) : true; segment_start = dbmem_path_segment_start(file_path, offset); size_t segment_end = dbmem_path_segment_end(file_path, segment_start); - rc = dbmem_json_append_file_node(json, file_path, segment_start, segment_end); + rc = dbmem_json_append_file_node(json, file_path, indexed, segment_start, segment_end); } if (rc != SQLITE_OK) return rc; } @@ -2147,7 +2180,18 @@ static int dbmem_json_append_tree_children (dbmem_json_buffer *json, dbmem_strin return dbmem_json_buffer_append_char(json, ']'); } -static int dbmem_paths_to_json (dbmem_string_list *paths, char **result) { +typedef struct { + char *path; + unsigned char indexed; +} dbmem_path_entry; + +static int dbmem_path_entry_compare (const void *a, const void *b) { + const dbmem_path_entry *ea = (const dbmem_path_entry *)a; + const dbmem_path_entry *eb = (const dbmem_path_entry *)b; + return dbmem_path_tree_compare(&ea->path, &eb->path); +} + +static int dbmem_paths_to_json (dbmem_string_list *paths, unsigned char *flags, char **result) { dbmem_json_buffer json = {0}; int rc = SQLITE_OK; size_t prefix_len = dbmem_common_directory_prefix_len(paths); @@ -2161,12 +2205,24 @@ static int dbmem_paths_to_json (dbmem_string_list *paths, char **result) { } if (paths->count > 1) { - qsort(paths->items, (size_t)paths->count, sizeof(char *), dbmem_path_tree_compare); + // sort paths and indexed flags together so flags keep matching their path by index + dbmem_path_entry *entries = (dbmem_path_entry *)dbmemory_alloc((uint64_t)paths->count * 
sizeof(dbmem_path_entry)); + if (!entries) { rc = SQLITE_NOMEM; goto cleanup; } + for (int i = 0; i < paths->count; i++) { + entries[i].path = paths->items[i]; + entries[i].indexed = flags ? flags[i] : 1; + } + qsort(entries, (size_t)paths->count, sizeof(dbmem_path_entry), dbmem_path_entry_compare); + for (int i = 0; i < paths->count; i++) { + paths->items[i] = entries[i].path; + if (flags) flags[i] = entries[i].indexed; + } + dbmemory_free(entries); } rc = dbmem_json_buffer_append(&json, "{\"root\":\"\",\"children\":"); if (rc != SQLITE_OK) goto cleanup; - rc = dbmem_json_append_tree_children(&json, paths, 0, paths->count, 0); + rc = dbmem_json_append_tree_children(&json, paths, flags, 0, paths->count, 0); if (rc != SQLITE_OK) goto cleanup; rc = dbmem_json_buffer_append_char(&json, '}'); if (rc != SQLITE_OK) goto cleanup; @@ -2185,9 +2241,12 @@ static void dbmem_list_files (sqlite3_context *context, int argc, sqlite3_value sqlite3 *db = sqlite3_context_db_handle(context); sqlite3_stmt *vm = NULL; dbmem_string_list paths = {0}; + unsigned char *flags = NULL; + int flags_capacity = 0; char *json = NULL; int rc = sqlite3_prepare_v2(db, - "SELECT path FROM dbmem_content WHERE path IS NOT NULL AND path != '';", + "SELECT path, (length = 0 OR EXISTS (SELECT 1 FROM dbmem_vault v WHERE v.hash = dbmem_content.hash)) " + "FROM dbmem_content WHERE path IS NOT NULL AND path != '';", -1, &vm, NULL); if (rc != SQLITE_OK) goto cleanup; @@ -2196,15 +2255,25 @@ static void dbmem_list_files (sqlite3_context *context, int argc, sqlite3_value char *copy = dbmem_strdup(path); rc = dbmem_string_list_add(&paths, copy); if (rc != SQLITE_OK) goto cleanup; + + if (paths.count > flags_capacity) { + int new_capacity = flags_capacity ? 
flags_capacity * 2 : 8; + unsigned char *new_flags = (unsigned char *)dbmemory_realloc(flags, (uint64_t)new_capacity); + if (!new_flags) { rc = SQLITE_NOMEM; goto cleanup; } + flags = new_flags; + flags_capacity = new_capacity; + } + flags[paths.count - 1] = (unsigned char)(sqlite3_column_int(vm, 1) != 0); } if (rc == SQLITE_DONE) rc = SQLITE_OK; if (rc != SQLITE_OK) goto cleanup; - rc = dbmem_paths_to_json(&paths, &json); + rc = dbmem_paths_to_json(&paths, flags, &json); cleanup: if (vm) sqlite3_finalize(vm); dbmem_string_list_free(&paths); + if (flags) dbmemory_free(flags); if (rc == SQLITE_OK) { sqlite3_result_text(context, json ? json : "{\"root\":\"\",\"children\":[]}", -1, json ? dbmemory_free : SQLITE_TRANSIENT); @@ -2643,7 +2712,8 @@ static void dbmem_get_option (sqlite3_context *context, int argc, sqlite3_value rc = sqlite3_step(vm); if (rc == SQLITE_DONE) { - if (strcasecmp(key, DBMEM_SETTINGS_KEY_PRESERVE_DUP_PATHS) == 0) { + if (strcasecmp(key, DBMEM_SETTINGS_KEY_PRESERVE_DUP_PATHS) == 0 || + strcasecmp(key, DBMEM_SETTINGS_KEY_DEFER_EMBEDDINGS) == 0) { sqlite3_result_int(context, 0); } else { sqlite3_result_null(context); @@ -2860,6 +2930,7 @@ static int dbmem_process_callback (const char *text, size_t len, size_t offset, dbmem_context_set_error(ctx, sqlite3_errmsg(ctx->db)); goto cleanup; } + ctx->chunks_added++; DEBUG_EMBEDDING(&result); // save FTS5 (if available) @@ -2875,6 +2946,11 @@ static int dbmem_process_callback (const char *text, size_t len, size_t offset, } static int dbmem_process_buffer (dbmem_context *ctx, const char *buffer, int64_t len) { + if (ctx->defer_embeddings && !ctx->reindex_mode && !ctx->save_content) { + dbmem_context_set_error(ctx, "defer_embeddings requires save_content to be enabled"); + return SQLITE_ERROR; + } + uint64_t hash = dbmem_storage_hash_compute(buffer, (size_t)len, ctx->path, ctx->preserve_duplicate_paths); const char *saved_path = ctx->path; char *unique_path = NULL; @@ -2931,10 +3007,18 @@ static int 
dbmem_process_buffer (dbmem_context *ctx, const char *buffer, int64_t } if (len == 0) goto cleanup; + if (ctx->defer_embeddings && !ctx->reindex_mode) goto cleanup; + ctx->chunks_added = 0; rc = dbmem_parse(buffer, (size_t)len, &settings); - if (rc == SQLITE_OK && !ctx->dimension_saved) { + if (rc == SQLITE_OK && ctx->chunks_added == 0) { + rc = dbmem_database_add_vault_sentinel(ctx); + } + + // persist the dimension only after a real embedding established it: + // a zero-chunk parse would latch dimension=0 and block the real write + if (rc == SQLITE_OK && ctx->chunks_added > 0 && !ctx->dimension_saved) { // make sure to serialize dimension dbmem_settings_write_int(db, DBMEM_SETTINGS_KEY_DIMENSION, ctx->dimension); ctx->dimension_saved = true; @@ -3938,6 +4022,146 @@ static void dbmem_sql_reindex (sqlite3_context *context, int argc, sqlite3_value sqlite3_result_int64(context, processed); } +// content is pending when it has a body to index but no vault rows (real chunks or sentinel) yet +#define DBMEM_PENDING_PREDICATE \ + "length > 0 AND value IS NOT NULL AND " \ + "NOT EXISTS (SELECT 1 FROM dbmem_vault v WHERE v.hash = dbmem_content.hash)" + +static void dbmem_embed_pending (sqlite3_context *context, int argc, sqlite3_value **argv) { + dbmem_context *ctx = (dbmem_context *)sqlite3_user_data(context); + sqlite3 *db = ctx->db; + + sqlite3_int64 limit = -1; + if (argc == 1) { + if (sqlite3_value_type(argv[0]) != SQLITE_INTEGER || sqlite3_value_int64(argv[0]) <= 0) { + sqlite3_result_error(context, "The function memory_embed_pending expects a positive INTEGER limit", -1); + return; + } + limit = sqlite3_value_int64(argv[0]); + } + + if (!ctx->model) { + sqlite3_result_error(context, "memory_embed_pending: no embedding model configured", -1); + return; + } + + ctx->reindex_mode = true; + dbmem_context_reset_temp_values(ctx); + + int64_t processed = 0; + int rc = SQLITE_OK; + + while (limit < 0 || processed < limit) { + sqlite3_stmt *vm = NULL; + rc = 
sqlite3_prepare_v2(db, + "SELECT hash, path, value, context FROM dbmem_content " + "WHERE " DBMEM_PENDING_PREDICATE " LIMIT 1;", + -1, &vm, NULL); + if (rc != SQLITE_OK) break; + + int step = sqlite3_step(vm); + if (step == SQLITE_DONE) { + sqlite3_finalize(vm); + break; + } + if (step != SQLITE_ROW) { + sqlite3_finalize(vm); + rc = step; + break; + } + + // Copy row data before finalizing so we can write in the next step + const char *hash_raw = (const char *)sqlite3_column_text(vm, 0); + const char *path_raw = (const char *)sqlite3_column_text(vm, 1); + const char *value_raw = (const char *)sqlite3_column_text(vm, 2); + int64_t value_len = (int64_t)sqlite3_column_bytes(vm, 2); + const char *ctx_raw = (const char *)sqlite3_column_text(vm, 3); + + char *hash_text = dbmem_strdup(hash_raw); + char *path = dbmem_strdup(path_raw); + char *value = (char *)sqlite3_malloc64((sqlite3_uint64)(value_len + 1)); + if (value) { memcpy(value, value_raw, (size_t)value_len); value[value_len] = '\0'; } + char *ctx_name = dbmem_strdup(ctx_raw); + + sqlite3_finalize(vm); + + if (!hash_text || !path || !value) { + dbmemory_free(hash_text); + dbmemory_free(path); + if (value) sqlite3_free(value); + dbmemory_free(ctx_name); + rc = SQLITE_NOMEM; + break; + } + + uint64_t stored_hash = 0; + if (!dbmem_hash_from_hex(hash_text, &stored_hash)) { + dbmemory_free(hash_text); + dbmemory_free(path); + sqlite3_free(value); + dbmemory_free(ctx_name); + rc = SQLITE_MISMATCH; + break; + } + + // process_buffer recomputes the hash from value/path; when the stored hash was computed + // with a different preserve_duplicate_paths scope, rekey the row to keep the new vault + // rows attached to it (same approach as memory_reindex) + uint64_t target_hash = dbmem_storage_hash_compute(value, (size_t)value_len, path, ctx->preserve_duplicate_paths); + bool target_has_vault = dbmem_database_hash_has_vault(db, target_hash); + + if (!target_has_vault) { + ctx->path = path; + ctx->context = ctx_name; + rc = 
dbmem_process_buffer(ctx, value, value_len); + } + + if (rc == SQLITE_OK && target_hash != stored_hash) { + rc = dbmem_database_update_content_hash(db, path, target_hash); + } + + ctx->path = NULL; + ctx->context = NULL; + dbmemory_free(hash_text); + dbmemory_free(path); + sqlite3_free(value); + dbmemory_free(ctx_name); + + if (rc != SQLITE_OK) break; + processed++; + } + + ctx->reindex_mode = false; + ctx->path = NULL; + ctx->context = NULL; + + if (rc != SQLITE_OK) { + sqlite3_result_error(context, ctx->error_msg[0] ? ctx->error_msg : sqlite3_errmsg(db), -1); + return; + } + + sqlite3_result_int64(context, processed); +} + +static void dbmem_pending_count (sqlite3_context *context, int argc, sqlite3_value **argv) { + UNUSED_PARAM(argc); UNUSED_PARAM(argv); + static const char *sql = "SELECT COUNT(*) FROM dbmem_content WHERE " DBMEM_PENDING_PREDICATE ";"; + + sqlite3 *db = sqlite3_context_db_handle(context); + sqlite3_stmt *vm = NULL; + int rc = sqlite3_prepare_v2(db, sql, -1, &vm, NULL); + if (rc == SQLITE_OK) { + rc = sqlite3_step(vm); + if (rc == SQLITE_ROW) { + sqlite3_result_int64(context, sqlite3_column_int64(vm, 0)); + rc = SQLITE_OK; + } + } + + if (vm) sqlite3_finalize(vm); + if (rc != SQLITE_OK) sqlite3_result_error(context, sqlite3_errmsg(db), -1); +} + // MARK: - Sync static void dbmem_enable_sync (sqlite3_context *context, int argc, sqlite3_value **argv) { @@ -4163,6 +4387,15 @@ SQLITE_DBMEMORY_API int sqlite3_memory_init (sqlite3 *db, char **pzErrMsg, const rc = sqlite3_create_function_v2(db, "memory_reindex", 0, SQLITE_UTF8, ctx, dbmem_sql_reindex, NULL, NULL, NULL); if (rc != SQLITE_OK) { dbmem_context_free(ctx); return rc; } + rc = sqlite3_create_function_v2(db, "memory_embed_pending", 0, SQLITE_UTF8, ctx, dbmem_embed_pending, NULL, NULL, NULL); + if (rc != SQLITE_OK) { dbmem_context_free(ctx); return rc; } + + rc = sqlite3_create_function_v2(db, "memory_embed_pending", 1, SQLITE_UTF8, ctx, dbmem_embed_pending, NULL, NULL, NULL); + if (rc != 
SQLITE_OK) { dbmem_context_free(ctx); return rc; } + + rc = sqlite3_create_function_v2(db, "memory_pending_count", 0, SQLITE_UTF8, ctx, dbmem_pending_count, NULL, NULL, NULL); + if (rc != SQLITE_OK) { dbmem_context_free(ctx); return rc; } + rc = sqlite3_create_function_v2(db, "memory_enable_sync", -1, SQLITE_UTF8, ctx, dbmem_enable_sync, NULL, NULL, NULL); if (rc != SQLITE_OK) { dbmem_context_free(ctx); return rc; } diff --git a/test/unittest.c b/test/unittest.c index 5d4fb4d..e1b3cfb 100644 --- a/test/unittest.c +++ b/test/unittest.c @@ -2028,7 +2028,7 @@ TEST(sqlite_memory_list_files_strips_common_full_path) { char json[1024]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"directory\",\"name\":\"nested\",\"path\":\"docs/nested\",\"children\":[{\"type\":\"file\",\"name\":\"beta.md\",\"path\":\"docs/nested/beta.md\"}]},{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"docs/alpha.md\"}]},{\"type\":\"file\",\"name\":\"zeta.md\",\"path\":\"zeta.md\"}]}"); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"directory\",\"name\":\"nested\",\"path\":\"docs/nested\",\"children\":[{\"type\":\"file\",\"name\":\"beta.md\",\"path\":\"docs/nested/beta.md\",\"indexed\":false}]},{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"docs/alpha.md\",\"indexed\":false}]},{\"type\":\"file\",\"name\":\"zeta.md\",\"path\":\"zeta.md\",\"indexed\":false}]}"); sqlite3_close(db); } @@ -2047,7 +2047,7 @@ TEST(sqlite_memory_list_files_keeps_relative_paths) { char json[1024]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, 
"{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"notes\",\"path\":\"notes\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"notes/docs\",\"children\":[{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"notes/docs/alpha.md\"}]},{\"type\":\"file\",\"name\":\"zeta.md\",\"path\":\"notes/zeta.md\"}]}]}"); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"notes\",\"path\":\"notes\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"notes/docs\",\"children\":[{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"notes/docs/alpha.md\",\"indexed\":false}]},{\"type\":\"file\",\"name\":\"zeta.md\",\"path\":\"notes/zeta.md\",\"indexed\":false}]}]}"); sqlite3_close(db); } @@ -2065,7 +2065,7 @@ TEST(sqlite_memory_list_files_strips_single_full_path_directory) { char json[512]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"file\",\"name\":\"readme.md\",\"path\":\"readme.md\"}]}"); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"file\",\"name\":\"readme.md\",\"path\":\"readme.md\",\"indexed\":false}]}"); sqlite3_close(db); } @@ -2084,7 +2084,7 @@ TEST(sqlite_memory_list_files_normalizes_windows_separators) { char json[1024]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"file\",\"name\":\"beta.md\",\"path\":\"docs/beta.md\"}]},{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"alpha.md\"}]}"); + ASSERT_STR_EQ(json, 
"{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"file\",\"name\":\"beta.md\",\"path\":\"docs/beta.md\",\"indexed\":false}]},{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"alpha.md\",\"indexed\":false}]}"); sqlite3_close(db); } @@ -2103,7 +2103,7 @@ TEST(sqlite_memory_list_files_does_not_strip_mixed_path_types) { char json[2048]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"notes\",\"path\":\"notes\",\"children\":[{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"notes/alpha.md\"}]},{\"type\":\"directory\",\"name\":\"tmp\",\"path\":\"/tmp\",\"children\":[{\"type\":\"directory\",\"name\":\"dbmem\",\"path\":\"/tmp/dbmem\",\"children\":[{\"type\":\"directory\",\"name\":\"project\",\"path\":\"/tmp/dbmem/project\",\"children\":[{\"type\":\"file\",\"name\":\"readme.md\",\"path\":\"/tmp/dbmem/project/readme.md\"}]}]}]}]}"); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"notes\",\"path\":\"notes\",\"children\":[{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"notes/alpha.md\",\"indexed\":false}]},{\"type\":\"directory\",\"name\":\"tmp\",\"path\":\"/tmp\",\"children\":[{\"type\":\"directory\",\"name\":\"dbmem\",\"path\":\"/tmp/dbmem\",\"children\":[{\"type\":\"directory\",\"name\":\"project\",\"path\":\"/tmp/dbmem/project\",\"children\":[{\"type\":\"file\",\"name\":\"readme.md\",\"path\":\"/tmp/dbmem/project/readme.md\",\"indexed\":false}]}]}]}]}"); sqlite3_close(db); } @@ -2122,7 +2122,7 @@ TEST(sqlite_memory_list_files_omits_empty_paths) { char json[512]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, 
"{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"docs/alpha.md\"}]}]}"); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"file\",\"name\":\"alpha.md\",\"path\":\"docs/alpha.md\",\"indexed\":false}]}]}"); sqlite3_close(db); } @@ -2140,7 +2140,7 @@ TEST(sqlite_memory_list_files_escapes_json_strings) { char json[512]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"file\",\"name\":\"a\\\"b.md\",\"path\":\"docs/a\\\"b.md\"}]}]}"); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"docs\",\"path\":\"docs\",\"children\":[{\"type\":\"file\",\"name\":\"a\\\"b.md\",\"path\":\"docs/a\\\"b.md\",\"indexed\":false}]}]}"); sqlite3_close(db); } @@ -2177,7 +2177,36 @@ TEST(sqlite_memory_list_files_merges_directory_marker_with_children) { char json[512]; rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"dirname\",\"path\":\"dirname\",\"children\":[{\"type\":\"file\",\"name\":\"file.md\",\"path\":\"dirname/file.md\"}]}]}"); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[{\"type\":\"directory\",\"name\":\"dirname\",\"path\":\"dirname\",\"children\":[{\"type\":\"file\",\"name\":\"file.md\",\"path\":\"dirname/file.md\",\"indexed\":false}]}]}"); + + sqlite3_close(db); +} + +TEST(sqlite_memory_list_files_reports_indexed_flag) { + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + int rc = sqlite3_exec(db, + "INSERT INTO dbmem_content (hash, path, value, length, context, created_at) VALUES " + "(printf('%016x', 801), 'done.md', 
'v1', 2, NULL, 0), " + "(printf('%016x', 802), 'todo.md', 'v2', 2, NULL, 0), " + "(printf('%016x', 803), 'empty.md', '', 0, NULL, 0);", + NULL, NULL, NULL); + ASSERT_EQ(rc, SQLITE_OK); + + rc = sqlite3_exec(db, + "INSERT INTO dbmem_vault (hash, seq, embedding, offset, length, n_tokens, truncated) " + "VALUES (printf('%016x', 801), 0, zeroblob(16), 0, 2, 1, 0);", + NULL, NULL, NULL); + ASSERT_EQ(rc, SQLITE_OK); + + char json[1024]; + rc = exec_get_text(db, "SELECT memory_list_files();", json, sizeof(json)); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_STR_EQ(json, "{\"root\":\"\",\"children\":[" + "{\"type\":\"file\",\"name\":\"done.md\",\"path\":\"done.md\",\"indexed\":true}," + "{\"type\":\"file\",\"name\":\"empty.md\",\"path\":\"empty.md\",\"indexed\":true}," + "{\"type\":\"file\",\"name\":\"todo.md\",\"path\":\"todo.md\",\"indexed\":false}]}"); sqlite3_close(db); } @@ -4887,9 +4916,14 @@ TEST(sqlite_custom_provider_skips_whitespace_only_text) { ASSERT_EQ(result, 1); ASSERT_EQ(dummy_compute_calls, 0); + // no embeddings are computed: the only vault row is the zero-chunk sentinel rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault;", &result); ASSERT_EQ(rc, SQLITE_OK); - ASSERT_EQ(result, 0); + ASSERT_EQ(result, 1); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault WHERE length(embedding) = 0 AND n_tokens = 0;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 1); rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_cache;", &result); ASSERT_EQ(rc, SQLITE_OK); @@ -5169,6 +5203,243 @@ TEST(sqlite_set_model_failed_remote_switch_keeps_custom_engine) { } #endif +// ============================================================================ +// Deferred Embeddings Tests +// ============================================================================ + +TEST(sqlite_memory_defer_embeddings_stores_content_without_index) { + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + sqlite3_int64 result = -1; + int rc = exec_get_int(db, "SELECT 
memory_get_option('defer_embeddings');", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 0); + + rc = exec_get_int(db, "SELECT memory_set_option('defer_embeddings', 1);", &result); + ASSERT_EQ(rc, SQLITE_OK); + + // no model configured: a deferred add must succeed without an embedding engine + rc = exec_get_int(db, "SELECT memory_add_content('docs/deferred.md', '# Title\nDeferred body text.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 1); + + sqlite3_int64 count = -1; + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_content;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault_fts;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + rc = exec_get_int(db, "SELECT memory_pending_count();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + // embedding pending content requires a configured model + sqlite3_stmt *stmt = NULL; + rc = sqlite3_prepare_v2(db, "SELECT memory_embed_pending();", -1, &stmt, NULL); + ASSERT_EQ(rc, SQLITE_OK); + rc = sqlite3_step(stmt); + ASSERT(rc == SQLITE_ERROR); + const char *msg = sqlite3_errmsg(db); + ASSERT(strstr(msg, "no embedding model") != NULL); + sqlite3_finalize(stmt); + + sqlite3_close(db); +} + +TEST(sqlite_memory_defer_embeddings_requires_save_content) { + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + sqlite3_int64 result = 0; + int rc = exec_get_int(db, "SELECT memory_set_option('defer_embeddings', 1);", &result); + ASSERT_EQ(rc, SQLITE_OK); + rc = exec_get_int(db, "SELECT memory_set_option('save_content', 0);", &result); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_stmt *stmt = NULL; + rc = sqlite3_prepare_v2(db, "SELECT memory_add_content('docs/nosave.md', 'Body text.');", -1, &stmt, NULL); + ASSERT_EQ(rc, SQLITE_OK); + rc = sqlite3_step(stmt); + ASSERT(rc == SQLITE_ERROR); + 
const char *msg = sqlite3_errmsg(db); + ASSERT(strstr(msg, "save_content") != NULL); /* the error message of the failed statement must mention save_content */ + sqlite3_finalize(stmt); + + sqlite3_close(db); +} + +TEST(sqlite_memory_embed_pending_embeds_deferred_content_in_batches) { /* Defers three adds under the dummy provider, then drains them with memory_embed_pending(2) followed by an unbounded call, checking the returned and pending counts after each step. */ + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + dbmem_provider_t prov = { .init = dummy_init, .compute = dummy_compute, .free = dummy_free }; + int rc = sqlite3_memory_register_provider(db, "dummy", &prov); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 result = 0; + rc = exec_get_int(db, "SELECT memory_set_model('dummy', 'test-model');", &result); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT memory_set_option('defer_embeddings', 1);", &result); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT memory_add_content('docs/a.md', '# A\nAlpha body content.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + rc = exec_get_int(db, "SELECT memory_add_content('docs/b.md', '# B\nBeta body content.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + rc = exec_get_int(db, "SELECT memory_add_content('docs/c.md', '# C\nGamma body content.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 count = -1; + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); /* deferred adds must not have written any vault rows yet */ + + rc = exec_get_int(db, "SELECT memory_pending_count();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 3); + + rc = exec_get_int(db, "SELECT memory_embed_pending(2);", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 2); /* the limit of 2 caps this call, so one of the three rows stays pending */ + + rc = exec_get_int(db, "SELECT memory_pending_count();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = exec_get_int(db, "SELECT memory_embed_pending();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); /* the call without a limit picks up the single remaining row */ + + rc = exec_get_int(db, "SELECT memory_pending_count();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT(count >= 3); /* lower bound only: three files were embedded, the number of rows per file is not pinned here */ + + 
sqlite3_int64 fts_count = -1; + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault_fts;", &fts_count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(fts_count, count); /* the FTS table must hold exactly as many rows as the vault */ + + rc = exec_get_int(db, + "SELECT COUNT(*) FROM dbmem_content c WHERE NOT EXISTS (SELECT 1 FROM dbmem_vault v WHERE v.hash = c.hash);", + &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); /* no content row is left without a vault row sharing its hash */ + + rc = exec_get_int(db, "SELECT memory_embed_pending();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); /* nothing is pending any more, so a further call reports 0 rows */ + + sqlite3_close(db); +} + +TEST(sqlite_memory_zero_chunk_content_marks_processed_with_sentinel) { /* Whitespace-only content, added first directly and then deferred, must end up with a vault row that has an empty embedding and n_tokens = 0, with no FTS rows, nothing pending, and dummy_compute_calls unchanged. */ + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + dbmem_provider_t prov = { .init = dummy_init, .compute = dummy_compute, .free = dummy_free }; + int rc = sqlite3_memory_register_provider(db, "dummy", &prov); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 result = 0; + rc = exec_get_int(db, "SELECT memory_set_model('dummy', 'test-model');", &result); + ASSERT_EQ(rc, SQLITE_OK); + + int calls_before = dummy_compute_calls; /* baseline; presumably bumped by dummy_compute, which is defined outside this view - confirm */ + + // whitespace-only content parses to zero chunks: a sentinel vault row marks it processed + rc = exec_get_int(db, "SELECT memory_add_content('docs/blank.md', ' ' || char(10) || char(9) || char(10));", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 1); + + sqlite3_int64 count = -1; + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault WHERE length(embedding) = 0 AND n_tokens = 0;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault_fts;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + rc = exec_get_int(db, "SELECT memory_pending_count();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + ASSERT_EQ(dummy_compute_calls, calls_before); /* the counter must not have moved for the direct zero-chunk add */ + + // deferred zero-chunk content resolves through memory_embed_pending the same way + rc = exec_get_int(db, "SELECT memory_set_option('defer_embeddings', 1);", &result); + ASSERT_EQ(rc, SQLITE_OK); + + rc = 
exec_get_int(db, "SELECT memory_add_content('docs/blank2.md', char(10) || ' ' || char(10));", &result); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT memory_pending_count();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); /* the deferred blank add is counted as pending until memory_embed_pending runs */ + + rc = exec_get_int(db, "SELECT memory_embed_pending();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = exec_get_int(db, "SELECT memory_pending_count();", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_vault WHERE length(embedding) = 0 AND n_tokens = 0;", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 2); /* one sentinel row from the direct add plus one from the deferred add */ + + ASSERT_EQ(dummy_compute_calls, calls_before); /* still unchanged after the deferred path */ + + sqlite3_close(db); +} + +TEST(sqlite_memory_zero_chunk_first_add_does_not_persist_zero_dimension) { /* A whitespace-only first add must leave no dimension row in dbmem_settings; the following add with real text must store one. */ + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + dbmem_provider_t prov = { .init = dummy_init, .compute = dummy_compute, .free = dummy_free }; + int rc = sqlite3_memory_register_provider(db, "dummy", &prov); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 result = 0; + rc = exec_get_int(db, "SELECT memory_set_model('dummy', 'test-model');", &result); + ASSERT_EQ(rc, SQLITE_OK); + + // a zero-chunk first add must not latch dimension=0 into dbmem_settings + rc = exec_get_int(db, "SELECT memory_add_content('docs/blank.md', ' ' || char(10) || char(9));", &result); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 count = -1; + rc = exec_get_int(db, "SELECT COUNT(*) FROM dbmem_settings WHERE key = 'dimension';", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + // the first real embedding persists the provider dimension + rc = exec_get_int(db, "SELECT memory_add_content('docs/real.md', '# Title' || char(10) || 'Real body text.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 dimension = -1; + rc = exec_get_int(db, "SELECT value FROM dbmem_settings WHERE key = 'dimension';", &dimension); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(dimension, 4); /* 4 is presumably the embedding width of the dummy provider, defined outside this view - confirm */ + 
+ sqlite3_close(db); +} + #ifndef DBMEM_OMIT_LOCAL_ENGINE TEST(sqlite_local_logger_ignores_stale_user_data) { dbmem_logger(GGML_LOG_LEVEL_WARN, "ignored warning", (void *)1); @@ -5299,6 +5570,7 @@ int main(int argc, char *argv[]) { RUN_TEST(sqlite_memory_list_files_escapes_json_strings); RUN_TEST(sqlite_memory_list_files_includes_empty_directory_marker); RUN_TEST(sqlite_memory_list_files_merges_directory_marker_with_children); + RUN_TEST(sqlite_memory_list_files_reports_indexed_flag); RUN_TEST(sqlite_memory_materialize_files_creates_directories_and_files); RUN_TEST(sqlite_memory_materialize_files_creates_directory_markers); RUN_TEST(sqlite_memory_materialize_files_accepts_existing_same_content); @@ -5392,6 +5664,14 @@ int main(int argc, char *argv[]) { RUN_TEST(sqlite_custom_provider_init_error); RUN_TEST(sqlite_custom_provider_apikey_passed); RUN_TEST(sqlite_set_model_failed_reindex_preserves_existing_rows); + + printf("\nDeferred embeddings tests:\n"); + RUN_TEST(sqlite_memory_defer_embeddings_stores_content_without_index); + RUN_TEST(sqlite_memory_defer_embeddings_requires_save_content); + RUN_TEST(sqlite_memory_embed_pending_embeds_deferred_content_in_batches); + RUN_TEST(sqlite_memory_zero_chunk_content_marks_processed_with_sentinel); + RUN_TEST(sqlite_memory_zero_chunk_first_add_does_not_persist_zero_dimension); + #ifndef DBMEM_OMIT_REMOTE_ENGINE RUN_TEST(sqlite_set_model_releases_previous_engine_on_class_switch); #else