diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a2e7ac1..48fda1f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +#### Tokenization +- **Tokenize Endpoints** ([#329](https://github.com/weaviate/csharp-client/pull/329)): Expose the `POST /v1/tokenize` and `POST /v1/schema/{class}/properties/{prop}/tokenize` endpoints introduced in Weaviate 1.37.0. Inspect how text is tokenized for a given method and analyzer configuration, or how a specific collection property would tokenize it. Access via `client.Tokenize.Text(...)` and `collection.Tokenize.Property(...)`. `AsciiFoldConfig` is modeled as a nullable record so the invalid "ignore without fold" state is unrepresentable. See [TOKENIZE_API_USAGE.md](docs/TOKENIZE_API_USAGE.md). Requires Weaviate ≥ 1.37.0. +- **Property-Level `TextAnalyzerConfig`** ([#329](https://github.com/weaviate/csharp-client/pull/329)): `Property.TextAnalyzer` (also applies to nested properties) lets a collection schema pin ASCII folding and/or a stopword preset per property at index time. The same `TextAnalyzerConfig` record is reused from the `Tokenize` endpoint so tokenize-at-query and index-at-insert stay aligned. A preflight version check on `CollectionsClient.Create` raises `WeaviateVersionMismatchException` when the server is older than 1.37.0. Requires Weaviate ≥ 1.37.0. +- **Collection-Level `StopwordPresets`** ([#329](https://github.com/weaviate/csharp-client/pull/329)): `InvertedIndexConfig.StopwordPresets` and `InvertedIndexConfigUpdate.StopwordPresets` define named preset name → word-list maps on the inverted-index config. Properties reference these presets via `TextAnalyzer.StopwordPreset`. Preset changes flow through `CollectionClient.Config.Update(c => c.InvertedIndexConfig.StopwordPresets = ...)`. Requires Weaviate ≥ 1.37.0. 
--- diff --git a/README.md b/README.md index aa3f4893..30232f9c 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,7 @@ For more detailed information on specific features, please refer to the official - **[Backup API Usage](docs/BACKUP_API_USAGE.md)**: Creating and restoring backups - **[Nodes API Usage](docs/NODES_API_USAGE.md)**: Querying cluster node information - **[Aggregate Result Accessors](docs/AGGREGATE_RESULT_ACCESSORS.md)**: Type-safe access to aggregation results +- **[Tokenize API Usage](docs/TOKENIZE_API_USAGE.md)**: Inspect how text is tokenized with a given method or for a specific collection property. Requires Weaviate ≥ 1.37.0. - **[Microsoft.Extensions.VectorData Integration](docs/VECTORDATA.md)**: Standard .NET vector store abstraction support --- diff --git a/docs/TOKENIZE_API_USAGE.md b/docs/TOKENIZE_API_USAGE.md new file mode 100644 index 00000000..08bcb8a8 --- /dev/null +++ b/docs/TOKENIZE_API_USAGE.md @@ -0,0 +1,352 @@ +# Tokenize API Usage Guide + +> **Version Requirement:** +> The tokenize endpoints require Weaviate **v1.37.0** or newer. Calls against earlier versions throw `WeaviateVersionMismatchException`. + +This guide covers the Weaviate C# client's tokenize API — a pair of endpoints that let you inspect how the server would tokenize a piece of text, either with an ad-hoc tokenization strategy or using the one already configured on a collection property. 
+ +## Table of Contents + +- [Overview](#overview) +- [Tokenization Methods](#tokenization-methods) +- [Ad-hoc Tokenization (`client.Tokenize.Text`)](#ad-hoc-tokenization-clienttokenizetext) +- [Property-scoped Tokenization (`collection.Tokenize.Property`)](#property-scoped-tokenization-collectiontokenizeproperty) +- [Analyzer Configuration](#analyzer-configuration) +- [Stopwords](#stopwords) +- [Result Shape](#result-shape) +- [Property-level Text Analyzer (schema)](#property-level-text-analyzer-schema) +- [Collection-level Stopword Presets (schema)](#collection-level-stopword-presets-schema) +- [Common Patterns](#common-patterns) + +## Overview + +The tokenize API exposes two REST endpoints: + +| Method | Endpoint | Use when… | +|---|---|---| +| `client.Tokenize.Text(...)` | `POST /v1/tokenize` | You want to preview tokenization for arbitrary text with any method/config — no collection required. | +| `collection.Tokenize.Property(...)` | `POST /v1/schema/{class}/properties/{prop}/tokenize` | You want to tokenize text *exactly as it would be indexed* by a specific property of an existing collection. | + +Both return a `TokenizeResult` containing two token lists: + +- **`Indexed`** — tokens as they are stored in the inverted index. +- **`Query`** — tokens as they are used for query matching (after stopword removal, etc.). + +These differ when stopwords are configured: a stopword like `"the"` is still indexed (so `BM25` can count it), but dropped from `Query` so it doesn't inflate match scores. 
+ +## Tokenization Methods + +The `PropertyTokenization` enum covers all nine server-supported strategies: + +| Method | Input | Output (`Indexed`) | +|---|---|---| +| `Word` | `"The quick brown fox"` | `["the", "quick", "brown", "fox"]` | +| `Lowercase` | `"Hello World Test"` | `["hello", "world", "test"]` | +| `Whitespace` | `"Hello World Test"` | `["Hello", "World", "Test"]` | +| `Field` | `" Hello World "` | `["Hello World"]` *(entire field, trimmed)* | +| `Trigram` | `"Hello"` | `["hel", "ell", "llo"]` | +| `Gse` | Chinese/Japanese | Requires `ENABLE_TOKENIZER_GSE=true` on the server | +| `GseCh` | Chinese-only GSE | Requires `ENABLE_TOKENIZER_GSE_CH=true` | +| `KagomeJa` | Japanese | Requires `ENABLE_TOKENIZER_KAGOME_JA=true` | +| `KagomeKr` | Korean | Requires `ENABLE_TOKENIZER_KAGOME_KR=true` | + +## Ad-hoc Tokenization (`client.Tokenize.Text`) + +The simplest call takes only a text and a tokenization method: + +```csharp +using Weaviate.Client.Models; + +var result = await client.Tokenize.Text( + text: "The quick brown fox", + tokenization: PropertyTokenization.Word +); + +Console.WriteLine(string.Join(", ", result.Indexed)); +// the, quick, brown, fox +``` + +Signature: + +```csharp +Task<TokenizeResult> Tokenize.Text( + string text, + PropertyTokenization tokenization, + TextAnalyzerConfig? analyzerConfig = null, + IDictionary<string, StopwordConfig>? 
stopwordPresets = null, + CancellationToken cancellationToken = default +); +``` + +## Property-scoped Tokenization (`collection.Tokenize.Property`) + +When you want to see how a specific property would tokenize text — using that property's configured tokenization — use the collection-scoped variant: + +```csharp +var collection = await client.Collections.Get("Article"); + +var result = await collection.Tokenize.Property( + propertyName: "title", + text: " Hello World " +); + +Console.WriteLine(result.Tokenization); // Field (whatever the property is configured with) +Console.WriteLine(string.Join(", ", result.Indexed)); // Hello World +``` + +The server uses the property's configured tokenization method and any analyzer config attached to the property — you don't pass either yourself. + +## Analyzer Configuration + +`TextAnalyzerConfig` controls two optional analyzer stages: **ASCII folding** and **stopword removal**. + +### ASCII Folding + +`AsciiFoldConfig` is a nullable record — `null` means folding is disabled, non-`null` means it's enabled. The `Ignore` list lets you exempt specific characters from folding. + +```csharp +var cfg = new TextAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(), // folding enabled, nothing ignored +}; + +var result = await client.Tokenize.Text( + "L'école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg +); +// result.Indexed == ["l", "ecole", "est", "fermee"] +``` + +Ignore a specific character: + +```csharp +var cfg = new TextAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), +}; + +var result = await client.Tokenize.Text( + "L'école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg +); +// result.Indexed == ["l", "école", "est", "fermée"] +``` + +> **Tip:** Modeling `AsciiFold` as a nullable record makes the "ignore without fold" state unrepresentable — you can't accidentally pass `Ignore` without enabling folding. 
+ +### Built-in stopword presets + +Use a built-in preset (`"en"`, `"none"`) via the `StopwordPreset` field: + +```csharp +var cfg = new TextAnalyzerConfig { StopwordPreset = "en" }; + +var result = await client.Tokenize.Text( + "The quick brown fox", + PropertyTokenization.Word, + analyzerConfig: cfg +); + +// result.Indexed → ["the", "quick", "brown", "fox"] (all tokens kept in index) +// result.Query → ["quick", "brown", "fox"] ("the" removed for queries) +``` + +## Stopwords + +For more control, define a named preset via the `stopwordPresets` dictionary and reference it from `StopwordPreset`. + +### Add words to a preset + +```csharp +var cfg = new TextAnalyzerConfig { StopwordPreset = "custom" }; + +var presets = new Dictionary<string, StopwordConfig> +{ + ["custom"] = new StopwordConfig + { + Preset = StopwordConfig.Presets.None, + Additions = ["test"], + }, +}; + +var result = await client.Tokenize.Text( + "hello world test", + PropertyTokenization.Word, + analyzerConfig: cfg, + stopwordPresets: presets +); + +// result.Indexed → ["hello", "world", "test"] +// result.Query → ["hello", "world"] ("test" dropped) +``` + +### Start from a base preset and remove words + +```csharp +var cfg = new TextAnalyzerConfig { StopwordPreset = "en-no-the" }; + +var presets = new Dictionary<string, StopwordConfig> +{ + ["en-no-the"] = new StopwordConfig + { + Preset = StopwordConfig.Presets.EN, + Removals = ["the"], + }, +}; + +var result = await client.Tokenize.Text( + "the quick", + PropertyTokenization.Word, + analyzerConfig: cfg, + stopwordPresets: presets +); + +// "the" is no longer a stopword in this preset, so it survives in both lists. 
+``` + +### Combining folding and stopwords + +```csharp +var cfg = new TextAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + StopwordPreset = "en", +}; + +var result = await client.Tokenize.Text( + "The école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg +); + +// result.Indexed → ["the", "école", "est", "fermée"] ("é" is exempt from folding) +// result.Query → ["école", "est", "fermée"] ("the" dropped) +``` + +## Result Shape + +`TokenizeResult` is a sealed record: + +| Member | Type | Description | +|---|---|---| +| `Tokenization` | `PropertyTokenization` | The method that was applied. | +| `Indexed` | `ImmutableList<string>` | Tokens as stored in the inverted index. | +| `Query` | `ImmutableList<string>` | Tokens used at query time (after stopword removal). | +| `AnalyzerConfig` | `TextAnalyzerConfig?` | Echo of the analyzer config that was applied, or `null`. | +| `StopwordConfig` | `StopwordConfig?` | Echo of the resolved stopword config, or `null`. | + +The `AnalyzerConfig` echo is the server's view of what was applied — useful for verifying that your config was parsed correctly. The round-trip also normalizes wire-format quirks (the server represents `asciiFold` as a `bool` + separate `asciiFoldIgnore[]`, but the client unwraps it back into the nested `AsciiFoldConfig` record). + +## Property-level Text Analyzer (schema) + +Beyond the ad-hoc tokenize endpoint, Weaviate 1.37.0 also lets you pin analyzer options directly on a property at **collection-creation time**. The same `TextAnalyzerConfig` record is reused: whatever you would pass to `client.Tokenize.Text(...)` can also be attached to a property so every value indexed through that property gets the same treatment. 
+ +```csharp +await client.Collections.Create(new CollectionCreateParams +{ + Name = "Article", + Properties = + [ + new Property + { + Name = "title", + DataType = [DataType.Text], + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig + { + AsciiFold = new AsciiFoldConfig(), + StopwordPreset = "en", + }, + }, + ], +}); +``` + +Nested properties (object / object-array) accept `TextAnalyzer` too — they are `Property` records themselves, so the same field is available at every depth. + +> **Version requirement:** `Property.TextAnalyzer` is only wired up for servers at Weaviate ≥ 1.37.0. `CollectionsClient.Create` performs a preflight version check and throws `WeaviateVersionMismatchException` if the connected server is older, before the schema request is sent. + +## Collection-level Stopword Presets (schema) + +Named stopword lists live on the collection's inverted-index config. A preset is a `preset-name → word-list` pair; properties reference one by name via `TextAnalyzer.StopwordPreset`. + +```csharp +await client.Collections.Create(new CollectionCreateParams +{ + Name = "Article", + InvertedIndexConfig = new InvertedIndexConfig + { + StopwordPresets = new Dictionary<string, IList<string>> + { + ["fr"] = new[] { "le", "la", "les" }, + ["custom_en"] = new[] { "foo", "bar" }, + }, + }, + Properties = + [ + new Property + { + Name = "body", + DataType = [DataType.Text], + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" }, + }, + ], +}); +``` + +Updating presets on an existing collection goes through the normal update path: + +```csharp +await collection.Config.Update(c => +{ + c.InvertedIndexConfig.StopwordPresets = new Dictionary<string, IList<string>> + { + ["fr"] = new[] { "le", "la", "les", "un", "une" }, + }; +}); +``` + +Setting `StopwordPresets` replaces the whole preset map on the server. 
The server rejects removing a preset that is still referenced by a property's `TextAnalyzer.StopwordPreset` — keep preset removals and property-config changes in the same update, or unwire the property first. + +> **Version requirement:** Requires Weaviate ≥ 1.37.0. The preflight in `CollectionsClient.Create` also trips on `InvertedIndexConfig.StopwordPresets` before contacting the server. + +## Common Patterns + +### Previewing a query + +Use `collection.Tokenize.Property` to see exactly what tokens the server will match your search against: + +```csharp +var tokens = (await collection.Tokenize.Property("title", userQuery)).Query; +// Show tokens in the UI as "searching for: X, Y, Z" +``` + +### Debugging a BM25 miss + +If a search misses a term you expected, tokenize both the query and a sample document with the same property: + +```csharp +var queryTokens = (await collection.Tokenize.Property("body", "running")).Query; +var docTokens = (await collection.Tokenize.Property("body", "I was running")).Indexed; + +// If the sets don't intersect, BM25 can't match — check for stemming / stopwords. 
+``` + +### Verifying analyzer config round-trip + +When you configure ASCII folding or a stopword preset, the server echoes back its interpretation on every call: + +```csharp +var cfg = new TextAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + StopwordPreset = "en", +}; + +var result = await client.Tokenize.Text("L'école", PropertyTokenization.Word, analyzerConfig: cfg); + +Debug.Assert(result.AnalyzerConfig!.AsciiFold!.Ignore!.SequenceEqual(new[] { "é" })); +Debug.Assert(result.AnalyzerConfig.StopwordPreset == "en"); +``` diff --git a/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs b/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs new file mode 100644 index 00000000..6619eb22 --- /dev/null +++ b/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs @@ -0,0 +1,462 @@ +using Weaviate.Client.Models; + +namespace Weaviate.Client.Tests.Integration; + +/// +/// Integration tests for property-level textAnalyzer configuration and +/// collection-level stopwordPresets. Requires Weaviate ≥ 1.37.0. +/// Ports integration/test_collection_config.py from weaviate-python-client PR #2006. 
+///
+[Collection("TestCollectionTextAnalyzer")]
+public class TestCollectionTextAnalyzer : IntegrationTests
+{
+    private const string MinVersion = "1.37.0";
+
+    // -----------------------------------------------------------------------
+    // Collection-level stopwordPresets
+    // -----------------------------------------------------------------------
+
+    [Fact]
+    public async Task StopwordPresets_AppliedAndRoundTripped()
+    {
+        RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title_fr",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" },
+                },
+                new Property
+                {
+                    Name = "title_en",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "en" },
+                },
+                new Property
+                {
+                    Name = "plain",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                },
+            ],
+            invertedIndexConfig: new()
+            {
+                StopwordPresets = new Dictionary<string, IList<string>> // NOTE(review): type arguments were lost in extraction; IList<string> is inferred — confirm against InvertedIndexConfig.StopwordPresets
+                {
+                    ["fr"] = new List<string> { "le", "la", "les" },
+                },
+            }
+        );
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+
+        Assert.NotNull(config.InvertedIndexConfig);
+        Assert.NotNull(config.InvertedIndexConfig!.StopwordPresets);
+        Assert.Equal(
+            new[] { "le", "la", "les" },
+            config.InvertedIndexConfig.StopwordPresets!["fr"]
+        );
+
+        var titleFr = config.Properties.Single(p => p.Name == "title_fr");
+        var titleEn = config.Properties.Single(p => p.Name == "title_en");
+        var plain = config.Properties.Single(p => p.Name == "plain");
+
+        Assert.NotNull(titleFr.TextAnalyzer);
+        Assert.Equal("fr", titleFr.TextAnalyzer!.StopwordPreset);
+        Assert.NotNull(titleEn.TextAnalyzer);
+        Assert.Equal("en", titleEn.TextAnalyzer!.StopwordPreset);
+        Assert.Null(plain.TextAnalyzer);
+    }
+
+    [Fact]
+    public async Task StopwordPresets_Update_ReplacesPreset()
+    {
+        RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title_fr",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" },
+                },
+            ],
+            invertedIndexConfig: new()
+            {
+                StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["fr"] = new List<string> { "le" },
+                },
+            }
+        );
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        Assert.Equal(new[] { "le" }, config.InvertedIndexConfig!.StopwordPresets!["fr"]);
+
+        await collection.Config.Update(
+            c =>
+            {
+                c.InvertedIndexConfig.StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["fr"] = new List<string> { "la" },
+                };
+            },
+            TestContext.Current.CancellationToken
+        );
+
+        config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        Assert.Equal(new[] { "la" }, config.InvertedIndexConfig!.StopwordPresets!["fr"]);
+    }
+
+    [Fact]
+    public async Task StopwordPresets_RemoveInUse_RejectedByServer()
+    {
+        RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title_fr",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" },
+                },
+            ],
+            invertedIndexConfig: new()
+            {
+                StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["fr"] = new List<string> { "le", "la", "les" },
+                },
+            }
+        );
+
+        await Assert.ThrowsAnyAsync<Exception>(async () => // NOTE(review): exception type argument was lost in extraction; Exception is the widest safe choice — narrow to the client's server-error type once confirmed
+        {
+            await collection.Config.Update(
+                c =>
+                {
+                    c.InvertedIndexConfig.StopwordPresets = new Dictionary<string, IList<string>>();
+                },
+                TestContext.Current.CancellationToken
+            );
+        });
+
+        // The original preset must survive the rejected update.
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        Assert.Equal(
+            new[] { "le", "la", "les" },
+            config.InvertedIndexConfig!.StopwordPresets!["fr"]
+        );
+    }
+
+    [Fact]
+    public async Task StopwordPresets_RemoveUnused_Allowed()
+    {
+        RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" },
+                },
+            ],
+            invertedIndexConfig: new()
+            {
+                StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["fr"] = new List<string> { "le", "la", "les" },
+                    ["es"] = new List<string> { "el", "la", "los" },
+                },
+            }
+        );
+
+        // Drop only 'es' (unused). 'fr' is still referenced by title.
+        await collection.Config.Update(
+            c =>
+            {
+                c.InvertedIndexConfig.StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["fr"] = new List<string> { "le", "la", "les" },
+                };
+            },
+            TestContext.Current.CancellationToken
+        );
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        Assert.Equal(
+            new[] { "le", "la", "les" },
+            config.InvertedIndexConfig!.StopwordPresets!["fr"]
+        );
+        Assert.False(config.InvertedIndexConfig.StopwordPresets.ContainsKey("es"));
+    }
+
+    [Fact]
+    public async Task StopwordPresets_RemoveReferencedByNested_RejectedByServer()
+    {
+        RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "doc",
+                    DataType = DataType.Object,
+                    NestedProperties =
+                    [
+                        new Property
+                        {
+                            Name = "body",
+                            DataType = DataType.Text,
+                            PropertyTokenization = PropertyTokenization.Word,
+                            TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" },
+                        },
+                    ],
+                },
+            ],
+            invertedIndexConfig: new()
+            {
+                StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["fr"] = new List<string> { "le", "la", "les" },
+                },
+            }
+        );
+
+        await Assert.ThrowsAnyAsync<Exception>(async () => // NOTE(review): exception type argument inferred; see the note on the in-use removal test
+        {
+            await collection.Config.Update(
+                c =>
+                {
+                    c.InvertedIndexConfig.StopwordPresets = new Dictionary<string, IList<string>>();
+                },
+                TestContext.Current.CancellationToken
+            );
+        });
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        Assert.Equal(
+            new[] { "le", "la", "les" },
+            config.InvertedIndexConfig!.StopwordPresets!["fr"]
+        );
+    }
+
+    [Fact]
+    public async Task UserDefinedStopwordPreset_OverridesBuiltin()
+    {
+        RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "en" },
+                },
+            ],
+            invertedIndexConfig: new()
+            {
+                StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["en"] = new List<string> { "hello" },
+                },
+            }
+        );
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        Assert.Equal(new[] { "hello" }, config.InvertedIndexConfig!.StopwordPresets!["en"]);
+
+        var title = config.Properties.Single(p => p.Name == "title");
+        Assert.NotNull(title.TextAnalyzer);
+        Assert.Equal("en", title.TextAnalyzer!.StopwordPreset);
+    }
+
+    // -----------------------------------------------------------------------
+    // Property-level TextAnalyzer
+    // -----------------------------------------------------------------------
+
+    [Fact]
+    public async Task TextAnalyzer_CombinedAsciiFoldAndStopwordPreset()
+    {
+        RequireVersion(MinVersion, message: "textAnalyzer requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig
+                    {
+                        AsciiFold = new AsciiFoldConfig(),
+                        StopwordPreset = "en",
+                    },
+                },
+            ]
+        );
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        var title = config.Properties.Single(p => p.Name == "title");
+
+        Assert.NotNull(title.TextAnalyzer);
+        Assert.NotNull(title.TextAnalyzer!.AsciiFold);
+        Assert.Equal("en", title.TextAnalyzer.StopwordPreset);
+    }
+
+    [Fact]
+    public async Task TextAnalyzer_AsciiFoldIgnore_RoundTrips()
+    {
+        RequireVersion(MinVersion, message: "textAnalyzer requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig
+                    {
+                        AsciiFold = new AsciiFoldConfig(Ignore: ["é"]),
+                    },
+                },
+            ]
+        );
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+        var title = config.Properties.Single(p => p.Name == "title");
+
+        Assert.NotNull(title.TextAnalyzer);
+        Assert.NotNull(title.TextAnalyzer!.AsciiFold);
+        Assert.Equal(new[] { "é" }, title.TextAnalyzer.AsciiFold!.Ignore);
+    }
+
+    [Fact]
+    public async Task TextAnalyzer_FullRoundTrip_FromDictStyleConfig()
+    {
+        RequireVersion(MinVersion, message: "textAnalyzer requires Weaviate >= 1.37.0");
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "title",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Word,
+                    TextAnalyzer = new TextAnalyzerConfig
+                    {
+                        AsciiFold = new AsciiFoldConfig(Ignore: ["é"]),
+                        StopwordPreset = "fr",
+                    },
+                },
+            ],
+            invertedIndexConfig: new()
+            {
+                Stopwords = new StopwordConfig
+                {
+                    Preset = StopwordConfig.Presets.EN,
+                    Additions = ["a"],
+                    Removals = ["the"],
+                },
+                StopwordPresets = new Dictionary<string, IList<string>>
+                {
+                    ["fr"] = new List<string> { "le", "la", "les" },
+                },
+            }
+        );
+
+        var config = await collection.Config.Get(TestContext.Current.CancellationToken);
+
+        Assert.Equal(StopwordConfig.Presets.EN, config.InvertedIndexConfig!.Stopwords!.Preset);
+        Assert.Equal(new[] { "the" }, config.InvertedIndexConfig.Stopwords.Removals);
+        Assert.Equal(
+            new[] { "le", "la", "les" },
+            config.InvertedIndexConfig.StopwordPresets!["fr"]
+        );
+
+        var title = config.Properties.Single(p => p.Name == "title");
+        Assert.NotNull(title.TextAnalyzer);
+        Assert.Equal("fr", title.TextAnalyzer!.StopwordPreset);
+        Assert.NotNull(title.TextAnalyzer.AsciiFold);
+        Assert.Equal(new[] { "é" }, title.TextAnalyzer.AsciiFold!.Ignore);
+    }
+
+    // -----------------------------------------------------------------------
+    // Version-gate
+    // -----------------------------------------------------------------------
+
+    [Fact]
+    public async Task Property_TextAnalyzer_RaisesOnOldServer()
+    {
+        if (ServerVersionIsInRange(MinVersion))
+        {
+            Assert.Skip(
+                $"Version gate only applies to Weaviate < {MinVersion}. Current: {_weaviate.WeaviateVersion}"
+            );
+        }
+
+        await Assert.ThrowsAsync<WeaviateVersionMismatchException>(async () =>
+        {
+            await CollectionFactory(
+                properties:
+                [
+                    new Property
+                    {
+                        Name = "title",
+                        DataType = DataType.Text,
+                        PropertyTokenization = PropertyTokenization.Word,
+                        TextAnalyzer = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig() },
+                    },
+                ]
+            );
+        });
+    }
+
+    [Fact]
+    public async Task InvertedIndexConfig_StopwordPresets_RaisesOnOldServer()
+    {
+        if (ServerVersionIsInRange(MinVersion))
+        {
+            Assert.Skip(
+                $"Version gate only applies to Weaviate < {MinVersion}. 
Current: {_weaviate.WeaviateVersion}"
+            );
+        }
+
+        await Assert.ThrowsAsync<WeaviateVersionMismatchException>(async () =>
+        {
+            await CollectionFactory(
+                invertedIndexConfig: new()
+                {
+                    StopwordPresets = new Dictionary<string, IList<string>> // NOTE(review): type arguments were lost in extraction; IList<string> is inferred — confirm against InvertedIndexConfig.StopwordPresets
+                    {
+                        ["fr"] = new List<string> { "le", "la" },
+                    },
+                }
+            );
+        });
+    }
+}
diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
new file mode 100644
index 00000000..8c8b30a8
--- /dev/null
+++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs
@@ -0,0 +1,285 @@
+using System.Collections.Immutable;
+using Weaviate.Client.Models;
+
+namespace Weaviate.Client.Tests.Integration;
+
+/// <summary>
+/// Integration tests for the /v1/tokenize and
+/// /v1/schema/{className}/properties/{propertyName}/tokenize endpoints.
+/// Requires Weaviate server version 1.37.0 or later.
+/// </summary>
+[Collection("TestTokenize")]
+public class TestTokenize : IntegrationTests
+{
+    // -----------------------------------------------------------------------
+    // Serialization
+    // -----------------------------------------------------------------------
+
+    public static TheoryData<PropertyTokenization, string, string[]> TokenizationCases =>
+        new()
+        {
+            {
+                PropertyTokenization.Word,
+                "The quick brown fox",
+                new[] { "the", "quick", "brown", "fox" }
+            },
+            {
+                PropertyTokenization.Lowercase,
+                "Hello World Test",
+                new[] { "hello", "world", "test" }
+            },
+            {
+                PropertyTokenization.Whitespace,
+                "Hello World Test",
+                new[] { "Hello", "World", "Test" }
+            },
+            { PropertyTokenization.Field, " Hello World ", new[] { "Hello World" } },
+            { PropertyTokenization.Trigram, "Hello", new[] { "hel", "ell", "llo" } },
+        };
+
+    [Theory]
+    [MemberData(nameof(TokenizationCases))]
+    public async Task Tokenization_Enum(
+        PropertyTokenization tokenization,
+        string text,
+        string[] expectedTokens
+    )
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var result = await _weaviate.Tokenize.Text(
+            text,
+            tokenization,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(tokenization, result.Tokenization);
+        Assert.Equal(expectedTokens, result.Indexed);
+        Assert.Equal(expectedTokens, result.Query);
+    }
+
+    [Fact]
+    public async Task NoAnalyzerConfig()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var result = await _weaviate.Tokenize.Text(
+            "hello world",
+            PropertyTokenization.Word,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(PropertyTokenization.Word, result.Tokenization);
+        Assert.Equal(new[] { "hello", "world" }, result.Indexed);
+        Assert.Null(result.AnalyzerConfig);
+    }
+
+    [Fact]
+    public async Task AsciiFold()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig() };
+        var result = await _weaviate.Tokenize.Text(
+            "L'école est fermée",
+            PropertyTokenization.Word,
+            analyzerConfig: cfg,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(new[] { "l", "ecole", "est", "fermee" }, result.Indexed);
+    }
+
+    [Fact]
+    public async Task AsciiFold_WithIgnore()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]) };
+        var result = await _weaviate.Tokenize.Text(
+            "L'école est fermée",
+            PropertyTokenization.Word,
+            analyzerConfig: cfg,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(new[] { "l", "école", "est", "fermée" }, result.Indexed);
+    }
+
+    [Fact]
+    public async Task StopwordPreset_String()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var cfg = new TextAnalyzerConfig { StopwordPreset = "en" };
+        var result = await _weaviate.Tokenize.Text(
+            "The quick brown fox",
+            PropertyTokenization.Word,
+            analyzerConfig: cfg,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.DoesNotContain("the", result.Query);
+        Assert.Contains("quick", result.Query);
+    }
+
+    [Fact]
+    public async Task Combined_AsciiFold_Stopwords()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var cfg = new TextAnalyzerConfig
+        {
+            AsciiFold = new AsciiFoldConfig(Ignore: ["é"]),
+            StopwordPreset = "en",
+        };
+        var result = await _weaviate.Tokenize.Text(
+            "The école est fermée",
+            PropertyTokenization.Word,
+            analyzerConfig: cfg,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(new[] { "the", "école", "est", "fermée" }, result.Indexed);
+        Assert.DoesNotContain("the", result.Query);
+        Assert.Contains("école", result.Query);
+    }
+
+    [Fact]
+    public async Task CustomPreset_Additions()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var cfg = new TextAnalyzerConfig { StopwordPreset = "custom" };
+        var presets = new Dictionary<string, StopwordConfig>
+        {
+            ["custom"] = new StopwordConfig
+            {
+                Preset = StopwordConfig.Presets.None,
+                Additions = ["test"],
+            },
+        };
+
+        var result = await _weaviate.Tokenize.Text(
+            "hello world test",
+            PropertyTokenization.Word,
+            analyzerConfig: cfg,
+            stopwordPresets: presets,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(new[] { "hello", "world", "test" }, result.Indexed);
+        Assert.Equal(new[] { "hello", "world" }, result.Query);
+    }
+
+    [Fact]
+    public async Task CustomPreset_BaseAndRemovals()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var cfg = new TextAnalyzerConfig { StopwordPreset = "en-no-the" };
+        var presets = new Dictionary<string, StopwordConfig>
+        {
+            ["en-no-the"] = new StopwordConfig
+            {
+                Preset = StopwordConfig.Presets.EN,
+                Removals = ["the"],
+            },
+        };
+
+        var result = await _weaviate.Tokenize.Text(
+            "the quick",
+            PropertyTokenization.Word,
+            analyzerConfig: cfg,
+            stopwordPresets: presets,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(new[] { "the", "quick" }, result.Indexed);
+        Assert.Equal(new[] { "the", "quick" }, result.Query);
+    }
+
+    // -----------------------------------------------------------------------
+    // Deserialization
+    // -----------------------------------------------------------------------
+
+    [Fact]
+    public async Task Result_Types()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var result = await _weaviate.Tokenize.Text(
+            "hello",
+            PropertyTokenization.Word,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.IsType<TokenizeResult>(result);
+        Assert.IsType<ImmutableList<string>>(result.Indexed);
+        Assert.IsType<ImmutableList<string>>(result.Query);
+    }
+
+    [Fact]
+    public async Task AnalyzerConfig_Echoed()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var cfg = new TextAnalyzerConfig
+        {
+            AsciiFold = new AsciiFoldConfig(Ignore: ["é"]),
+            StopwordPreset = "en",
+        };
+        var result = await _weaviate.Tokenize.Text(
+            "L'école",
+            PropertyTokenization.Word,
+            analyzerConfig: cfg,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.NotNull(result.AnalyzerConfig);
+        Assert.NotNull(result.AnalyzerConfig!.AsciiFold);
+        Assert.Equal(new[] { "é" }, result.AnalyzerConfig.AsciiFold!.Ignore);
+        Assert.Equal("en", result.AnalyzerConfig.StopwordPreset);
+    }
+
+    [Fact]
+    public async Task AnalyzerConfig_None()
+    {
+        RequireVersion(nameof(TokenizeClient.Text));
+
+        var result = await _weaviate.Tokenize.Text(
+            "hello",
+            PropertyTokenization.Word,
+            cancellationToken: TestContext.Current.CancellationToken
+        );
+
+        Assert.Null(result.AnalyzerConfig);
+    }
+
+    [Fact]
+    public async Task PropertyTokenize_Field()
+    {
+        RequireVersion(nameof(CollectionTokenizeClient.Property));
+
+        var collection = await CollectionFactory(
+            properties:
+            [
+                new Property
+                {
+                    Name = "tag",
+                    DataType = DataType.Text,
+                    PropertyTokenization = PropertyTokenization.Field,
+                },
+            ]
+        );
+
+        var result = await collection.Tokenize.Property(
+            "tag",
+            " Hello World ",
+            TestContext.Current.CancellationToken
+        );
+
+        Assert.Equal(PropertyTokenization.Field, result.Tokenization);
+        Assert.Equal(new[] { "Hello World" }, result.Indexed);
+    }
+}
diff --git a/src/Weaviate.Client/CollectionClient.cs 
b/src/Weaviate.Client/CollectionClient.cs index 18294bdc..afcfc02e 100644 --- a/src/Weaviate.Client/CollectionClient.cs +++ b/src/Weaviate.Client/CollectionClient.cs @@ -232,4 +232,10 @@ public CollectionClient WithConsistencyLevel(ConsistencyLevels consistencyLevel) /// Gets the configuration client for managing collection configuration. /// public CollectionConfigClient Config => new(Client, Name); + + /// + /// Gets the tokenize client for inspecting how text is tokenized by + /// properties of this collection. Requires Weaviate server version 1.37.0 or later. + /// + public CollectionTokenizeClient Tokenize => new(Client, Name); } diff --git a/src/Weaviate.Client/CollectionTokenizeClient.cs b/src/Weaviate.Client/CollectionTokenizeClient.cs new file mode 100644 index 00000000..39525fac --- /dev/null +++ b/src/Weaviate.Client/CollectionTokenizeClient.cs @@ -0,0 +1,58 @@ +using Weaviate.Client.Models; + +namespace Weaviate.Client; + +/// +/// Exposes the per-property /v1/schema/{className}/properties/{propertyName}/tokenize +/// endpoint for a specific collection. Requires Weaviate server version 1.37.0 or later. +/// +public sealed class CollectionTokenizeClient +{ + private readonly WeaviateClient _client; + private readonly string _collectionName; + + internal CollectionTokenizeClient(WeaviateClient client, string collectionName) + { + _client = client; + _collectionName = collectionName; + } + + /// + /// Tokenizes <paramref name="text"/> using the tokenization method configured on the + /// <paramref name="propertyName"/> property of this collection. + /// + /// The name of the property whose tokenization to apply. + /// The text to tokenize. + /// Cancellation token. + /// The tokenization result. + /// + /// Thrown when the connected server version is below 1.37.0.
+ /// + [RequiresWeaviateVersion(1, 37, 0)] + public async Task Property( + string propertyName, + string text, + CancellationToken cancellationToken = default + ) + { + ArgumentException.ThrowIfNullOrEmpty(propertyName); + ArgumentNullException.ThrowIfNull(text); + + await _client.EnsureVersion(); + + var response = + await _client.RestClient.TokenizeProperty( + _collectionName, + propertyName, + new Rest.Dto.PropertyTokenizeRequest { Text = text }, + cancellationToken + ) + ?? throw new WeaviateClientException( + new InvalidOperationException( + "Tokenize property endpoint returned an empty response." + ) + ); + + return response.ToModel(); + } +} diff --git a/src/Weaviate.Client/CollectionsClient.cs b/src/Weaviate.Client/CollectionsClient.cs index faa1ec6a..063cbc91 100644 --- a/src/Weaviate.Client/CollectionsClient.cs +++ b/src/Weaviate.Client/CollectionsClient.cs @@ -197,6 +197,8 @@ public async Task Create( { ArgumentNullException.ThrowIfNull(collection); + await EnsureTextAnalyzerFeaturesSupported(collection); + var config = CollectionConfig.FromCollectionCreate(collection); var jsonString = JsonSerializer.Serialize( @@ -207,6 +209,59 @@ public async Task Create( return await CreateFromJson(jsonString, cancellationToken); } + private static readonly Version TextAnalyzerMinimumVersion = new(1, 37, 0); + + private async Task EnsureTextAnalyzerFeaturesSupported(CollectionCreateParams collection) + { + string? feature = DetectTextAnalyzerFeature(collection); + if (feature is null) + return; + + await _client.EnsureInitializedAsync(); + + var serverVersion = _client.WeaviateVersion; + if (serverVersion is null) + return; + + if (serverVersion < TextAnalyzerMinimumVersion) + { + throw new WeaviateVersionMismatchException( + feature, + TextAnalyzerMinimumVersion, + serverVersion + ); + } + } + + private static string? 
DetectTextAnalyzerFeature(CollectionCreateParams collection) + { + if (collection.InvertedIndexConfig?.StopwordPresets is { Count: > 0 }) + return "InvertedIndexConfig.StopwordPresets"; + + foreach (var property in collection.Properties) + { + if (PropertyUsesTextAnalyzer(property)) + return "Property.TextAnalyzer"; + } + + return null; + } + + private static bool PropertyUsesTextAnalyzer(Property property) + { + if (property.TextAnalyzer is not null) + return true; + if (property.NestedProperties is { } nested) + { + foreach (var np in nested) + { + if (PropertyUsesTextAnalyzer(np)) + return true; + } + } + return false; + } + /// /// Create a new typed collection from a json string. /// diff --git a/src/Weaviate.Client/Extensions.cs b/src/Weaviate.Client/Extensions.cs index a38b8296..7ae8ad8e 100644 --- a/src/Weaviate.Client/Extensions.cs +++ b/src/Weaviate.Client/Extensions.cs @@ -242,6 +242,7 @@ internal static Rest.Dto.Class ToDto(this CollectionConfig collection) ? collection.InvertedIndexConfig.IndexTimestamps : null, UsingBlockMaxWAND = collection.InvertedIndexConfig.UsingBlockMaxWAND, + StopwordPresets = collection.InvertedIndexConfig.StopwordPresets, }; } @@ -352,6 +353,7 @@ internal static CollectionConfigExport ToModel(this Rest.Dto.Class collection) ?? Weaviate.Client.Models.InvertedIndexConfig.Default.IndexTimestamps, UsingBlockMaxWAND = iic.UsingBlockMaxWAND, + StopwordPresets = iic.StopwordPresets, } : null; diff --git a/src/Weaviate.Client/Models/Collection.Update.cs b/src/Weaviate.Client/Models/Collection.Update.cs index 82e032b1..148114fa 100644 --- a/src/Weaviate.Client/Models/Collection.Update.cs +++ b/src/Weaviate.Client/Models/Collection.Update.cs @@ -123,6 +123,17 @@ public int CleanupIntervalSeconds /// public StopwordsConfigUpdate Stopwords => new(WrappedConfig.Stopwords ??= StopwordConfig.Default); + + /// + /// Gets or sets the named stopword presets defined at the collection level. 
+ /// Setting this replaces the full preset map on the server. + /// Requires Weaviate ≥ 1.37.0. + /// + public IDictionary>? StopwordPresets + { + get => WrappedConfig.StopwordPresets; + set => WrappedConfig.StopwordPresets = value; + } } /// diff --git a/src/Weaviate.Client/Models/Extensions.cs b/src/Weaviate.Client/Models/Extensions.cs index 5a7db735..e79cde27 100644 --- a/src/Weaviate.Client/Models/Extensions.cs +++ b/src/Weaviate.Client/Models/Extensions.cs @@ -57,6 +57,7 @@ internal static Rest.Dto.NestedProperty ToNestedPropertyDto(this Property proper NestedProperties = property .NestedProperties?.Select(np => np.ToNestedPropertyDto()) .ToList(), + TextAnalyzer = property.TextAnalyzer.ToDto(), }; } @@ -122,6 +123,7 @@ internal static Rest.Dto.Property ToDto( .NestedProperties?.Select(np => np.ToNestedPropertyDto()) .ToList(), ModuleConfig = moduleConfig, + TextAnalyzer = property.TextAnalyzer.ToDto(), }; } } diff --git a/src/Weaviate.Client/Models/InvertedIndexConfig.cs b/src/Weaviate.Client/Models/InvertedIndexConfig.cs index 64e9c4fe..66e5292b 100644 --- a/src/Weaviate.Client/Models/InvertedIndexConfig.cs +++ b/src/Weaviate.Client/Models/InvertedIndexConfig.cs @@ -51,6 +51,14 @@ public record InvertedIndexConfig : IEquatable /// public bool? UsingBlockMaxWAND { get; set; } = null; + /// + /// Optional named stopword presets defined at the collection level. + /// Each entry is a preset name → list of stopwords. Individual properties + /// can reference a preset via <see cref="TextAnalyzerConfig.StopwordPreset"/>. + /// Requires Weaviate ≥ 1.37.0. + /// + public IDictionary>? StopwordPresets { get; set; } = null; + /// /// Gets the hash code /// @@ -65,6 +73,15 @@ public override int GetHashCode() hash.Add(IndexTimestamps); hash.Add(Stopwords?.GetHashCode() ??
0); hash.Add(UsingBlockMaxWAND); + if (StopwordPresets is not null) + { + foreach (var kvp in StopwordPresets.OrderBy(kvp => kvp.Key, StringComparer.Ordinal)) + { + hash.Add(kvp.Key); + foreach (var word in kvp.Value) + hash.Add(word); + } + } return hash.ToHashCode(); } @@ -106,6 +123,30 @@ UsingBlockMaxWAND is not null ) return false; + if (!StopwordPresetsEqual(StopwordPresets, other.StopwordPresets)) + return false; + + return true; + } + + private static bool StopwordPresetsEqual( + IDictionary>? a, + IDictionary>? b + ) + { + if (ReferenceEquals(a, b)) + return true; + if (a is null || b is null) + return false; + if (a.Count != b.Count) + return false; + foreach (var kvp in a) + { + if (!b.TryGetValue(kvp.Key, out var otherValue)) + return false; + if (!kvp.Value.SequenceEqual(otherValue)) + return false; + } return true; } } diff --git a/src/Weaviate.Client/Models/Property.cs b/src/Weaviate.Client/Models/Property.cs index 8dc2440d..7180559b 100644 --- a/src/Weaviate.Client/Models/Property.cs +++ b/src/Weaviate.Client/Models/Property.cs @@ -606,6 +606,13 @@ public required string Name /// public bool VectorizePropertyName { get; init; } = true; + /// + /// Optional property-level text analyzer configuration. When set, the property's + /// indexed and query tokens are post-processed according to the configured + /// ASCII-folding and stopword preset. Requires Weaviate ≥ 1.37.0. + /// + public TextAnalyzerConfig? TextAnalyzer { get; init; } + /// Gets a factory for creating text properties. public static PropertyFactory Text => PropertyHelper.Factory(DataType.Text); @@ -795,6 +802,7 @@ public override int GetHashCode() hash.Add(NestedProperties); hash.Add(SkipVectorization); hash.Add(VectorizePropertyName); + hash.Add(TextAnalyzer); return hash.ToHashCode(); } @@ -820,6 +828,10 @@ public virtual bool Equals(Property? 
other) && PropertyTokenization == other.PropertyTokenization && SkipVectorization == other.SkipVectorization && VectorizePropertyName == other.VectorizePropertyName + && EqualityComparer.Default.Equals( + TextAnalyzer, + other.TextAnalyzer + ) && ( (NestedProperties == null && other.NestedProperties == null) || ( diff --git a/src/Weaviate.Client/Models/Tokenize.cs b/src/Weaviate.Client/Models/Tokenize.cs new file mode 100644 index 00000000..f0695642 --- /dev/null +++ b/src/Weaviate.Client/Models/Tokenize.cs @@ -0,0 +1,138 @@ +using System.Collections.Immutable; + +namespace Weaviate.Client.Models; + +/// +/// ASCII-folding configuration: enables accent/diacritic folding, with an +/// optional list of characters to exclude. When set on +/// <see cref="TextAnalyzerConfig.AsciiFold"/>, folding is applied; when +/// null, folding is disabled. +/// +/// +/// Optional list of characters that should be excluded from ASCII folding. +/// +public sealed record AsciiFoldConfig(IReadOnlyList? Ignore = null); + +/// +/// Optional text-analyzer configuration, used by the tokenize endpoint and by <see cref="Property.TextAnalyzer"/>. +/// Mirrors the server's TextAnalyzerConfig. +/// +public sealed record TextAnalyzerConfig +{ + /// + /// ASCII-folding configuration. When non-null, accent/diacritic marks are + /// folded to their base characters (e.g. 'école' → 'ecole'), except for + /// characters listed in <see cref="AsciiFoldConfig.Ignore"/>. + /// When null, folding is disabled. + /// + public AsciiFoldConfig? AsciiFold { get; init; } + + /// + /// Stopword preset name. May be a built-in preset ("en", "none") + /// or the name of a custom preset provided via + /// <see cref="TokenizeClient.Text"/>'s + /// stopwordPresets dictionary. + /// + public string? StopwordPreset { get; init; } +} + +/// +/// Result of a tokenize request. +/// +public sealed record TokenizeResult +{ + /// + /// The tokenization method that was applied. + /// + public PropertyTokenization Tokenization { get; init; } + + /// + /// Tokens as they are stored in the inverted index.
+ /// + public ImmutableList Indexed { get; init; } = []; + + /// + /// Tokens as they are used for query matching (after stopword removal, etc.). + /// + public ImmutableList Query { get; init; } = []; + + /// + /// The text-analyzer configuration that was applied, if any. + /// + public TextAnalyzerConfig? AnalyzerConfig { get; init; } + + /// + /// The stopword configuration that was applied, if any. + /// + public StopwordConfig? StopwordConfig { get; init; } +} + +/// +/// Mapping helpers between public tokenize models and generated DTOs. +/// +internal static class TokenizeMapping +{ + internal static Rest.Dto.TokenizeRequestTokenization ToDto(this PropertyTokenization value) => + (Rest.Dto.TokenizeRequestTokenization)(int)value; + + internal static PropertyTokenization ToTokenization(string? wireValue) => + string.IsNullOrEmpty(wireValue) + ? PropertyTokenization.Word + : wireValue.FromEnumMemberString(); + + internal static Rest.Dto.TextAnalyzerConfig? ToDto(this TextAnalyzerConfig? config) => + config is null + ? null + : new Rest.Dto.TextAnalyzerConfig + { + AsciiFold = config.AsciiFold is not null ? true : null, + AsciiFoldIgnore = config.AsciiFold?.Ignore is { Count: > 0 } ignore + ? [.. ignore] + : null, + StopwordPreset = config.StopwordPreset, + }; + + internal static TextAnalyzerConfig? ToModel(this Rest.Dto.TextAnalyzerConfig? dto) => + dto is null + ? null + : new TextAnalyzerConfig + { + AsciiFold = + dto.AsciiFold == true + ? new AsciiFoldConfig( + dto.AsciiFoldIgnore is { Count: > 0 } ignore ? [.. ignore] : null + ) + : null, + StopwordPreset = dto.StopwordPreset, + }; + + internal static Rest.Dto.StopwordConfig ToDto(this StopwordConfig config) => + new() + { + Preset = config.Preset.ToEnumMemberString(), + Additions = config.Additions.Count > 0 ? [.. config.Additions] : null, + Removals = config.Removals.Count > 0 ? [.. config.Removals] : null, + }; + + internal static StopwordConfig? ToModel(this Rest.Dto.StopwordConfig? 
dto) => + dto is null + ? null + : new StopwordConfig + { + Preset = string.IsNullOrEmpty(dto.Preset) + ? StopwordConfig.Presets.None + : dto.Preset.FromEnumMemberString(), + Additions = dto.Additions?.ToImmutableList() ?? [], + Removals = dto.Removals?.ToImmutableList() ?? [], + }; + + internal static TokenizeResult ToModel(this Rest.Dto.TokenizeResponse dto) => + new() + { + Tokenization = ToTokenization(dto.Tokenization), + Indexed = dto.Indexed?.ToImmutableList() ?? [], + Query = dto.Query?.ToImmutableList() ?? [], + AnalyzerConfig = dto.AnalyzerConfig.ToModel(), + StopwordConfig = dto.StopwordConfig.ToModel(), + }; +} diff --git a/src/Weaviate.Client/PublicAPI.Unshipped.txt b/src/Weaviate.Client/PublicAPI.Unshipped.txt index 0ad485ce..557c9f65 100644 --- a/src/Weaviate.Client/PublicAPI.Unshipped.txt +++ b/src/Weaviate.Client/PublicAPI.Unshipped.txt @@ -6811,3 +6811,59 @@ Weaviate.Client.DependencyInjection.WeaviateOptions.AddIntegration(string! integ Weaviate.Client.ClientConfigurationExtensions static Weaviate.Client.ClientConfigurationExtensions.WithIntegration(this Weaviate.Client.ClientConfiguration! config, string! integrationValue) -> Weaviate.Client.ClientConfiguration! static Weaviate.Client.WeaviateClientBuilderExtensions.WithIntegration(this Weaviate.Client.WeaviateClientBuilder! builder, string! integrationValue) -> Weaviate.Client.WeaviateClientBuilder! +Weaviate.Client.TokenizeClient +Weaviate.Client.TokenizeClient.Text(string! text, Weaviate.Client.Models.PropertyTokenization tokenization, Weaviate.Client.Models.TextAnalyzerConfig? analyzerConfig = null, System.Collections.Generic.IDictionary? stopwordPresets = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.Task! +Weaviate.Client.WeaviateClient.Tokenize.get -> Weaviate.Client.TokenizeClient! +Weaviate.Client.CollectionTokenizeClient +Weaviate.Client.CollectionTokenizeClient.Property(string! propertyName, string! 
text, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.Task! +Weaviate.Client.CollectionClient.Tokenize.get -> Weaviate.Client.CollectionTokenizeClient! +Weaviate.Client.Models.AsciiFoldConfig +Weaviate.Client.Models.AsciiFoldConfig.AsciiFoldConfig(System.Collections.Generic.IReadOnlyList? Ignore = null) -> void +Weaviate.Client.Models.AsciiFoldConfig.Ignore.get -> System.Collections.Generic.IReadOnlyList? +Weaviate.Client.Models.AsciiFoldConfig.Ignore.init -> void +Weaviate.Client.Models.AsciiFoldConfig.Equals(Weaviate.Client.Models.AsciiFoldConfig? other) -> bool +Weaviate.Client.Models.AsciiFoldConfig.$() -> Weaviate.Client.Models.AsciiFoldConfig! +Weaviate.Client.Models.AsciiFoldConfig.Deconstruct(out System.Collections.Generic.IReadOnlyList? Ignore) -> void +override Weaviate.Client.Models.AsciiFoldConfig.Equals(object? obj) -> bool +override Weaviate.Client.Models.AsciiFoldConfig.GetHashCode() -> int +override Weaviate.Client.Models.AsciiFoldConfig.ToString() -> string! +static Weaviate.Client.Models.AsciiFoldConfig.operator !=(Weaviate.Client.Models.AsciiFoldConfig? left, Weaviate.Client.Models.AsciiFoldConfig? right) -> bool +static Weaviate.Client.Models.AsciiFoldConfig.operator ==(Weaviate.Client.Models.AsciiFoldConfig? left, Weaviate.Client.Models.AsciiFoldConfig? right) -> bool +Weaviate.Client.Models.TextAnalyzerConfig +Weaviate.Client.Models.TextAnalyzerConfig.TextAnalyzerConfig() -> void +Weaviate.Client.Models.TextAnalyzerConfig.AsciiFold.get -> Weaviate.Client.Models.AsciiFoldConfig? +Weaviate.Client.Models.TextAnalyzerConfig.AsciiFold.init -> void +Weaviate.Client.Models.TextAnalyzerConfig.StopwordPreset.get -> string? +Weaviate.Client.Models.TextAnalyzerConfig.StopwordPreset.init -> void +Weaviate.Client.Models.TextAnalyzerConfig.Equals(Weaviate.Client.Models.TextAnalyzerConfig? 
other) -> bool +Weaviate.Client.Models.TextAnalyzerConfig.$() -> Weaviate.Client.Models.TextAnalyzerConfig! +override Weaviate.Client.Models.TextAnalyzerConfig.Equals(object? obj) -> bool +override Weaviate.Client.Models.TextAnalyzerConfig.GetHashCode() -> int +override Weaviate.Client.Models.TextAnalyzerConfig.ToString() -> string! +static Weaviate.Client.Models.TextAnalyzerConfig.operator !=(Weaviate.Client.Models.TextAnalyzerConfig? left, Weaviate.Client.Models.TextAnalyzerConfig? right) -> bool +static Weaviate.Client.Models.TextAnalyzerConfig.operator ==(Weaviate.Client.Models.TextAnalyzerConfig? left, Weaviate.Client.Models.TextAnalyzerConfig? right) -> bool +Weaviate.Client.Models.TokenizeResult +Weaviate.Client.Models.TokenizeResult.TokenizeResult() -> void +Weaviate.Client.Models.TokenizeResult.Tokenization.get -> Weaviate.Client.Models.PropertyTokenization +Weaviate.Client.Models.TokenizeResult.Tokenization.init -> void +Weaviate.Client.Models.TokenizeResult.Indexed.get -> System.Collections.Immutable.ImmutableList! +Weaviate.Client.Models.TokenizeResult.Indexed.init -> void +Weaviate.Client.Models.TokenizeResult.Query.get -> System.Collections.Immutable.ImmutableList! +Weaviate.Client.Models.TokenizeResult.Query.init -> void +Weaviate.Client.Models.TokenizeResult.AnalyzerConfig.get -> Weaviate.Client.Models.TextAnalyzerConfig? +Weaviate.Client.Models.TokenizeResult.AnalyzerConfig.init -> void +Weaviate.Client.Models.TokenizeResult.StopwordConfig.get -> Weaviate.Client.Models.StopwordConfig? +Weaviate.Client.Models.TokenizeResult.StopwordConfig.init -> void +Weaviate.Client.Models.TokenizeResult.Equals(Weaviate.Client.Models.TokenizeResult? other) -> bool +Weaviate.Client.Models.TokenizeResult.$() -> Weaviate.Client.Models.TokenizeResult! +override Weaviate.Client.Models.TokenizeResult.Equals(object? obj) -> bool +override Weaviate.Client.Models.TokenizeResult.GetHashCode() -> int +override Weaviate.Client.Models.TokenizeResult.ToString() -> string! 
+static Weaviate.Client.Models.TokenizeResult.operator !=(Weaviate.Client.Models.TokenizeResult? left, Weaviate.Client.Models.TokenizeResult? right) -> bool +static Weaviate.Client.Models.TokenizeResult.operator ==(Weaviate.Client.Models.TokenizeResult? left, Weaviate.Client.Models.TokenizeResult? right) -> bool +Weaviate.Client.Models.Property.TextAnalyzer.get -> Weaviate.Client.Models.TextAnalyzerConfig? +Weaviate.Client.Models.Property.TextAnalyzer.init -> void +Weaviate.Client.Models.InvertedIndexConfig.StopwordPresets.get -> System.Collections.Generic.IDictionary!>? +Weaviate.Client.Models.InvertedIndexConfig.StopwordPresets.set -> void +Weaviate.Client.Models.InvertedIndexConfigUpdate.StopwordPresets.get -> System.Collections.Generic.IDictionary!>? +Weaviate.Client.Models.InvertedIndexConfigUpdate.StopwordPresets.set -> void diff --git a/src/Weaviate.Client/Rest/Dto/Extensions.cs b/src/Weaviate.Client/Rest/Dto/Extensions.cs index 0b915bd1..7caf7898 100644 --- a/src/Weaviate.Client/Rest/Dto/Extensions.cs +++ b/src/Weaviate.Client/Rest/Dto/Extensions.cs @@ -88,6 +88,7 @@ public Models.Property ToModel() IndexRangeFilters = IndexRangeFilters, PropertyTokenization = (Models.PropertyTokenization?)Tokenization, NestedProperties = NestedProperties?.Select(np => np.ToModel()).ToArray(), + TextAnalyzer = Weaviate.Client.Models.TokenizeMapping.ToModel(TextAnalyzer), }; } } @@ -150,6 +151,7 @@ public Models.Property ToModel() NestedProperties = NestedProperties?.Select(np => np.ToModel()).ToArray(), SkipVectorization = skipVectorization, VectorizePropertyName = vectorizePropertyName, + TextAnalyzer = Weaviate.Client.Models.TokenizeMapping.ToModel(TextAnalyzer), }; } diff --git a/src/Weaviate.Client/Rest/Endpoints.cs b/src/Weaviate.Client/Rest/Endpoints.cs index d52c1f12..2e1a32dc 100644 --- a/src/Weaviate.Client/Rest/Endpoints.cs +++ b/src/Weaviate.Client/Rest/Endpoints.cs @@ -208,6 +208,21 @@ internal static string Nodes(string? 
collection, string verbosity) return path; } + /// + /// Path for the generic tokenize endpoint. + /// + /// The string + internal static string Tokenize() => "tokenize"; + + /// + /// Path for the per-property tokenize endpoint. + /// + /// The collection (class) name. + /// The property name. + /// The string + internal static string TokenizeProperty(string className, string propertyName) => + $"schema/{className}/properties/{propertyName}/tokenize"; + // Well-known endpoints /// /// Wells the known live diff --git a/src/Weaviate.Client/Rest/Tokenize.cs b/src/Weaviate.Client/Rest/Tokenize.cs new file mode 100644 index 00000000..ac3e4da8 --- /dev/null +++ b/src/Weaviate.Client/Rest/Tokenize.cs @@ -0,0 +1,54 @@ +using System.Net; +using System.Net.Http.Json; +using Weaviate.Client.Rest.Dto; + +namespace Weaviate.Client.Rest; + +internal partial class WeaviateRestClient +{ + /// + /// Calls POST /v1/tokenize. + /// + internal async Task Tokenize( + TokenizeRequest request, + CancellationToken cancellationToken = default + ) + { + var response = await _httpClient.PostAsJsonAsync( + WeaviateEndpoints.Tokenize(), + request, + options: RestJsonSerializerOptions, + cancellationToken: cancellationToken + ); + + await response.ManageStatusCode([HttpStatusCode.OK], "tokenize"); + + return await response.DecodeAsync(cancellationToken); + } + + /// + /// Calls POST /v1/schema/{className}/properties/{propertyName}/tokenize. 
+ /// + internal async Task TokenizeProperty( + string className, + string propertyName, + PropertyTokenizeRequest request, + CancellationToken cancellationToken = default + ) + { + var response = await _httpClient.PostAsJsonAsync( + WeaviateEndpoints.TokenizeProperty(className, propertyName), + request, + options: RestJsonSerializerOptions, + cancellationToken: cancellationToken + ); + + await response.ManageStatusCode( + [HttpStatusCode.OK], + "tokenize property", + ResourceType.Property + ); + + return await response.DecodeAsync(cancellationToken); + } +} diff --git a/src/Weaviate.Client/TokenizeClient.cs b/src/Weaviate.Client/TokenizeClient.cs new file mode 100644 index 00000000..ffb4b933 --- /dev/null +++ b/src/Weaviate.Client/TokenizeClient.cs @@ -0,0 +1,67 @@ +using Weaviate.Client.Models; + +namespace Weaviate.Client; + +/// +/// Exposes the /v1/tokenize endpoint for inspecting how text is tokenized +/// with a given tokenization method and analyzer configuration. +/// Requires Weaviate server version 1.37.0 or later. +/// +public sealed class TokenizeClient +{ + private readonly WeaviateClient _client; + + internal TokenizeClient(WeaviateClient client) + { + _client = client; + } + + /// + /// Tokenizes <paramref name="text"/> using the given <paramref name="tokenization"/> strategy. + /// Returns the indexed and query forms produced by the server, plus the analyzer/stopword + /// configurations that were applied. + /// + /// The text to tokenize. + /// The tokenization method to apply. + /// Optional text analyzer configuration (e.g. ASCII folding, stopword preset). + /// + /// Optional named stopword configurations. Each key is a preset name that can be referenced by + /// <see cref="TextAnalyzerConfig.StopwordPreset"/>. Each value is a <see cref="StopwordConfig"/>. + /// + /// Cancellation token. + /// + /// Thrown when the connected server version is below 1.37.0. + /// + [RequiresWeaviateVersion(1, 37, 0)] + public async Task Text( + string text, + PropertyTokenization tokenization, + TextAnalyzerConfig? analyzerConfig = null, + IDictionary?
stopwordPresets = null, + CancellationToken cancellationToken = default + ) + { + ArgumentNullException.ThrowIfNull(text); + + await _client.EnsureVersion(); + + var request = new Rest.Dto.TokenizeRequest + { + Text = text, + Tokenization = tokenization.ToDto(), + AnalyzerConfig = analyzerConfig.ToDto(), + StopwordPresets = stopwordPresets?.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.ToDto() + ), + }; + + var response = + await _client.RestClient.Tokenize(request, cancellationToken) + ?? throw new WeaviateClientException( + new InvalidOperationException("Tokenize endpoint returned an empty response.") + ); + + return response.ToModel(); + } +} diff --git a/src/Weaviate.Client/WeaviateClient.cs b/src/Weaviate.Client/WeaviateClient.cs index 29b7fbaa..88b59cb7 100644 --- a/src/Weaviate.Client/WeaviateClient.cs +++ b/src/Weaviate.Client/WeaviateClient.cs @@ -193,6 +193,12 @@ private CancellationToken CreateInitCancellationToken(CancellationToken userToke /// public GroupsClient Groups { get; } + /// + /// Gets the tokenize client for inspecting how text is tokenized by the server. + /// Requires Weaviate server version 1.37.0 or later. + /// + public TokenizeClient Tokenize { get; } + /// /// Ises the weaviate domain using the specified url /// @@ -227,6 +233,7 @@ internal WeaviateClient(ClientConfiguration configuration, ILogger @@ -267,6 +274,7 @@ internal WeaviateClient( Users = new UsersClient(this); Roles = new RolesClient(this); Groups = new GroupsClient(this); + Tokenize = new TokenizeClient(this); } /// @@ -322,6 +330,7 @@ public WeaviateClient( Users = new UsersClient(this); Roles = new RolesClient(this); Groups = new GroupsClient(this); + Tokenize = new TokenizeClient(this); } ///