From af0606670a86791c4e9b49ba3aadf746ca00680f Mon Sep 17 00:00:00 2001 From: Michelangelo Partipilo Date: Tue, 21 Apr 2026 19:32:05 +0200 Subject: [PATCH 1/4] feat: add tokenize endpoint support (Weaviate 1.37.0+) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port of python-client PR #2012, aligned with the TS client's `tokenize` namespace design. Adds: - `client.Tokenize.Text(text, tokenization, analyzerConfig?, stopwordPresets?)` → POST /v1/tokenize - `collection.Tokenize.Property(propertyName, text)` → POST /v1/schema/{class}/properties/{prop}/tokenize Version-gated at 1.37.0 via `[RequiresWeaviateVersion]`. `AsciiFold` is modeled as a nullable record (null = disabled, non-null = enabled with optional `Ignore` list) so the invalid "ignore without fold" state is unrepresentable without a validator. Co-Authored-By: Claude Opus 4.7 --- .../Integration/TestTokenize.cs | 285 ++++++++++++++++++ src/Weaviate.Client/CollectionClient.cs | 6 + .../CollectionTokenizeClient.cs | 58 ++++ src/Weaviate.Client/Models/Tokenize.cs | 138 +++++++++ src/Weaviate.Client/PublicAPI.Unshipped.txt | 50 +++ src/Weaviate.Client/Rest/Endpoints.cs | 15 + src/Weaviate.Client/Rest/Tokenize.cs | 54 ++++ src/Weaviate.Client/TokenizeClient.cs | 67 ++++ src/Weaviate.Client/WeaviateClient.cs | 9 + 9 files changed, 682 insertions(+) create mode 100644 src/Weaviate.Client.Tests/Integration/TestTokenize.cs create mode 100644 src/Weaviate.Client/CollectionTokenizeClient.cs create mode 100644 src/Weaviate.Client/Models/Tokenize.cs create mode 100644 src/Weaviate.Client/Rest/Tokenize.cs create mode 100644 src/Weaviate.Client/TokenizeClient.cs diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs new file mode 100644 index 00000000..e2ed2de4 --- /dev/null +++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs @@ -0,0 +1,285 @@ +using System.Collections.Immutable; +using 
Weaviate.Client.Models; + +namespace Weaviate.Client.Tests.Integration; + +/// +/// Integration tests for the /v1/tokenize and +/// /v1/schema/{className}/properties/{propertyName}/tokenize endpoints. +/// Requires Weaviate server version 1.37.0 or later. +/// +[Collection("TestTokenize")] +public class TestTokenize : IntegrationTests +{ + // ----------------------------------------------------------------------- + // Serialization + // ----------------------------------------------------------------------- + + public static TheoryData TokenizationCases => + new() + { + { + PropertyTokenization.Word, + "The quick brown fox", + new[] { "the", "quick", "brown", "fox" } + }, + { + PropertyTokenization.Lowercase, + "Hello World Test", + new[] { "hello", "world", "test" } + }, + { + PropertyTokenization.Whitespace, + "Hello World Test", + new[] { "Hello", "World", "Test" } + }, + { PropertyTokenization.Field, " Hello World ", new[] { "Hello World" } }, + { PropertyTokenization.Trigram, "Hello", new[] { "hel", "ell", "llo" } }, + }; + + [Theory] + [MemberData(nameof(TokenizationCases))] + public async Task Tokenization_Enum( + PropertyTokenization tokenization, + string text, + string[] expectedTokens + ) + { + RequireVersion(nameof(TokenizeClient.Text)); + + var result = await _weaviate.Tokenize.Text( + text, + tokenization, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Equal(tokenization, result.Tokenization); + Assert.Equal(expectedTokens, result.Indexed); + Assert.Equal(expectedTokens, result.Query); + } + + [Fact] + public async Task NoAnalyzerConfig() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var result = await _weaviate.Tokenize.Text( + "hello world", + PropertyTokenization.Word, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Equal(PropertyTokenization.Word, result.Tokenization); + Assert.Equal(new[] { "hello", "world" }, result.Indexed); + Assert.Null(result.AnalyzerConfig); + } + + [Fact] + 
public async Task AsciiFold() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var cfg = new TokenizeAnalyzerConfig { AsciiFold = new AsciiFoldConfig() }; + var result = await _weaviate.Tokenize.Text( + "L'école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Equal(new[] { "l", "ecole", "est", "fermee" }, result.Indexed); + } + + [Fact] + public async Task AsciiFold_WithIgnore() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var cfg = new TokenizeAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]) }; + var result = await _weaviate.Tokenize.Text( + "L'école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Equal(new[] { "l", "école", "est", "fermée" }, result.Indexed); + } + + [Fact] + public async Task StopwordPreset_String() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en" }; + var result = await _weaviate.Tokenize.Text( + "The quick brown fox", + PropertyTokenization.Word, + analyzerConfig: cfg, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.DoesNotContain("the", result.Query); + Assert.Contains("quick", result.Query); + } + + [Fact] + public async Task Combined_AsciiFold_Stopwords() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var cfg = new TokenizeAnalyzerConfig + { + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + StopwordPreset = "en", + }; + var result = await _weaviate.Tokenize.Text( + "The école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Equal(new[] { "the", "école", "est", "fermée" }, result.Indexed); + Assert.DoesNotContain("the", result.Query); + Assert.Contains("école", result.Query); + } + + [Fact] + public async Task CustomPreset_Additions() + { 
+ RequireVersion(nameof(TokenizeClient.Text)); + + var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "custom" }; + var presets = new Dictionary + { + ["custom"] = new StopwordConfig + { + Preset = StopwordConfig.Presets.None, + Additions = ["test"], + }, + }; + + var result = await _weaviate.Tokenize.Text( + "hello world test", + PropertyTokenization.Word, + analyzerConfig: cfg, + stopwordPresets: presets, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Equal(new[] { "hello", "world", "test" }, result.Indexed); + Assert.Equal(new[] { "hello", "world" }, result.Query); + } + + [Fact] + public async Task CustomPreset_BaseAndRemovals() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en-no-the" }; + var presets = new Dictionary + { + ["en-no-the"] = new StopwordConfig + { + Preset = StopwordConfig.Presets.EN, + Removals = ["the"], + }, + }; + + var result = await _weaviate.Tokenize.Text( + "the quick", + PropertyTokenization.Word, + analyzerConfig: cfg, + stopwordPresets: presets, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Equal(new[] { "the", "quick" }, result.Indexed); + Assert.Equal(new[] { "the", "quick" }, result.Query); + } + + // ----------------------------------------------------------------------- + // Deserialization + // ----------------------------------------------------------------------- + + [Fact] + public async Task Result_Types() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var result = await _weaviate.Tokenize.Text( + "hello", + PropertyTokenization.Word, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.IsType(result); + Assert.IsType>(result.Indexed); + Assert.IsType>(result.Query); + } + + [Fact] + public async Task AnalyzerConfig_Echoed() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var cfg = new TokenizeAnalyzerConfig + { + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + 
StopwordPreset = "en", + }; + var result = await _weaviate.Tokenize.Text( + "L'école", + PropertyTokenization.Word, + analyzerConfig: cfg, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.NotNull(result.AnalyzerConfig); + Assert.NotNull(result.AnalyzerConfig!.AsciiFold); + Assert.Equal(new[] { "é" }, result.AnalyzerConfig.AsciiFold!.Ignore); + Assert.Equal("en", result.AnalyzerConfig.StopwordPreset); + } + + [Fact] + public async Task AnalyzerConfig_None() + { + RequireVersion(nameof(TokenizeClient.Text)); + + var result = await _weaviate.Tokenize.Text( + "hello", + PropertyTokenization.Word, + cancellationToken: TestContext.Current.CancellationToken + ); + + Assert.Null(result.AnalyzerConfig); + } + + [Fact] + public async Task PropertyTokenize_Field() + { + RequireVersion(nameof(CollectionTokenizeClient.Property)); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "tag", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Field, + }, + ] + ); + + var result = await collection.Tokenize.Property( + "tag", + " Hello World ", + TestContext.Current.CancellationToken + ); + + Assert.Equal(PropertyTokenization.Field, result.Tokenization); + Assert.Equal(new[] { "Hello World" }, result.Indexed); + } +} diff --git a/src/Weaviate.Client/CollectionClient.cs b/src/Weaviate.Client/CollectionClient.cs index 18294bdc..afcfc02e 100644 --- a/src/Weaviate.Client/CollectionClient.cs +++ b/src/Weaviate.Client/CollectionClient.cs @@ -232,4 +232,10 @@ public CollectionClient WithConsistencyLevel(ConsistencyLevels consistencyLevel) /// Gets the configuration client for managing collection configuration. /// public CollectionConfigClient Config => new(Client, Name); + + /// + /// Gets the tokenize client for inspecting how text is tokenized by + /// properties of this collection. Requires Weaviate server version 1.37.0 or later. 
+ /// + public CollectionTokenizeClient Tokenize => new(Client, Name); } diff --git a/src/Weaviate.Client/CollectionTokenizeClient.cs b/src/Weaviate.Client/CollectionTokenizeClient.cs new file mode 100644 index 00000000..39525fac --- /dev/null +++ b/src/Weaviate.Client/CollectionTokenizeClient.cs @@ -0,0 +1,58 @@ +using Weaviate.Client.Models; + +namespace Weaviate.Client; + +/// +/// Exposes the per-property /v1/schema/{className}/properties/{propertyName}/tokenize +/// endpoint for a specific collection. Requires Weaviate server version 1.37.0 or later. +/// +public sealed class CollectionTokenizeClient +{ + private readonly WeaviateClient _client; + private readonly string _collectionName; + + internal CollectionTokenizeClient(WeaviateClient client, string collectionName) + { + _client = client; + _collectionName = collectionName; + } + + /// + /// Tokenizes using the tokenization method configured on + /// property of this collection. + /// + /// The name of the property whose tokenization to apply. + /// The text to tokenize. + /// Cancellation token. + /// The tokenization result. + /// + /// Thrown when the connected server version is below 1.37.0. + /// + [RequiresWeaviateVersion(1, 37, 0)] + public async Task Property( + string propertyName, + string text, + CancellationToken cancellationToken = default + ) + { + ArgumentException.ThrowIfNullOrEmpty(propertyName); + ArgumentNullException.ThrowIfNull(text); + + await _client.EnsureVersion(); + + var response = + await _client.RestClient.TokenizeProperty( + _collectionName, + propertyName, + new Rest.Dto.PropertyTokenizeRequest { Text = text }, + cancellationToken + ) + ?? throw new WeaviateClientException( + new InvalidOperationException( + "Tokenize property endpoint returned an empty response." 
+ ) + ); + + return response.ToModel(); + } +} diff --git a/src/Weaviate.Client/Models/Tokenize.cs b/src/Weaviate.Client/Models/Tokenize.cs new file mode 100644 index 00000000..3b10d137 --- /dev/null +++ b/src/Weaviate.Client/Models/Tokenize.cs @@ -0,0 +1,138 @@ +using System.Collections.Immutable; + +namespace Weaviate.Client.Models; + +/// +/// ASCII-folding configuration: enables accent/diacritic folding, with an +/// optional list of characters to exclude. When set on +/// , folding is applied; when +/// null, folding is disabled. +/// +/// +/// Optional list of characters that should be excluded from ASCII folding. +/// +public sealed record AsciiFoldConfig(IReadOnlyList? Ignore = null); + +/// +/// Optional text-analyzer configuration for the tokenize endpoint. +/// Mirrors the server's TextAnalyzerConfig. +/// +public sealed record TokenizeAnalyzerConfig +{ + /// + /// ASCII-folding configuration. When non-null, accent/diacritic marks are + /// folded to their base characters (e.g. 'école' → 'ecole'), except for + /// characters listed in . + /// When null, folding is disabled. + /// + public AsciiFoldConfig? AsciiFold { get; init; } + + /// + /// Stopword preset name. May be a built-in preset ("en", "none") + /// or the name of a custom preset provided via + /// 's + /// stopwordPresets dictionary. + /// + public string? StopwordPreset { get; init; } +} + +/// +/// Result of a tokenize request. +/// +public sealed record TokenizeResult +{ + /// + /// The tokenization method that was applied. + /// + public PropertyTokenization Tokenization { get; init; } + + /// + /// Tokens as they are stored in the inverted index. + /// + public ImmutableList Indexed { get; init; } = []; + + /// + /// Tokens as they are used for query matching (after stopword removal, etc.). + /// + public ImmutableList Query { get; init; } = []; + + /// + /// The text-analyzer configuration that was applied, if any. + /// + public TokenizeAnalyzerConfig? 
AnalyzerConfig { get; init; } + + /// + /// The stopword configuration that was applied, if any. + /// + public StopwordConfig? StopwordConfig { get; init; } +} + +/// +/// Mapping helpers between public tokenize models and generated DTOs. +/// +internal static class TokenizeMapping +{ + internal static Rest.Dto.TokenizeRequestTokenization ToDto(this PropertyTokenization value) => + (Rest.Dto.TokenizeRequestTokenization)(int)value; + + internal static PropertyTokenization ToTokenization(string? wireValue) => + string.IsNullOrEmpty(wireValue) + ? PropertyTokenization.Word + : wireValue.FromEnumMemberString(); + + internal static Rest.Dto.TextAnalyzerConfig? ToDto(this TokenizeAnalyzerConfig? config) => + config is null + ? null + : new Rest.Dto.TextAnalyzerConfig + { + AsciiFold = config.AsciiFold is not null ? true : null, + AsciiFoldIgnore = config.AsciiFold?.Ignore is { Count: > 0 } ignore + ? [.. ignore] + : null, + StopwordPreset = config.StopwordPreset, + }; + + internal static TokenizeAnalyzerConfig? ToModel(this Rest.Dto.TextAnalyzerConfig? dto) => + dto is null + ? null + : new TokenizeAnalyzerConfig + { + AsciiFold = + dto.AsciiFold == true + ? new AsciiFoldConfig( + dto.AsciiFoldIgnore is { Count: > 0 } ignore ? [.. ignore] : null + ) + : null, + StopwordPreset = dto.StopwordPreset, + }; + + internal static Rest.Dto.StopwordConfig ToDto(this StopwordConfig config) => + new() + { + Preset = config.Preset.ToEnumMemberString(), + Additions = config.Additions.Count > 0 ? [.. config.Additions] : null, + Removals = config.Removals.Count > 0 ? [.. config.Removals] : null, + }; + + internal static StopwordConfig? ToModel(this Rest.Dto.StopwordConfig? dto) => + dto is null + ? null + : new StopwordConfig + { + Preset = string.IsNullOrEmpty(dto.Preset) + ? StopwordConfig.Presets.None + : dto.Preset.FromEnumMemberString(), + Additions = dto.Additions?.ToImmutableList() ?? [], + Removals = dto.Removals?.ToImmutableList() ?? 
[], + }; + + internal static TokenizeResult ToModel(this Rest.Dto.TokenizeResponse dto) => + new() + { + Tokenization = ToTokenization(dto.Tokenization), + Indexed = dto.Indexed?.ToImmutableList() ?? [], + Query = dto.Query?.ToImmutableList() ?? [], + AnalyzerConfig = dto.AnalyzerConfig.ToModel(), + StopwordConfig = dto.StopwordConfig.ToModel(), + }; +} diff --git a/src/Weaviate.Client/PublicAPI.Unshipped.txt b/src/Weaviate.Client/PublicAPI.Unshipped.txt index 0ad485ce..453a24d9 100644 --- a/src/Weaviate.Client/PublicAPI.Unshipped.txt +++ b/src/Weaviate.Client/PublicAPI.Unshipped.txt @@ -6811,3 +6811,53 @@ Weaviate.Client.DependencyInjection.WeaviateOptions.AddIntegration(string! integ Weaviate.Client.ClientConfigurationExtensions static Weaviate.Client.ClientConfigurationExtensions.WithIntegration(this Weaviate.Client.ClientConfiguration! config, string! integrationValue) -> Weaviate.Client.ClientConfiguration! static Weaviate.Client.WeaviateClientBuilderExtensions.WithIntegration(this Weaviate.Client.WeaviateClientBuilder! builder, string! integrationValue) -> Weaviate.Client.WeaviateClientBuilder! +Weaviate.Client.TokenizeClient +Weaviate.Client.TokenizeClient.Text(string! text, Weaviate.Client.Models.PropertyTokenization tokenization, Weaviate.Client.Models.TokenizeAnalyzerConfig? analyzerConfig = null, System.Collections.Generic.IDictionary? stopwordPresets = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.Task! +Weaviate.Client.WeaviateClient.Tokenize.get -> Weaviate.Client.TokenizeClient! +Weaviate.Client.CollectionTokenizeClient +Weaviate.Client.CollectionTokenizeClient.Property(string! propertyName, string! text, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.Task! +Weaviate.Client.CollectionClient.Tokenize.get -> Weaviate.Client.CollectionTokenizeClient! 
+Weaviate.Client.Models.AsciiFoldConfig +Weaviate.Client.Models.AsciiFoldConfig.AsciiFoldConfig(System.Collections.Generic.IReadOnlyList? Ignore = null) -> void +Weaviate.Client.Models.AsciiFoldConfig.Ignore.get -> System.Collections.Generic.IReadOnlyList? +Weaviate.Client.Models.AsciiFoldConfig.Ignore.init -> void +Weaviate.Client.Models.AsciiFoldConfig.Equals(Weaviate.Client.Models.AsciiFoldConfig? other) -> bool +Weaviate.Client.Models.AsciiFoldConfig.$() -> Weaviate.Client.Models.AsciiFoldConfig! +Weaviate.Client.Models.AsciiFoldConfig.Deconstruct(out System.Collections.Generic.IReadOnlyList? Ignore) -> void +override Weaviate.Client.Models.AsciiFoldConfig.Equals(object? obj) -> bool +override Weaviate.Client.Models.AsciiFoldConfig.GetHashCode() -> int +override Weaviate.Client.Models.AsciiFoldConfig.ToString() -> string! +static Weaviate.Client.Models.AsciiFoldConfig.operator !=(Weaviate.Client.Models.AsciiFoldConfig? left, Weaviate.Client.Models.AsciiFoldConfig? right) -> bool +static Weaviate.Client.Models.AsciiFoldConfig.operator ==(Weaviate.Client.Models.AsciiFoldConfig? left, Weaviate.Client.Models.AsciiFoldConfig? right) -> bool +Weaviate.Client.Models.TokenizeAnalyzerConfig +Weaviate.Client.Models.TokenizeAnalyzerConfig.TokenizeAnalyzerConfig() -> void +Weaviate.Client.Models.TokenizeAnalyzerConfig.AsciiFold.get -> Weaviate.Client.Models.AsciiFoldConfig? +Weaviate.Client.Models.TokenizeAnalyzerConfig.AsciiFold.init -> void +Weaviate.Client.Models.TokenizeAnalyzerConfig.StopwordPreset.get -> string? +Weaviate.Client.Models.TokenizeAnalyzerConfig.StopwordPreset.init -> void +Weaviate.Client.Models.TokenizeAnalyzerConfig.Equals(Weaviate.Client.Models.TokenizeAnalyzerConfig? other) -> bool +Weaviate.Client.Models.TokenizeAnalyzerConfig.$() -> Weaviate.Client.Models.TokenizeAnalyzerConfig! +override Weaviate.Client.Models.TokenizeAnalyzerConfig.Equals(object? 
obj) -> bool +override Weaviate.Client.Models.TokenizeAnalyzerConfig.GetHashCode() -> int +override Weaviate.Client.Models.TokenizeAnalyzerConfig.ToString() -> string! +static Weaviate.Client.Models.TokenizeAnalyzerConfig.operator !=(Weaviate.Client.Models.TokenizeAnalyzerConfig? left, Weaviate.Client.Models.TokenizeAnalyzerConfig? right) -> bool +static Weaviate.Client.Models.TokenizeAnalyzerConfig.operator ==(Weaviate.Client.Models.TokenizeAnalyzerConfig? left, Weaviate.Client.Models.TokenizeAnalyzerConfig? right) -> bool +Weaviate.Client.Models.TokenizeResult +Weaviate.Client.Models.TokenizeResult.TokenizeResult() -> void +Weaviate.Client.Models.TokenizeResult.Tokenization.get -> Weaviate.Client.Models.PropertyTokenization +Weaviate.Client.Models.TokenizeResult.Tokenization.init -> void +Weaviate.Client.Models.TokenizeResult.Indexed.get -> System.Collections.Immutable.ImmutableList! +Weaviate.Client.Models.TokenizeResult.Indexed.init -> void +Weaviate.Client.Models.TokenizeResult.Query.get -> System.Collections.Immutable.ImmutableList! +Weaviate.Client.Models.TokenizeResult.Query.init -> void +Weaviate.Client.Models.TokenizeResult.AnalyzerConfig.get -> Weaviate.Client.Models.TokenizeAnalyzerConfig? +Weaviate.Client.Models.TokenizeResult.AnalyzerConfig.init -> void +Weaviate.Client.Models.TokenizeResult.StopwordConfig.get -> Weaviate.Client.Models.StopwordConfig? +Weaviate.Client.Models.TokenizeResult.StopwordConfig.init -> void +Weaviate.Client.Models.TokenizeResult.Equals(Weaviate.Client.Models.TokenizeResult? other) -> bool +Weaviate.Client.Models.TokenizeResult.$() -> Weaviate.Client.Models.TokenizeResult! +override Weaviate.Client.Models.TokenizeResult.Equals(object? obj) -> bool +override Weaviate.Client.Models.TokenizeResult.GetHashCode() -> int +override Weaviate.Client.Models.TokenizeResult.ToString() -> string! +static Weaviate.Client.Models.TokenizeResult.operator !=(Weaviate.Client.Models.TokenizeResult? left, Weaviate.Client.Models.TokenizeResult? 
right) -> bool +static Weaviate.Client.Models.TokenizeResult.operator ==(Weaviate.Client.Models.TokenizeResult? left, Weaviate.Client.Models.TokenizeResult? right) -> bool diff --git a/src/Weaviate.Client/Rest/Endpoints.cs b/src/Weaviate.Client/Rest/Endpoints.cs index d52c1f12..2e1a32dc 100644 --- a/src/Weaviate.Client/Rest/Endpoints.cs +++ b/src/Weaviate.Client/Rest/Endpoints.cs @@ -208,6 +208,21 @@ internal static string Nodes(string? collection, string verbosity) return path; } + /// + /// Path for the generic tokenize endpoint. + /// + /// The string + internal static string Tokenize() => "tokenize"; + + /// + /// Path for the per-property tokenize endpoint. + /// + /// The collection (class) name. + /// The property name. + /// The string + internal static string TokenizeProperty(string className, string propertyName) => + $"schema/{className}/properties/{propertyName}/tokenize"; + // Well-known endpoints /// /// Wells the known live diff --git a/src/Weaviate.Client/Rest/Tokenize.cs b/src/Weaviate.Client/Rest/Tokenize.cs new file mode 100644 index 00000000..ac3e4da8 --- /dev/null +++ b/src/Weaviate.Client/Rest/Tokenize.cs @@ -0,0 +1,54 @@ +using System.Net; +using System.Net.Http.Json; +using Weaviate.Client.Rest.Dto; + +namespace Weaviate.Client.Rest; + +internal partial class WeaviateRestClient +{ + /// + /// Calls POST /v1/tokenize. + /// + internal async Task Tokenize( + TokenizeRequest request, + CancellationToken cancellationToken = default + ) + { + var response = await _httpClient.PostAsJsonAsync( + WeaviateEndpoints.Tokenize(), + request, + options: RestJsonSerializerOptions, + cancellationToken: cancellationToken + ); + + await response.ManageStatusCode([HttpStatusCode.OK], "tokenize"); + + return await response.DecodeAsync(cancellationToken); + } + + /// + /// Calls POST /v1/schema/{className}/properties/{propertyName}/tokenize. 
+ /// + internal async Task TokenizeProperty( + string className, + string propertyName, + PropertyTokenizeRequest request, + CancellationToken cancellationToken = default + ) + { + var response = await _httpClient.PostAsJsonAsync( + WeaviateEndpoints.TokenizeProperty(className, propertyName), + request, + options: RestJsonSerializerOptions, + cancellationToken: cancellationToken + ); + + await response.ManageStatusCode( + [HttpStatusCode.OK], + "tokenize property", + ResourceType.Property + ); + + return await response.DecodeAsync(cancellationToken); + } +} diff --git a/src/Weaviate.Client/TokenizeClient.cs b/src/Weaviate.Client/TokenizeClient.cs new file mode 100644 index 00000000..16b7f01e --- /dev/null +++ b/src/Weaviate.Client/TokenizeClient.cs @@ -0,0 +1,67 @@ +using Weaviate.Client.Models; + +namespace Weaviate.Client; + +/// +/// Exposes the /v1/tokenize endpoint for inspecting how text is tokenized +/// with a given tokenization method and analyzer configuration. +/// Requires Weaviate server version 1.37.0 or later. +/// +public sealed class TokenizeClient +{ + private readonly WeaviateClient _client; + + internal TokenizeClient(WeaviateClient client) + { + _client = client; + } + + /// + /// Tokenizes using the given strategy. + /// Returns the indexed and query forms produced by the server, plus the analyzer/stopword + /// configurations that were applied. + /// + /// The text to tokenize. + /// The tokenization method to apply. + /// Optional text analyzer configuration (e.g. ASCII folding, stopword preset). + /// + /// Optional named stopword configurations. Each key is a preset name that can be referenced by + /// . Each value is a . + /// + /// Cancellation token. + /// + /// Thrown when the connected server version is below 1.37.0. + /// + [RequiresWeaviateVersion(1, 37, 0)] + public async Task Text( + string text, + PropertyTokenization tokenization, + TokenizeAnalyzerConfig? analyzerConfig = null, + IDictionary? 
stopwordPresets = null, + CancellationToken cancellationToken = default + ) + { + ArgumentNullException.ThrowIfNull(text); + + await _client.EnsureVersion(); + + var request = new Rest.Dto.TokenizeRequest + { + Text = text, + Tokenization = tokenization.ToDto(), + AnalyzerConfig = analyzerConfig.ToDto(), + StopwordPresets = stopwordPresets?.ToDictionary( + kvp => kvp.Key, + kvp => kvp.Value.ToDto() + ), + }; + + var response = + await _client.RestClient.Tokenize(request, cancellationToken) + ?? throw new WeaviateClientException( + new InvalidOperationException("Tokenize endpoint returned an empty response.") + ); + + return response.ToModel(); + } +} diff --git a/src/Weaviate.Client/WeaviateClient.cs b/src/Weaviate.Client/WeaviateClient.cs index 29b7fbaa..88b59cb7 100644 --- a/src/Weaviate.Client/WeaviateClient.cs +++ b/src/Weaviate.Client/WeaviateClient.cs @@ -193,6 +193,12 @@ private CancellationToken CreateInitCancellationToken(CancellationToken userToke /// public GroupsClient Groups { get; } + /// + /// Gets the tokenize client for inspecting how text is tokenized by the server. + /// Requires Weaviate server version 1.37.0 or later. 
+ /// + public TokenizeClient Tokenize { get; } + /// /// Ises the weaviate domain using the specified url /// @@ -227,6 +233,7 @@ internal WeaviateClient(ClientConfiguration configuration, ILogger @@ -267,6 +274,7 @@ internal WeaviateClient( Users = new UsersClient(this); Roles = new RolesClient(this); Groups = new GroupsClient(this); + Tokenize = new TokenizeClient(this); } /// @@ -322,6 +330,7 @@ public WeaviateClient( Users = new UsersClient(this); Roles = new RolesClient(this); Groups = new GroupsClient(this); + Tokenize = new TokenizeClient(this); } /// From b939a9a65bf601264cd7aa0d203819b7f4c7564d Mon Sep 17 00:00:00 2001 From: Michelangelo Partipilo Date: Tue, 21 Apr 2026 19:46:56 +0200 Subject: [PATCH 2/4] docs: add tokenize API usage guide - New docs/TOKENIZE_API_USAGE.md covers both `client.Tokenize.Text` and `collection.Tokenize.Property`, analyzer config (ASCII folding, stopwords), the result shape, and common usage patterns. - Link the guide from README under "Additional Guides". - Add an "Unreleased" CHANGELOG entry for the tokenize endpoints. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 4 + README.md | 1 + docs/TOKENIZE_API_USAGE.md | 277 +++++++++++++++++++++++++++++++++++++ 3 files changed, 282 insertions(+) create mode 100644 docs/TOKENIZE_API_USAGE.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a2e7ac1..f205612c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +#### Tokenization +- **Tokenize Endpoints** ([#329](https://github.com/weaviate/csharp-client/pull/329)): Expose the `POST /v1/tokenize` and `POST /v1/schema/{class}/properties/{prop}/tokenize` endpoints introduced in Weaviate 1.37.0. Inspect how text is tokenized for a given method and analyzer configuration, or how a specific collection property would tokenize it. Access via `client.Tokenize.Text(...)` and `collection.Tokenize.Property(...)`. 
`AsciiFoldConfig` is modeled as a nullable record so the invalid "ignore without fold" state is unrepresentable. See [TOKENIZE_API_USAGE.md](docs/TOKENIZE_API_USAGE.md). Requires Weaviate ≥ 1.37.0. --- diff --git a/README.md b/README.md index aa3f4893..30232f9c 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,7 @@ For more detailed information on specific features, please refer to the official - **[Backup API Usage](docs/BACKUP_API_USAGE.md)**: Creating and restoring backups - **[Nodes API Usage](docs/NODES_API_USAGE.md)**: Querying cluster node information - **[Aggregate Result Accessors](docs/AGGREGATE_RESULT_ACCESSORS.md)**: Type-safe access to aggregation results +- **[Tokenize API Usage](docs/TOKENIZE_API_USAGE.md)**: Inspect how text is tokenized with a given method or for a specific collection property. Requires Weaviate ≥ 1.37.0. - **[Microsoft.Extensions.VectorData Integration](docs/VECTORDATA.md)**: Standard .NET vector store abstraction support --- diff --git a/docs/TOKENIZE_API_USAGE.md b/docs/TOKENIZE_API_USAGE.md new file mode 100644 index 00000000..624e1692 --- /dev/null +++ b/docs/TOKENIZE_API_USAGE.md @@ -0,0 +1,277 @@ +# Tokenize API Usage Guide + +> **Version Requirement:** +> The tokenize endpoints require Weaviate **v1.37.0** or newer. Calls against earlier versions throw `WeaviateVersionMismatchException`. + +This guide covers the Weaviate C# client's tokenize API — a pair of endpoints that let you inspect how the server would tokenize a piece of text, either with an ad-hoc tokenization strategy or using the one already configured on a collection property. 
+ +## Table of Contents + +- [Overview](#overview) +- [Tokenization Methods](#tokenization-methods) +- [Ad-hoc Tokenization (`client.Tokenize.Text`)](#ad-hoc-tokenization-clienttokenizetext) +- [Property-scoped Tokenization (`collection.Tokenize.Property`)](#property-scoped-tokenization-collectiontokenizeproperty) +- [Analyzer Configuration](#analyzer-configuration) +- [Stopwords](#stopwords) +- [Result Shape](#result-shape) +- [Common Patterns](#common-patterns) + +## Overview + +The tokenize API exposes two REST endpoints: + +| Method | Endpoint | Use when… | +|---|---|---| +| `client.Tokenize.Text(...)` | `POST /v1/tokenize` | You want to preview tokenization for arbitrary text with any method/config — no collection required. | +| `collection.Tokenize.Property(...)` | `POST /v1/schema/{class}/properties/{prop}/tokenize` | You want to tokenize text *exactly as it would be indexed* by a specific property of an existing collection. | + +Both return a `TokenizeResult` containing two token lists: + +- **`Indexed`** — tokens as they are stored in the inverted index. +- **`Query`** — tokens as they are used for query matching (after stopword removal, etc.). + +These differ when stopwords are configured: a stopword like `"the"` is still indexed (so `BM25` can count it), but dropped from `Query` so it doesn't inflate match scores. 
+ +## Tokenization Methods + +The `PropertyTokenization` enum covers all nine server-supported strategies: + +| Method | Input | Output (`Indexed`) | +|---|---|---| +| `Word` | `"The quick brown fox"` | `["the", "quick", "brown", "fox"]` | +| `Lowercase` | `"Hello World Test"` | `["hello", "world", "test"]` | +| `Whitespace` | `"Hello World Test"` | `["Hello", "World", "Test"]` | +| `Field` | `" Hello World "` | `["Hello World"]` *(entire field, trimmed)* | +| `Trigram` | `"Hello"` | `["hel", "ell", "llo"]` | +| `Gse` | Chinese/Japanese | Requires `ENABLE_TOKENIZER_GSE=true` on the server | +| `GseCh` | Chinese-only GSE | Requires `ENABLE_TOKENIZER_GSE_CH=true` | +| `KagomeJa` | Japanese | Requires `ENABLE_TOKENIZER_KAGOME_JA=true` | +| `KagomeKr` | Korean | Requires `ENABLE_TOKENIZER_KAGOME_KR=true` | + +## Ad-hoc Tokenization (`client.Tokenize.Text`) + +The simplest call takes only a text and a tokenization method: + +```csharp +using Weaviate.Client.Models; + +var result = await client.Tokenize.Text( + text: "The quick brown fox", + tokenization: PropertyTokenization.Word +); + +Console.WriteLine(string.Join(", ", result.Indexed)); +// the, quick, brown, fox +``` + +Signature: + +```csharp +Task Tokenize.Text( + string text, + PropertyTokenization tokenization, + TokenizeAnalyzerConfig? analyzerConfig = null, + IDictionary? 
stopwordPresets = null, + CancellationToken cancellationToken = default +); +``` + +## Property-scoped Tokenization (`collection.Tokenize.Property`) + +When you want to see how a specific property would tokenize text — using that property's configured tokenization — use the collection-scoped variant: + +```csharp +var collection = await client.Collections.Get("Article"); + +var result = await collection.Tokenize.Property( + propertyName: "title", + text: " Hello World " +); + +Console.WriteLine(result.Tokenization); // Field (whatever the property is configured with) +Console.WriteLine(string.Join(", ", result.Indexed)); // Hello World +``` + +The server uses the property's configured tokenization method and any analyzer config attached to the property — you don't pass either yourself. + +## Analyzer Configuration + +`TokenizeAnalyzerConfig` controls two optional analyzer stages: **ASCII folding** and **stopword removal**. + +### ASCII Folding + +`AsciiFoldConfig` is a nullable record — `null` means folding is disabled, non-`null` means it's enabled. The `Ignore` list lets you exempt specific characters from folding. + +```csharp +var cfg = new TokenizeAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(), // folding enabled, nothing ignored +}; + +var result = await client.Tokenize.Text( + "L'école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg +); +// result.Indexed == ["l", "ecole", "est", "fermee"] +``` + +Ignore a specific character: + +```csharp +var cfg = new TokenizeAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), +}; + +var result = await client.Tokenize.Text( + "L'école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg +); +// result.Indexed == ["l", "école", "est", "fermée"] +``` + +> **Tip:** Modeling `AsciiFold` as a nullable record makes the "ignore without fold" state unrepresentable — you can't accidentally pass `Ignore` without enabling folding. 
+ +### Stopwords + +Use a built-in preset (`"en"`, `"none"`) via the `StopwordPreset` field: + +```csharp +var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en" }; + +var result = await client.Tokenize.Text( + "The quick brown fox", + PropertyTokenization.Word, + analyzerConfig: cfg +); + +// result.Indexed → ["the", "quick", "brown", "fox"] (all tokens kept in index) +// result.Query → ["quick", "brown", "fox"] ("the" removed for queries) +``` + +## Stopwords + +For more control, define a named preset via the `stopwordPresets` dictionary and reference it from `StopwordPreset`. + +### Add words to a preset + +```csharp +var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "custom" }; + +var presets = new Dictionary +{ + ["custom"] = new StopwordConfig + { + Preset = StopwordConfig.Presets.None, + Additions = ["test"], + }, +}; + +var result = await client.Tokenize.Text( + "hello world test", + PropertyTokenization.Word, + analyzerConfig: cfg, + stopwordPresets: presets +); + +// result.Indexed → ["hello", "world", "test"] +// result.Query → ["hello", "world"] ("test" dropped) +``` + +### Start from a base preset and remove words + +```csharp +var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en-no-the" }; + +var presets = new Dictionary +{ + ["en-no-the"] = new StopwordConfig + { + Preset = StopwordConfig.Presets.EN, + Removals = ["the"], + }, +}; + +var result = await client.Tokenize.Text( + "the quick", + PropertyTokenization.Word, + analyzerConfig: cfg, + stopwordPresets: presets +); + +// "the" is no longer a stopword in this preset, so it survives in both lists. 
+``` + +### Combining folding and stopwords + +```csharp +var cfg = new TokenizeAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + StopwordPreset = "en", +}; + +var result = await client.Tokenize.Text( + "The école est fermée", + PropertyTokenization.Word, + analyzerConfig: cfg +); + +// result.Indexed → ["the", "école", "est", "fermee"] +// result.Query → ["école", "est", "fermee"] ("the" dropped) +``` + +## Result Shape + +`TokenizeResult` is a sealed record: + +| Member | Type | Description | +|---|---|---| +| `Tokenization` | `PropertyTokenization` | The method that was applied. | +| `Indexed` | `ImmutableList` | Tokens as stored in the inverted index. | +| `Query` | `ImmutableList` | Tokens used at query time (after stopword removal). | +| `AnalyzerConfig` | `TokenizeAnalyzerConfig?` | Echo of the analyzer config that was applied, or `null`. | +| `StopwordConfig` | `StopwordConfig?` | Echo of the resolved stopword config, or `null`. | + +The `AnalyzerConfig` echo is the server's view of what was applied — useful for verifying that your config was parsed correctly. The round-trip also normalizes wire-format quirks (the server represents `asciiFold` as a `bool` + separate `asciiFoldIgnore[]`, but the client unwraps it back into the nested `AsciiFoldConfig` record). 
+ +## Common Patterns + +### Previewing a query + +Use `collection.Tokenize.Property` to see exactly what tokens the server will match your search against: + +```csharp +var tokens = (await collection.Tokenize.Property("title", userQuery)).Query; +// Show tokens in the UI as "searching for: X, Y, Z" +``` + +### Debugging a BM25 miss + +If a search misses a term you expected, tokenize both the query and a sample document with the same property: + +```csharp +var queryTokens = (await collection.Tokenize.Property("body", "running")).Query; +var docTokens = (await collection.Tokenize.Property("body", "I was running")).Indexed; + +// If the sets don't intersect, BM25 can't match — check for stemming / stopwords. +``` + +### Verifying analyzer config round-trip + +When you configure ASCII folding or a stopword preset, the server echoes back its interpretation on every call: + +```csharp +var cfg = new TokenizeAnalyzerConfig +{ + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + StopwordPreset = "en", +}; + +var result = await client.Tokenize.Text("L'école", PropertyTokenization.Word, analyzerConfig: cfg); + +Debug.Assert(result.AnalyzerConfig!.AsciiFold!.Ignore!.SequenceEqual(new[] { "é" })); +Debug.Assert(result.AnalyzerConfig.StopwordPreset == "en"); +``` From 45b6517dd42925ea19dba8dd00a405c28094a95e Mon Sep 17 00:00:00 2001 From: Michelangelo Partipilo Date: Tue, 21 Apr 2026 22:54:52 +0200 Subject: [PATCH 3/4] feat: property-level TextAnalyzer + collection-level StopwordPresets Port weaviate-python-client PR #2006 on top of the tokenize-endpoint stack for Weaviate 1.37.0: - Property.TextAnalyzer: pin ASCII folding and stopword preset per property at index time. Reuses the TextAnalyzerConfig record already introduced for /v1/tokenize so tokenize-at-query and index-at-insert stay aligned. Propagates through nested properties via Property-> NestedProperties recursion. 
- InvertedIndexConfig.StopwordPresets: named preset->word-list map on the collection inverted-index config. Properties reference presets via TextAnalyzer.StopwordPreset. Round-trips through create + update. - InvertedIndexConfigUpdate.StopwordPresets: mirrors the set accessor on the update wrapper so c.InvertedIndexConfig.StopwordPresets = ... works inside collection.Config.Update(...). - Preflight in CollectionsClient.Create: detects either feature in the incoming schema and throws WeaviateVersionMismatchException when the connected server is older than 1.37.0, before any REST call. - Rename TokenizeAnalyzerConfig -> TextAnalyzerConfig: same shape now serves both the tokenize endpoint and the property-level analyzer, matching the server type name and Python naming. - Integration tests in TestCollectionTextAnalyzer.cs cover preset round-trip, update, referenced-removal rejection, ascii-fold combos, and version-gate behaviour. - CHANGELOG + docs/TOKENIZE_API_USAGE.md extended with worked examples for the schema-side analyzer and stopword presets. 
Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 2 + docs/TOKENIZE_API_USAGE.md | 95 +++- .../Integration/TestCollectionTextAnalyzer.cs | 462 ++++++++++++++++++ .../Integration/TestTokenize.cs | 14 +- src/Weaviate.Client/CollectionsClient.cs | 55 +++ src/Weaviate.Client/Extensions.cs | 2 + .../Models/Collection.Update.cs | 11 + src/Weaviate.Client/Models/Extensions.cs | 2 + .../Models/InvertedIndexConfig.cs | 41 ++ src/Weaviate.Client/Models/Property.cs | 12 + src/Weaviate.Client/Models/Tokenize.cs | 14 +- src/Weaviate.Client/PublicAPI.Unshipped.txt | 36 +- src/Weaviate.Client/Rest/Dto/Extensions.cs | 2 + src/Weaviate.Client/TokenizeClient.cs | 4 +- 14 files changed, 711 insertions(+), 41 deletions(-) create mode 100644 src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs diff --git a/CHANGELOG.md b/CHANGELOG.md index f205612c..48fda1f0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 #### Tokenization - **Tokenize Endpoints** ([#329](https://github.com/weaviate/csharp-client/pull/329)): Expose the `POST /v1/tokenize` and `POST /v1/schema/{class}/properties/{prop}/tokenize` endpoints introduced in Weaviate 1.37.0. Inspect how text is tokenized for a given method and analyzer configuration, or how a specific collection property would tokenize it. Access via `client.Tokenize.Text(...)` and `collection.Tokenize.Property(...)`. `AsciiFoldConfig` is modeled as a nullable record so the invalid "ignore without fold" state is unrepresentable. See [TOKENIZE_API_USAGE.md](docs/TOKENIZE_API_USAGE.md). Requires Weaviate ≥ 1.37.0. +- **Property-Level `TextAnalyzerConfig`** ([#329](https://github.com/weaviate/csharp-client/pull/329)): `Property.TextAnalyzer` (also applies to nested properties) lets a collection schema pin ASCII folding and/or a stopword preset per property at index time. 
The same `TextAnalyzerConfig` record is reused from the `Tokenize` endpoint so tokenize-at-query and index-at-insert stay aligned. A preflight version check on `CollectionsClient.Create` raises `WeaviateVersionMismatchException` when the server is older than 1.37.0. Requires Weaviate ≥ 1.37.0. +- **Collection-Level `StopwordPresets`** ([#329](https://github.com/weaviate/csharp-client/pull/329)): `InvertedIndexConfig.StopwordPresets` and `InvertedIndexConfigUpdate.StopwordPresets` define named preset name → word-list maps on the inverted-index config. Properties reference these presets via `TextAnalyzer.StopwordPreset`. Preset changes flow through `CollectionClient.Config.Update(c => c.InvertedIndexConfig.StopwordPresets = ...)`. Requires Weaviate ≥ 1.37.0. --- diff --git a/docs/TOKENIZE_API_USAGE.md b/docs/TOKENIZE_API_USAGE.md index 624e1692..08bcb8a8 100644 --- a/docs/TOKENIZE_API_USAGE.md +++ b/docs/TOKENIZE_API_USAGE.md @@ -14,6 +14,8 @@ This guide covers the Weaviate C# client's tokenize API — a pair of endpoints - [Analyzer Configuration](#analyzer-configuration) - [Stopwords](#stopwords) - [Result Shape](#result-shape) +- [Property-level Text Analyzer (schema)](#property-level-text-analyzer-schema) +- [Collection-level Stopword Presets (schema)](#collection-level-stopword-presets-schema) - [Common Patterns](#common-patterns) ## Overview @@ -70,7 +72,7 @@ Signature: Task Tokenize.Text( string text, PropertyTokenization tokenization, - TokenizeAnalyzerConfig? analyzerConfig = null, + TextAnalyzerConfig? analyzerConfig = null, IDictionary? stopwordPresets = null, CancellationToken cancellationToken = default ); @@ -96,14 +98,14 @@ The server uses the property's configured tokenization method and any analyzer c ## Analyzer Configuration -`TokenizeAnalyzerConfig` controls two optional analyzer stages: **ASCII folding** and **stopword removal**. +`TextAnalyzerConfig` controls two optional analyzer stages: **ASCII folding** and **stopword removal**. 
### ASCII Folding `AsciiFoldConfig` is a nullable record — `null` means folding is disabled, non-`null` means it's enabled. The `Ignore` list lets you exempt specific characters from folding. ```csharp -var cfg = new TokenizeAnalyzerConfig +var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(), // folding enabled, nothing ignored }; @@ -119,7 +121,7 @@ var result = await client.Tokenize.Text( Ignore a specific character: ```csharp -var cfg = new TokenizeAnalyzerConfig +var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), }; @@ -139,7 +141,7 @@ var result = await client.Tokenize.Text( Use a built-in preset (`"en"`, `"none"`) via the `StopwordPreset` field: ```csharp -var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en" }; +var cfg = new TextAnalyzerConfig { StopwordPreset = "en" }; var result = await client.Tokenize.Text( "The quick brown fox", @@ -158,7 +160,7 @@ For more control, define a named preset via the `stopwordPresets` dictionary and ### Add words to a preset ```csharp -var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "custom" }; +var cfg = new TextAnalyzerConfig { StopwordPreset = "custom" }; var presets = new Dictionary { @@ -183,7 +185,7 @@ var result = await client.Tokenize.Text( ### Start from a base preset and remove words ```csharp -var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en-no-the" }; +var cfg = new TextAnalyzerConfig { StopwordPreset = "en-no-the" }; var presets = new Dictionary { @@ -207,7 +209,7 @@ var result = await client.Tokenize.Text( ### Combining folding and stopwords ```csharp -var cfg = new TokenizeAnalyzerConfig +var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), StopwordPreset = "en", @@ -232,11 +234,84 @@ var result = await client.Tokenize.Text( | `Tokenization` | `PropertyTokenization` | The method that was applied. | | `Indexed` | `ImmutableList` | Tokens as stored in the inverted index. 
| | `Query` | `ImmutableList` | Tokens used at query time (after stopword removal). | -| `AnalyzerConfig` | `TokenizeAnalyzerConfig?` | Echo of the analyzer config that was applied, or `null`. | +| `AnalyzerConfig` | `TextAnalyzerConfig?` | Echo of the analyzer config that was applied, or `null`. | | `StopwordConfig` | `StopwordConfig?` | Echo of the resolved stopword config, or `null`. | The `AnalyzerConfig` echo is the server's view of what was applied — useful for verifying that your config was parsed correctly. The round-trip also normalizes wire-format quirks (the server represents `asciiFold` as a `bool` + separate `asciiFoldIgnore[]`, but the client unwraps it back into the nested `AsciiFoldConfig` record). +## Property-level Text Analyzer (schema) + +Beyond the ad-hoc tokenize endpoint, Weaviate 1.37.0 also lets you pin analyzer options directly on a property at **collection-creation time**. The same `TextAnalyzerConfig` record is reused: whatever you would pass to `client.Tokenize.Text(...)` can also be attached to a property so every value indexed through that property gets the same treatment. + +```csharp +await client.Collections.Create(new CollectionCreateParams +{ + Name = "Article", + Properties = + [ + new Property + { + Name = "title", + DataType = [DataType.Text], + Tokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig + { + AsciiFold = new AsciiFoldConfig(), + StopwordPreset = "en", + }, + }, + ], +}); +``` + +Nested properties (object / object-array) accept `TextAnalyzer` too — they are `Property` records themselves, so the same field is available on every depth. + +> **Version requirement:** `Property.TextAnalyzer` is only wired up for servers at Weaviate ≥ 1.37.0. `CollectionsClient.Create` performs a preflight version check and throws `WeaviateVersionMismatchException` if the connected server is older, before the schema request is sent. 
+ +## Collection-level Stopword Presets (schema) + +Named stopword lists live on the collection's inverted-index config. A preset is a `preset-name → word-list` pair; properties reference one by name via `TextAnalyzer.StopwordPreset`. + +```csharp +await client.Collections.Create(new CollectionCreateParams +{ + Name = "Article", + InvertedIndexConfig = new InvertedIndexConfig + { + StopwordPresets = new Dictionary<string, IList<string>> + { + ["fr"] = new[] { "le", "la", "les" }, + ["custom_en"] = new[] { "foo", "bar" }, + }, + }, + Properties = + [ + new Property + { + Name = "body", + DataType = [DataType.Text], + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" }, + }, + ], +}); +``` + +Updating presets on an existing collection goes through the normal update path: + +```csharp +await collection.Config.Update(c => +{ + c.InvertedIndexConfig.StopwordPresets = new Dictionary<string, IList<string>> + { + ["fr"] = new[] { "le", "la", "les", "un", "une" }, + }; +}); +``` + +Setting `StopwordPresets` replaces the whole preset map on the server. The server rejects removing a preset that is still referenced by a property's `TextAnalyzer.StopwordPreset` — keep preset removals and property-config changes in the same update, or unwire the property first. + +> **Version requirement:** Requires Weaviate ≥ 1.37.0. The preflight in `CollectionsClient.Create` also trips on `InvertedIndexConfig.StopwordPresets` before contacting the server. + ## Common Patterns ### Previewing a query @@ -264,7 +339,7 @@ var docTokens = (await collection.Tokenize.Property("body", "I was running")). 
When you configure ASCII folding or a stopword preset, the server echoes back its interpretation on every call: ```csharp -var cfg = new TokenizeAnalyzerConfig +var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), StopwordPreset = "en", diff --git a/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs b/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs new file mode 100644 index 00000000..c9e59b37 --- /dev/null +++ b/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs @@ -0,0 +1,462 @@ +using Weaviate.Client.Models; + +namespace Weaviate.Client.Tests.Integration; + +/// +/// Integration tests for property-level textAnalyzer configuration and +/// collection-level stopwordPresets. Requires Weaviate ≥ 1.37.0. +/// Ports integration/test_collection_config.py from weaviate-python-client PR #2006. +/// +[Collection("TestCollectionTextAnalyzer")] +public class TestCollectionTextAnalyzer : IntegrationTests +{ + private const string MinVersion = "1.37.0"; + + // ----------------------------------------------------------------------- + // Collection-level stopwordPresets + // ----------------------------------------------------------------------- + + [Fact] + public async Task StopwordPresets_AppliedAndRoundTripped() + { + RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title_fr", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" }, + }, + new Property + { + Name = "title_en", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "en" }, + }, + new Property + { + Name = "plain", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + }, + ], + invertedIndexConfig: 
new() + { + StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le", "la", "les" }, + }, + } + ); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + + Assert.NotNull(config.InvertedIndexConfig); + Assert.NotNull(config.InvertedIndexConfig!.StopwordPresets); + Assert.Equal( + new[] { "le", "la", "les" }, + config.InvertedIndexConfig.StopwordPresets!["fr"] + ); + + var titleFr = config.Properties.Single(p => p.Name == "title_fr"); + var titleEn = config.Properties.Single(p => p.Name == "title_en"); + var plain = config.Properties.Single(p => p.Name == "plain"); + + Assert.NotNull(titleFr.TextAnalyzer); + Assert.Equal("fr", titleFr.TextAnalyzer!.StopwordPreset); + Assert.NotNull(titleEn.TextAnalyzer); + Assert.Equal("en", titleEn.TextAnalyzer!.StopwordPreset); + Assert.Null(plain.TextAnalyzer); + } + + [Fact] + public async Task StopwordPresets_Update_ReplacesPreset() + { + RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title_fr", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" }, + }, + ], + invertedIndexConfig: new() + { + StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le" }, + }, + } + ); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + Assert.Equal(new[] { "le" }, config.InvertedIndexConfig!.StopwordPresets!["fr"]); + + await collection.Config.Update( + c => + { + c.InvertedIndexConfig.StopwordPresets = new Dictionary> + { + ["fr"] = new List { "la" }, + }; + }, + TestContext.Current.CancellationToken + ); + + config = await collection.Config.Get(TestContext.Current.CancellationToken); + Assert.Equal(new[] { "la" }, config.InvertedIndexConfig!.StopwordPresets!["fr"]); + } + + [Fact] + public async Task 
StopwordPresets_RemoveInUse_RejectedByServer() + { + RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title_fr", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" }, + }, + ], + invertedIndexConfig: new() + { + StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le", "la", "les" }, + }, + } + ); + + await Assert.ThrowsAnyAsync(async () => + { + await collection.Config.Update( + c => + { + c.InvertedIndexConfig.StopwordPresets = new Dictionary>(); + }, + TestContext.Current.CancellationToken + ); + }); + + // The original preset must survive the rejected update. + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + Assert.Equal( + new[] { "le", "la", "les" }, + config.InvertedIndexConfig!.StopwordPresets!["fr"] + ); + } + + [Fact] + public async Task StopwordPresets_RemoveUnused_Allowed() + { + RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" }, + }, + ], + invertedIndexConfig: new() + { + StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le", "la", "les" }, + ["es"] = new List { "el", "la", "los" }, + }, + } + ); + + // Drop only 'es' (unused). 'fr' is still referenced by title. 
+ await collection.Config.Update( + c => + { + c.InvertedIndexConfig.StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le", "la", "les" }, + }; + }, + TestContext.Current.CancellationToken + ); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + Assert.Equal( + new[] { "le", "la", "les" }, + config.InvertedIndexConfig!.StopwordPresets!["fr"] + ); + Assert.False(config.InvertedIndexConfig.StopwordPresets.ContainsKey("es")); + } + + [Fact] + public async Task StopwordPresets_RemoveReferencedByNested_RejectedByServer() + { + RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "doc", + DataType = DataType.Object, + NestedProperties = + [ + new Property + { + Name = "body", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "fr" }, + }, + ], + }, + ], + invertedIndexConfig: new() + { + StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le", "la", "les" }, + }, + } + ); + + await Assert.ThrowsAnyAsync(async () => + { + await collection.Config.Update( + c => + { + c.InvertedIndexConfig.StopwordPresets = new Dictionary>(); + }, + TestContext.Current.CancellationToken + ); + }); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + Assert.Equal( + new[] { "le", "la", "les" }, + config.InvertedIndexConfig!.StopwordPresets!["fr"] + ); + } + + [Fact] + public async Task UserDefinedStopwordPreset_OverridesBuiltin() + { + RequireVersion(MinVersion, message: "stopwordPresets requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { StopwordPreset = "en" }, + }, + ], + invertedIndexConfig: 
new() + { + StopwordPresets = new Dictionary> + { + ["en"] = new List { "hello" }, + }, + } + ); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + Assert.Equal(new[] { "hello" }, config.InvertedIndexConfig!.StopwordPresets!["en"]); + + var title = config.Properties.Single(p => p.Name == "title"); + Assert.NotNull(title.TextAnalyzer); + Assert.Equal("en", title.TextAnalyzer!.StopwordPreset); + } + + // ----------------------------------------------------------------------- + // Property-level TextAnalyzer + // ----------------------------------------------------------------------- + + [Fact] + public async Task TextAnalyzer_CombinedAsciiFoldAndStopwordPreset() + { + RequireVersion(MinVersion, message: "textAnalyzer requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig + { + AsciiFold = new AsciiFoldConfig(), + StopwordPreset = "en", + }, + }, + ] + ); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + var title = config.Properties.Single(p => p.Name == "title"); + + Assert.NotNull(title.TextAnalyzer); + Assert.NotNull(title.TextAnalyzer!.AsciiFold); + Assert.Equal("en", title.TextAnalyzer.StopwordPreset); + } + + [Fact] + public async Task TextAnalyzer_AsciiFoldIgnore_RoundTrips() + { + RequireVersion(MinVersion, message: "textAnalyzer requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig + { + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + }, + }, + ] + ); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + var title = config.Properties.Single(p => p.Name == "title"); 
+ + Assert.NotNull(title.TextAnalyzer); + Assert.NotNull(title.TextAnalyzer!.AsciiFold); + Assert.Equal(new[] { "é" }, title.TextAnalyzer.AsciiFold!.Ignore); + } + + [Fact] + public async Task TextAnalyzer_FullRoundTrip_FromDictStyleConfig() + { + RequireVersion(MinVersion, message: "textAnalyzer requires Weaviate >= 1.37.0"); + + var collection = await CollectionFactory( + properties: + [ + new Property + { + Name = "title", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig + { + AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), + StopwordPreset = "fr", + }, + }, + ], + invertedIndexConfig: new() + { + Stopwords = new StopwordConfig + { + Preset = StopwordConfig.Presets.EN, + Additions = ["a"], + Removals = ["the"], + }, + StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le", "la", "les" }, + }, + } + ); + + var config = await collection.Config.Get(TestContext.Current.CancellationToken); + + Assert.Equal(StopwordConfig.Presets.EN, config.InvertedIndexConfig!.Stopwords!.Preset); + Assert.Equal(new[] { "the" }, config.InvertedIndexConfig.Stopwords.Removals); + Assert.Equal( + new[] { "le", "la", "les" }, + config.InvertedIndexConfig.StopwordPresets!["fr"] + ); + + var title = config.Properties.Single(p => p.Name == "title"); + Assert.NotNull(title.TextAnalyzer); + Assert.Equal("fr", title.TextAnalyzer!.StopwordPreset); + Assert.NotNull(title.TextAnalyzer.AsciiFold); + Assert.Equal(new[] { "é" }, title.TextAnalyzer.AsciiFold!.Ignore); + } + + // ----------------------------------------------------------------------- + // Version-gate + // ----------------------------------------------------------------------- + + [Fact] + public async Task Property_TextAnalyzer_RaisesOnOldServer() + { + if (ServerVersionIsInRange(MinVersion)) + { + Assert.Skip( + $"Version gate only applies to Weaviate < {MinVersion}. 
Current: {_weaviate.WeaviateVersion}" + ); + } + + await Assert.ThrowsAsync(async () => + { + await CollectionFactory( + properties: + [ + new Property + { + Name = "title", + DataType = DataType.Text, + PropertyTokenization = PropertyTokenization.Word, + TextAnalyzer = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig() }, + }, + ] + ); + }); + } + + [Fact] + public async Task InvertedIndexConfig_StopwordPresets_RaisesOnOldServer() + { + if (ServerVersionIsInRange(MinVersion)) + { + Assert.Skip( + $"Version gate only applies to Weaviate < {MinVersion}. Current: {_weaviate.WeaviateVersion}" + ); + } + + await Assert.ThrowsAsync(async () => + { + await CollectionFactory( + invertedIndexConfig: new() + { + StopwordPresets = new Dictionary> + { + ["fr"] = new List { "le", "la" }, + }, + } + ); + }); + } +} diff --git a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs index e2ed2de4..8c8b30a8 100644 --- a/src/Weaviate.Client.Tests/Integration/TestTokenize.cs +++ b/src/Weaviate.Client.Tests/Integration/TestTokenize.cs @@ -79,7 +79,7 @@ public async Task AsciiFold() { RequireVersion(nameof(TokenizeClient.Text)); - var cfg = new TokenizeAnalyzerConfig { AsciiFold = new AsciiFoldConfig() }; + var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig() }; var result = await _weaviate.Tokenize.Text( "L'école est fermée", PropertyTokenization.Word, @@ -95,7 +95,7 @@ public async Task AsciiFold_WithIgnore() { RequireVersion(nameof(TokenizeClient.Text)); - var cfg = new TokenizeAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]) }; + var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]) }; var result = await _weaviate.Tokenize.Text( "L'école est fermée", PropertyTokenization.Word, @@ -111,7 +111,7 @@ public async Task StopwordPreset_String() { RequireVersion(nameof(TokenizeClient.Text)); - var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en" }; + var cfg 
= new TextAnalyzerConfig { StopwordPreset = "en" }; var result = await _weaviate.Tokenize.Text( "The quick brown fox", PropertyTokenization.Word, @@ -128,7 +128,7 @@ public async Task Combined_AsciiFold_Stopwords() { RequireVersion(nameof(TokenizeClient.Text)); - var cfg = new TokenizeAnalyzerConfig + var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), StopwordPreset = "en", @@ -150,7 +150,7 @@ public async Task CustomPreset_Additions() { RequireVersion(nameof(TokenizeClient.Text)); - var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "custom" }; + var cfg = new TextAnalyzerConfig { StopwordPreset = "custom" }; var presets = new Dictionary { ["custom"] = new StopwordConfig @@ -177,7 +177,7 @@ public async Task CustomPreset_BaseAndRemovals() { RequireVersion(nameof(TokenizeClient.Text)); - var cfg = new TokenizeAnalyzerConfig { StopwordPreset = "en-no-the" }; + var cfg = new TextAnalyzerConfig { StopwordPreset = "en-no-the" }; var presets = new Dictionary { ["en-no-the"] = new StopwordConfig @@ -224,7 +224,7 @@ public async Task AnalyzerConfig_Echoed() { RequireVersion(nameof(TokenizeClient.Text)); - var cfg = new TokenizeAnalyzerConfig + var cfg = new TextAnalyzerConfig { AsciiFold = new AsciiFoldConfig(Ignore: ["é"]), StopwordPreset = "en", diff --git a/src/Weaviate.Client/CollectionsClient.cs b/src/Weaviate.Client/CollectionsClient.cs index faa1ec6a..063cbc91 100644 --- a/src/Weaviate.Client/CollectionsClient.cs +++ b/src/Weaviate.Client/CollectionsClient.cs @@ -197,6 +197,8 @@ public async Task Create( { ArgumentNullException.ThrowIfNull(collection); + await EnsureTextAnalyzerFeaturesSupported(collection); + var config = CollectionConfig.FromCollectionCreate(collection); var jsonString = JsonSerializer.Serialize( @@ -207,6 +209,59 @@ public async Task Create( return await CreateFromJson(jsonString, cancellationToken); } + private static readonly Version TextAnalyzerMinimumVersion = new(1, 37, 0); + + private async Task 
EnsureTextAnalyzerFeaturesSupported(CollectionCreateParams collection) + { + string? feature = DetectTextAnalyzerFeature(collection); + if (feature is null) + return; + + await _client.EnsureInitializedAsync(); + + var serverVersion = _client.WeaviateVersion; + if (serverVersion is null) + return; + + if (serverVersion < TextAnalyzerMinimumVersion) + { + throw new WeaviateVersionMismatchException( + feature, + TextAnalyzerMinimumVersion, + serverVersion + ); + } + } + + private static string? DetectTextAnalyzerFeature(CollectionCreateParams collection) + { + if (collection.InvertedIndexConfig?.StopwordPresets is { Count: > 0 }) + return "InvertedIndexConfig.StopwordPresets"; + + foreach (var property in collection.Properties) + { + if (PropertyUsesTextAnalyzer(property)) + return "Property.TextAnalyzer"; + } + + return null; + } + + private static bool PropertyUsesTextAnalyzer(Property property) + { + if (property.TextAnalyzer is not null) + return true; + if (property.NestedProperties is { } nested) + { + foreach (var np in nested) + { + if (PropertyUsesTextAnalyzer(np)) + return true; + } + } + return false; + } + /// /// Create a new typed collection from a json string. /// diff --git a/src/Weaviate.Client/Extensions.cs b/src/Weaviate.Client/Extensions.cs index a38b8296..7ae8ad8e 100644 --- a/src/Weaviate.Client/Extensions.cs +++ b/src/Weaviate.Client/Extensions.cs @@ -242,6 +242,7 @@ internal static Rest.Dto.Class ToDto(this CollectionConfig collection) ? collection.InvertedIndexConfig.IndexTimestamps : null, UsingBlockMaxWAND = collection.InvertedIndexConfig.UsingBlockMaxWAND, + StopwordPresets = collection.InvertedIndexConfig.StopwordPresets, }; } @@ -352,6 +353,7 @@ internal static CollectionConfigExport ToModel(this Rest.Dto.Class collection) ?? 
Weaviate.Client.Models.InvertedIndexConfig.Default.IndexTimestamps, UsingBlockMaxWAND = iic.UsingBlockMaxWAND, + StopwordPresets = iic.StopwordPresets, } : null; diff --git a/src/Weaviate.Client/Models/Collection.Update.cs b/src/Weaviate.Client/Models/Collection.Update.cs index 82e032b1..148114fa 100644 --- a/src/Weaviate.Client/Models/Collection.Update.cs +++ b/src/Weaviate.Client/Models/Collection.Update.cs @@ -123,6 +123,17 @@ public int CleanupIntervalSeconds /// public StopwordsConfigUpdate Stopwords => new(WrappedConfig.Stopwords ??= StopwordConfig.Default); + + /// + /// Gets or sets the named stopword presets defined at the collection level. + /// Setting this replaces the full preset map on the server. + /// Requires Weaviate ≥ 1.37.0. + /// + public IDictionary>? StopwordPresets + { + get => WrappedConfig.StopwordPresets; + set => WrappedConfig.StopwordPresets = value; + } } /// diff --git a/src/Weaviate.Client/Models/Extensions.cs b/src/Weaviate.Client/Models/Extensions.cs index 5a7db735..e79cde27 100644 --- a/src/Weaviate.Client/Models/Extensions.cs +++ b/src/Weaviate.Client/Models/Extensions.cs @@ -57,6 +57,7 @@ internal static Rest.Dto.NestedProperty ToNestedPropertyDto(this Property proper NestedProperties = property .NestedProperties?.Select(np => np.ToNestedPropertyDto()) .ToList(), + TextAnalyzer = property.TextAnalyzer.ToDto(), }; } @@ -122,6 +123,7 @@ internal static Rest.Dto.Property ToDto( .NestedProperties?.Select(np => np.ToNestedPropertyDto()) .ToList(), ModuleConfig = moduleConfig, + TextAnalyzer = property.TextAnalyzer.ToDto(), }; } } diff --git a/src/Weaviate.Client/Models/InvertedIndexConfig.cs b/src/Weaviate.Client/Models/InvertedIndexConfig.cs index 64e9c4fe..66e5292b 100644 --- a/src/Weaviate.Client/Models/InvertedIndexConfig.cs +++ b/src/Weaviate.Client/Models/InvertedIndexConfig.cs @@ -51,6 +51,14 @@ public record InvertedIndexConfig : IEquatable /// public bool? 
UsingBlockMaxWAND { get; set; } = null; + /// + /// Optional named stopword presets defined at the collection level. + /// Each entry is a preset name → list of stopwords. Individual properties + /// can reference a preset via . + /// Requires Weaviate ≥ 1.37.0. + /// + public IDictionary>? StopwordPresets { get; set; } = null; + /// /// Gets the hash code /// @@ -65,6 +73,15 @@ public override int GetHashCode() hash.Add(IndexTimestamps); hash.Add(Stopwords?.GetHashCode() ?? 0); hash.Add(UsingBlockMaxWAND); + if (StopwordPresets is not null) + { + foreach (var kvp in StopwordPresets.OrderBy(kvp => kvp.Key, StringComparer.Ordinal)) + { + hash.Add(kvp.Key); + foreach (var word in kvp.Value) + hash.Add(word); + } + } return hash.ToHashCode(); } @@ -106,6 +123,30 @@ UsingBlockMaxWAND is not null ) return false; + if (!StopwordPresetsEqual(StopwordPresets, other.StopwordPresets)) + return false; + + return true; + } + + private static bool StopwordPresetsEqual( + IDictionary>? a, + IDictionary>? b + ) + { + if (ReferenceEquals(a, b)) + return true; + if (a is null || b is null) + return false; + if (a.Count != b.Count) + return false; + foreach (var kvp in a) + { + if (!b.TryGetValue(kvp.Key, out var otherValue)) + return false; + if (!kvp.Value.SequenceEqual(otherValue)) + return false; + } return true; } } diff --git a/src/Weaviate.Client/Models/Property.cs b/src/Weaviate.Client/Models/Property.cs index 8dc2440d..7180559b 100644 --- a/src/Weaviate.Client/Models/Property.cs +++ b/src/Weaviate.Client/Models/Property.cs @@ -606,6 +606,13 @@ public required string Name /// public bool VectorizePropertyName { get; init; } = true; + /// + /// Optional property-level text analyzer configuration. When set, the property's + /// indexed and query tokens are post-processed according to the configured + /// ASCII-folding and stopword preset. Requires Weaviate ≥ 1.37.0. + /// + public TextAnalyzerConfig? 
TextAnalyzer { get; init; } + /// Gets a factory for creating text properties. public static PropertyFactory Text => PropertyHelper.Factory(DataType.Text); @@ -795,6 +802,7 @@ public override int GetHashCode() hash.Add(NestedProperties); hash.Add(SkipVectorization); hash.Add(VectorizePropertyName); + hash.Add(TextAnalyzer); return hash.ToHashCode(); } @@ -820,6 +828,10 @@ public virtual bool Equals(Property? other) && PropertyTokenization == other.PropertyTokenization && SkipVectorization == other.SkipVectorization && VectorizePropertyName == other.VectorizePropertyName + && EqualityComparer.Default.Equals( + TextAnalyzer, + other.TextAnalyzer + ) && ( (NestedProperties == null && other.NestedProperties == null) || ( diff --git a/src/Weaviate.Client/Models/Tokenize.cs b/src/Weaviate.Client/Models/Tokenize.cs index 3b10d137..f0695642 100644 --- a/src/Weaviate.Client/Models/Tokenize.cs +++ b/src/Weaviate.Client/Models/Tokenize.cs @@ -5,7 +5,7 @@ namespace Weaviate.Client.Models; /// /// ASCII-folding configuration: enables accent/diacritic folding, with an /// optional list of characters to exclude. When set on -/// , folding is applied; when +/// , folding is applied; when /// null, folding is disabled. /// /// @@ -17,7 +17,7 @@ public sealed record AsciiFoldConfig(IReadOnlyList? Ignore = null); /// Optional text-analyzer configuration for the tokenize endpoint. /// Mirrors the server's TextAnalyzerConfig. /// -public sealed record TokenizeAnalyzerConfig +public sealed record TextAnalyzerConfig { /// /// ASCII-folding configuration. When non-null, accent/diacritic marks are @@ -30,7 +30,7 @@ public sealed record TokenizeAnalyzerConfig /// /// Stopword preset name. May be a built-in preset ("en", "none") /// or the name of a custom preset provided via - /// 's + /// 's /// stopwordPresets dictionary. /// public string? StopwordPreset { get; init; } @@ -59,7 +59,7 @@ public sealed record TokenizeResult /// /// The text-analyzer configuration that was applied, if any. 
/// - public TokenizeAnalyzerConfig? AnalyzerConfig { get; init; } + public TextAnalyzerConfig? AnalyzerConfig { get; init; } /// /// The stopword configuration that was applied, if any. @@ -80,7 +80,7 @@ internal static PropertyTokenization ToTokenization(string? wireValue) => ? PropertyTokenization.Word : wireValue.FromEnumMemberString(); - internal static Rest.Dto.TextAnalyzerConfig? ToDto(this TokenizeAnalyzerConfig? config) => + internal static Rest.Dto.TextAnalyzerConfig? ToDto(this TextAnalyzerConfig? config) => config is null ? null : new Rest.Dto.TextAnalyzerConfig @@ -92,10 +92,10 @@ config is null StopwordPreset = config.StopwordPreset, }; - internal static TokenizeAnalyzerConfig? ToModel(this Rest.Dto.TextAnalyzerConfig? dto) => + internal static TextAnalyzerConfig? ToModel(this Rest.Dto.TextAnalyzerConfig? dto) => dto is null ? null - : new TokenizeAnalyzerConfig + : new TextAnalyzerConfig { AsciiFold = dto.AsciiFold == true diff --git a/src/Weaviate.Client/PublicAPI.Unshipped.txt b/src/Weaviate.Client/PublicAPI.Unshipped.txt index 453a24d9..557c9f65 100644 --- a/src/Weaviate.Client/PublicAPI.Unshipped.txt +++ b/src/Weaviate.Client/PublicAPI.Unshipped.txt @@ -6812,7 +6812,7 @@ Weaviate.Client.ClientConfigurationExtensions static Weaviate.Client.ClientConfigurationExtensions.WithIntegration(this Weaviate.Client.ClientConfiguration! config, string! integrationValue) -> Weaviate.Client.ClientConfiguration! static Weaviate.Client.WeaviateClientBuilderExtensions.WithIntegration(this Weaviate.Client.WeaviateClientBuilder! builder, string! integrationValue) -> Weaviate.Client.WeaviateClientBuilder! Weaviate.Client.TokenizeClient -Weaviate.Client.TokenizeClient.Text(string! text, Weaviate.Client.Models.PropertyTokenization tokenization, Weaviate.Client.Models.TokenizeAnalyzerConfig? analyzerConfig = null, System.Collections.Generic.IDictionary? 
stopwordPresets = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.Task! +Weaviate.Client.TokenizeClient.Text(string! text, Weaviate.Client.Models.PropertyTokenization tokenization, Weaviate.Client.Models.TextAnalyzerConfig? analyzerConfig = null, System.Collections.Generic.IDictionary? stopwordPresets = null, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.Task! Weaviate.Client.WeaviateClient.Tokenize.get -> Weaviate.Client.TokenizeClient! Weaviate.Client.CollectionTokenizeClient Weaviate.Client.CollectionTokenizeClient.Property(string! propertyName, string! text, System.Threading.CancellationToken cancellationToken = default(System.Threading.CancellationToken)) -> System.Threading.Tasks.Task! @@ -6829,19 +6829,19 @@ override Weaviate.Client.Models.AsciiFoldConfig.GetHashCode() -> int override Weaviate.Client.Models.AsciiFoldConfig.ToString() -> string! static Weaviate.Client.Models.AsciiFoldConfig.operator !=(Weaviate.Client.Models.AsciiFoldConfig? left, Weaviate.Client.Models.AsciiFoldConfig? right) -> bool static Weaviate.Client.Models.AsciiFoldConfig.operator ==(Weaviate.Client.Models.AsciiFoldConfig? left, Weaviate.Client.Models.AsciiFoldConfig? right) -> bool -Weaviate.Client.Models.TokenizeAnalyzerConfig -Weaviate.Client.Models.TokenizeAnalyzerConfig.TokenizeAnalyzerConfig() -> void -Weaviate.Client.Models.TokenizeAnalyzerConfig.AsciiFold.get -> Weaviate.Client.Models.AsciiFoldConfig? -Weaviate.Client.Models.TokenizeAnalyzerConfig.AsciiFold.init -> void -Weaviate.Client.Models.TokenizeAnalyzerConfig.StopwordPreset.get -> string? -Weaviate.Client.Models.TokenizeAnalyzerConfig.StopwordPreset.init -> void -Weaviate.Client.Models.TokenizeAnalyzerConfig.Equals(Weaviate.Client.Models.TokenizeAnalyzerConfig? 
other) -> bool -Weaviate.Client.Models.TokenizeAnalyzerConfig.$() -> Weaviate.Client.Models.TokenizeAnalyzerConfig! -override Weaviate.Client.Models.TokenizeAnalyzerConfig.Equals(object? obj) -> bool -override Weaviate.Client.Models.TokenizeAnalyzerConfig.GetHashCode() -> int -override Weaviate.Client.Models.TokenizeAnalyzerConfig.ToString() -> string! -static Weaviate.Client.Models.TokenizeAnalyzerConfig.operator !=(Weaviate.Client.Models.TokenizeAnalyzerConfig? left, Weaviate.Client.Models.TokenizeAnalyzerConfig? right) -> bool -static Weaviate.Client.Models.TokenizeAnalyzerConfig.operator ==(Weaviate.Client.Models.TokenizeAnalyzerConfig? left, Weaviate.Client.Models.TokenizeAnalyzerConfig? right) -> bool +Weaviate.Client.Models.TextAnalyzerConfig +Weaviate.Client.Models.TextAnalyzerConfig.TextAnalyzerConfig() -> void +Weaviate.Client.Models.TextAnalyzerConfig.AsciiFold.get -> Weaviate.Client.Models.AsciiFoldConfig? +Weaviate.Client.Models.TextAnalyzerConfig.AsciiFold.init -> void +Weaviate.Client.Models.TextAnalyzerConfig.StopwordPreset.get -> string? +Weaviate.Client.Models.TextAnalyzerConfig.StopwordPreset.init -> void +Weaviate.Client.Models.TextAnalyzerConfig.Equals(Weaviate.Client.Models.TextAnalyzerConfig? other) -> bool +Weaviate.Client.Models.TextAnalyzerConfig.$() -> Weaviate.Client.Models.TextAnalyzerConfig! +override Weaviate.Client.Models.TextAnalyzerConfig.Equals(object? obj) -> bool +override Weaviate.Client.Models.TextAnalyzerConfig.GetHashCode() -> int +override Weaviate.Client.Models.TextAnalyzerConfig.ToString() -> string! +static Weaviate.Client.Models.TextAnalyzerConfig.operator !=(Weaviate.Client.Models.TextAnalyzerConfig? left, Weaviate.Client.Models.TextAnalyzerConfig? right) -> bool +static Weaviate.Client.Models.TextAnalyzerConfig.operator ==(Weaviate.Client.Models.TextAnalyzerConfig? left, Weaviate.Client.Models.TextAnalyzerConfig? 
right) -> bool Weaviate.Client.Models.TokenizeResult Weaviate.Client.Models.TokenizeResult.TokenizeResult() -> void Weaviate.Client.Models.TokenizeResult.Tokenization.get -> Weaviate.Client.Models.PropertyTokenization @@ -6850,7 +6850,7 @@ Weaviate.Client.Models.TokenizeResult.Indexed.get -> System.Collections.Immutabl Weaviate.Client.Models.TokenizeResult.Indexed.init -> void Weaviate.Client.Models.TokenizeResult.Query.get -> System.Collections.Immutable.ImmutableList! Weaviate.Client.Models.TokenizeResult.Query.init -> void -Weaviate.Client.Models.TokenizeResult.AnalyzerConfig.get -> Weaviate.Client.Models.TokenizeAnalyzerConfig? +Weaviate.Client.Models.TokenizeResult.AnalyzerConfig.get -> Weaviate.Client.Models.TextAnalyzerConfig? Weaviate.Client.Models.TokenizeResult.AnalyzerConfig.init -> void Weaviate.Client.Models.TokenizeResult.StopwordConfig.get -> Weaviate.Client.Models.StopwordConfig? Weaviate.Client.Models.TokenizeResult.StopwordConfig.init -> void @@ -6861,3 +6861,9 @@ override Weaviate.Client.Models.TokenizeResult.GetHashCode() -> int override Weaviate.Client.Models.TokenizeResult.ToString() -> string! static Weaviate.Client.Models.TokenizeResult.operator !=(Weaviate.Client.Models.TokenizeResult? left, Weaviate.Client.Models.TokenizeResult? right) -> bool static Weaviate.Client.Models.TokenizeResult.operator ==(Weaviate.Client.Models.TokenizeResult? left, Weaviate.Client.Models.TokenizeResult? right) -> bool +Weaviate.Client.Models.Property.TextAnalyzer.get -> Weaviate.Client.Models.TextAnalyzerConfig? +Weaviate.Client.Models.Property.TextAnalyzer.init -> void +Weaviate.Client.Models.InvertedIndexConfig.StopwordPresets.get -> System.Collections.Generic.IDictionary!>? +Weaviate.Client.Models.InvertedIndexConfig.StopwordPresets.set -> void +Weaviate.Client.Models.InvertedIndexConfigUpdate.StopwordPresets.get -> System.Collections.Generic.IDictionary!>? 
+Weaviate.Client.Models.InvertedIndexConfigUpdate.StopwordPresets.set -> void diff --git a/src/Weaviate.Client/Rest/Dto/Extensions.cs b/src/Weaviate.Client/Rest/Dto/Extensions.cs index 0b915bd1..7caf7898 100644 --- a/src/Weaviate.Client/Rest/Dto/Extensions.cs +++ b/src/Weaviate.Client/Rest/Dto/Extensions.cs @@ -88,6 +88,7 @@ public Models.Property ToModel() IndexRangeFilters = IndexRangeFilters, PropertyTokenization = (Models.PropertyTokenization?)Tokenization, NestedProperties = NestedProperties?.Select(np => np.ToModel()).ToArray(), + TextAnalyzer = Weaviate.Client.Models.TokenizeMapping.ToModel(TextAnalyzer), }; } } @@ -150,6 +151,7 @@ public Models.Property ToModel() NestedProperties = NestedProperties?.Select(np => np.ToModel()).ToArray(), SkipVectorization = skipVectorization, VectorizePropertyName = vectorizePropertyName, + TextAnalyzer = Weaviate.Client.Models.TokenizeMapping.ToModel(TextAnalyzer), }; } diff --git a/src/Weaviate.Client/TokenizeClient.cs b/src/Weaviate.Client/TokenizeClient.cs index 16b7f01e..ffb4b933 100644 --- a/src/Weaviate.Client/TokenizeClient.cs +++ b/src/Weaviate.Client/TokenizeClient.cs @@ -26,7 +26,7 @@ internal TokenizeClient(WeaviateClient client) /// Optional text analyzer configuration (e.g. ASCII folding, stopword preset). /// /// Optional named stopword configurations. Each key is a preset name that can be referenced by - /// . Each value is a . + /// . Each value is a . /// /// Cancellation token. /// @@ -36,7 +36,7 @@ internal TokenizeClient(WeaviateClient client) public async Task Text( string text, PropertyTokenization tokenization, - TokenizeAnalyzerConfig? analyzerConfig = null, + TextAnalyzerConfig? analyzerConfig = null, IDictionary? 
stopwordPresets = null, CancellationToken cancellationToken = default ) From ec2054f0f736ae72b79755f10884026dcd106cf6 Mon Sep 17 00:00:00 2001 From: Michelangelo Partipilo Date: Wed, 22 Apr 2026 00:02:49 +0200 Subject: [PATCH 4/4] test: assert WeaviateUnprocessableEntityException for server-side StopwordPresets rejections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `StopwordPresets_RemoveInUse_RejectedByServer` and `StopwordPresets_RemoveReferencedByNested_RejectedByServer` tests expected `WeaviateClientException`, but the server returns HTTP 422 which the client maps to `WeaviateUnprocessableEntityException : WeaviateServerException`. The test names already indicate these are server-side rejections — align the assertions with the actual (and correct) exception type. Co-Authored-By: Claude Opus 4.7 --- .../Integration/TestCollectionTextAnalyzer.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs b/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs index c9e59b37..6619eb22 100644 --- a/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs +++ b/src/Weaviate.Client.Tests/Integration/TestCollectionTextAnalyzer.cs @@ -142,7 +142,7 @@ public async Task StopwordPresets_RemoveInUse_RejectedByServer() } ); - await Assert.ThrowsAnyAsync(async () => + await Assert.ThrowsAnyAsync(async () => { await collection.Config.Update( c => @@ -240,7 +240,7 @@ public async Task StopwordPresets_RemoveReferencedByNested_RejectedByServer() } ); - await Assert.ThrowsAnyAsync(async () => + await Assert.ThrowsAnyAsync(async () => { await collection.Config.Update( c =>