diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4bd1c5569..7808b5a7d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -92,7 +92,7 @@ jobs: fail-fast: false matrix: WEAVIATE_VERSION: - ["1.32.24", "1.33.11", "1.34.7", "1.35.2", "1.36.9", "1.37.0-rc.0"] + ["1.32.24", "1.33.11", "1.34.7", "1.35.2", "1.36.9", "1.37.1"] steps: - uses: actions/checkout@v4 diff --git a/src/it/java/io/weaviate/containers/Weaviate.java b/src/it/java/io/weaviate/containers/Weaviate.java index f0730fb4d..2e1305f90 100644 --- a/src/it/java/io/weaviate/containers/Weaviate.java +++ b/src/it/java/io/weaviate/containers/Weaviate.java @@ -45,7 +45,7 @@ public enum Version { V134(1, 34, 7), V135(1, 35, 2), V136(1, 36, 9), - V137(1, 37, "0-rc.0"); + V137(1, 37, 1); public final SemanticVersion semver; diff --git a/src/it/java/io/weaviate/integration/AggregationITest.java b/src/it/java/io/weaviate/integration/AggregationITest.java index 30df82fb5..f65ea1b2d 100644 --- a/src/it/java/io/weaviate/integration/AggregationITest.java +++ b/src/it/java/io/weaviate/integration/AggregationITest.java @@ -144,7 +144,7 @@ public void testNearVector_groupBy_category() { Assertions.assertThat(result) .extracting(AggregateResponseGrouped::groups) .asInstanceOf(InstanceOfAssertFactories.list(AggregateResponseGroup.class)) - .as("group per category").hasSize(3) + .as("group per category").hasSizeBetween(2, 3) // Should be 3 but can flake .allSatisfy(group -> { Assertions.assertThat(group) .extracting(AggregateResponseGroup::groupedBy) diff --git a/src/it/java/io/weaviate/integration/TokenizeITest.java b/src/it/java/io/weaviate/integration/TokenizeITest.java new file mode 100644 index 000000000..222958696 --- /dev/null +++ b/src/it/java/io/weaviate/integration/TokenizeITest.java @@ -0,0 +1,41 @@ +package io.weaviate.integration; + +import org.assertj.core.api.Assertions; +import org.junit.BeforeClass; +import org.junit.Test; + +import io.weaviate.ConcurrentTest; +import io.weaviate.client6.v1.api.WeaviateClient; +import io.weaviate.client6.v1.api.collections.Property; +import io.weaviate.client6.v1.api.collections.Tokenization; +import io.weaviate.containers.Container; +import io.weaviate.containers.Weaviate; + +public class TokenizeITest extends ConcurrentTest { + private static final WeaviateClient client = Container.WEAVIATE.getClient(); + + @BeforeClass + public static void __() { + Weaviate.Version.V137.orSkip(); + } + + @Test + public void testTokenize() throws Exception { + var nsWords = ns("Words"); + client.collections.create(nsWords, + c -> c.properties(Property.text("sentence", + p -> p.tokenization(Tokenization.TRIGRAM)))); + + var sentence = "hello world"; + + // Act + var custom = client.tokenize.text(sentence, + tok -> tok.tokenization(Tokenization.TRIGRAM)); + + var existing = client.tokenize.text(sentence, + nsWords, "sentence"); + + // Assert + Assertions.assertThat(existing).isEqualTo(custom); + } +} diff --git a/src/main/java/io/weaviate/client6/v1/api/WeaviateClient.java b/src/main/java/io/weaviate/client6/v1/api/WeaviateClient.java index 6150ab4d5..166b582f9 100644 --- a/src/main/java/io/weaviate/client6/v1/api/WeaviateClient.java +++ b/src/main/java/io/weaviate/client6/v1/api/WeaviateClient.java @@ -10,6 +10,7 @@ import io.weaviate.client6.v1.api.rbac.groups.WeaviateGroupsClient; import io.weaviate.client6.v1.api.rbac.roles.WeaviateRolesClient; import io.weaviate.client6.v1.api.rbac.users.WeaviateUsersClient; +import io.weaviate.client6.v1.api.tokenize.WeaviateTokenizeClient; import io.weaviate.client6.v1.internal.ObjectBuilder; import io.weaviate.client6.v1.internal.Timeout; import io.weaviate.client6.v1.internal.TokenProvider; @@ -62,6 +63,12 @@ public class WeaviateClient implements AutoCloseable { */ public final WeaviateClusterClient cluster; + /** + * Client for {@code /tokenize} and + * {@code /schema/{collection}/property/{property}/tokenize} endpoints. + */ + public final WeaviateTokenizeClient tokenize; + public WeaviateClient(Config config) { RestTransportOptions restOpt = config.restTransportOptions(); GrpcChannelOptions grpcOpt; @@ -117,6 +124,7 @@ public WeaviateClient(Config config) { this.grpcTransport = new DefaultGrpcTransport(grpcOpt); this.alias = new WeaviateAliasClient(restTransport); this.backup = new WeaviateBackupClient(restTransport); + this.tokenize = new WeaviateTokenizeClient(restTransport); this.collections = new WeaviateCollectionsClient(restTransport, grpcTransport); this.roles = new WeaviateRolesClient(restTransport); this.groups = new WeaviateGroupsClient(restTransport); diff --git a/src/main/java/io/weaviate/client6/v1/api/WeaviateClientAsync.java b/src/main/java/io/weaviate/client6/v1/api/WeaviateClientAsync.java index b38cd20d2..0992212ad 100644 --- a/src/main/java/io/weaviate/client6/v1/api/WeaviateClientAsync.java +++ b/src/main/java/io/weaviate/client6/v1/api/WeaviateClientAsync.java @@ -12,6 +12,7 @@ import io.weaviate.client6.v1.api.rbac.groups.WeaviateGroupsClientAsync; import io.weaviate.client6.v1.api.rbac.roles.WeaviateRolesClientAsync; import io.weaviate.client6.v1.api.rbac.users.WeaviateUsersClientAsync; +import io.weaviate.client6.v1.api.tokenize.WeaviateTokenizeClientAsync; import io.weaviate.client6.v1.internal.ObjectBuilder; import io.weaviate.client6.v1.internal.Timeout; import io.weaviate.client6.v1.internal.TokenProvider; @@ -61,6 +62,12 @@ public class WeaviateClientAsync implements AutoCloseable { */ public final WeaviateClusterClientAsync cluster; + /** + * Client for {@code /tokenize} and + * {@code /schema/{collection}/property/{property}/tokenize} endpoints. + */ + public final WeaviateTokenizeClientAsync tokenize; + /** * This constructor is blocking if {@link Authentication} configured, * as the client will need to do the initial token exchange. @@ -121,6 +128,7 @@ public WeaviateClientAsync(Config config) { this.grpcTransport = new DefaultGrpcTransport(grpcOpt); this.alias = new WeaviateAliasClientAsync(restTransport); this.backup = new WeaviateBackupClientAsync(restTransport); + this.tokenize = new WeaviateTokenizeClientAsync(restTransport); this.roles = new WeaviateRolesClientAsync(restTransport); this.groups = new WeaviateGroupsClientAsync(restTransport); this.users = new WeaviateUsersClientAsync(restTransport); diff --git a/src/main/java/io/weaviate/client6/v1/api/collections/InvertedIndex.java b/src/main/java/io/weaviate/client6/v1/api/collections/InvertedIndex.java index a350f3de0..e00c2c78c 100644 --- a/src/main/java/io/weaviate/client6/v1/api/collections/InvertedIndex.java +++ b/src/main/java/io/weaviate/client6/v1/api/collections/InvertedIndex.java @@ -1,7 +1,9 @@ package io.weaviate.client6.v1.api.collections; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.function.Function; import com.google.gson.annotations.SerializedName; @@ -15,6 +17,7 @@ public record InvertedIndex( @SerializedName("bm25") Bm25 bm25, /** Common words which should be ignored in queries. */ @SerializedName("stopwords") Stopwords stopwords, + @SerializedName("stopwordPresets") Map> stopwordPresets, /** * If true, indexes object creation and update timestamps, * enabling filtering by creationTimeUnix and lastUpdateTimeUnix. @@ -135,6 +138,7 @@ public InvertedIndex(Builder builder) { builder.cleanupIntervalSeconds, builder.bm25, builder.stopwords, + builder.stopwordPresets, builder.indexTimestamps, builder.indexNulls, builder.indexPropertyLength, @@ -145,6 +149,7 @@ public static class Builder implements ObjectBuilder { private Integer cleanupIntervalSeconds; private Bm25 bm25; private Stopwords stopwords; + private Map> stopwordPresets = new HashMap<>(); private Boolean indexTimestamps; private Boolean indexNulls; private Boolean indexPropertyLength; @@ -168,6 +173,12 @@ public Builder stopwords(Function> f return this; } + /** Supply custom stopword presets. */ + public Builder stopwordPresets(Map> stopwordPresets) { + this.stopwordPresets = stopwordPresets; + return this; + } + /** * Enable / disable creating an index for creation / update timestamps. * diff --git a/src/main/java/io/weaviate/client6/v1/api/collections/Property.java b/src/main/java/io/weaviate/client6/v1/api/collections/Property.java index 039e819f2..f4da24a04 100644 --- a/src/main/java/io/weaviate/client6/v1/api/collections/Property.java +++ b/src/main/java/io/weaviate/client6/v1/api/collections/Property.java @@ -18,6 +18,7 @@ public record Property( @SerializedName("indexRangeFilters") Boolean indexRangeFilters, @SerializedName("indexSearchable") Boolean indexSearchable, @SerializedName("tokenization") Tokenization tokenization, + @SerializedName("textAnalyzer") TextAnalyzer textAnalyzer, @SerializedName("skipVectorization") Boolean skipVectorization, @SerializedName("vectorizePropertyName") Boolean vectorizePropertyName, @SerializedName("nestedProperties") List nestedProperties) { @@ -407,6 +408,7 @@ public Property(Builder builder) { builder.indexRangeFilters, builder.indexSearchable, builder.tokenization, + builder.textAnalyzer, builder.skipVectorization, builder.vectorizePropertyName, builder.nestedProperties.isEmpty() ? null : builder.nestedProperties); @@ -435,6 +437,7 @@ public static class Builder implements ObjectBuilder { private Boolean indexRangeFilters; private Boolean indexSearchable; private Tokenization tokenization; + private TextAnalyzer textAnalyzer; private Boolean skipVectorization; private Boolean vectorizePropertyName; private List nestedProperties = new ArrayList<>(); @@ -555,6 +558,20 @@ public Builder tokenization(Tokenization tokenization) { return this; } + /** + * Configures per-property text analysis for {@code text} and {@code text[]} + * properties that use an inverted index (searchable or filterable). + * + *

+ * Supports ASCII folding (accent/diacritic handling) and selecting + * a stopword preset that overrides the collection-level + * {@code invertedIndexConfig.stopwords} setting for this property only. + */ + public Builder textAnalyzer(TextAnalyzer textAnalyzer) { + this.textAnalyzer = textAnalyzer; + return this; + } + public Builder skipVectorization(boolean skipVectorization) { this.skipVectorization = skipVectorization; return this; diff --git a/src/main/java/io/weaviate/client6/v1/api/collections/TextAnalyzer.java b/src/main/java/io/weaviate/client6/v1/api/collections/TextAnalyzer.java new file mode 100644 index 000000000..3265b63a5 --- /dev/null +++ b/src/main/java/io/weaviate/client6/v1/api/collections/TextAnalyzer.java @@ -0,0 +1,61 @@ +package io.weaviate.client6.v1.api.collections; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.function.Function; + +import com.google.gson.annotations.SerializedName; + +import io.weaviate.client6.v1.internal.ObjectBuilder; + +public record TextAnalyzer( + @SerializedName("ascii_fold") Boolean foldAscii, + @SerializedName("ascii_fold_ignore") List keepAscii, + @SerializedName("stopword_preset") String stopwordPreset) { + + public static TextAnalyzer of() { + return null; + } + + public static TextAnalyzer of(Function> fn) { + return fn.apply(new Builder()).build(); + } + + public TextAnalyzer(Builder builder) { + this( + builder.foldAscii, + builder.keepAscii, + builder.stopwordPreset); + } + + public static class Builder implements ObjectBuilder { + Boolean foldAscii = true; + List keepAscii = new ArrayList<>(); + String stopwordPreset; + + public Builder foldAscii(boolean enable) { + this.foldAscii = enable; + return this; + } + + public Builder keepAscii(String... keepAscii) { + return keepAscii(Arrays.asList(keepAscii)); + } + + public Builder keepAscii(List keepAscii) { + this.keepAscii = keepAscii; + return this; + } + + public Builder stopwordPreset(String stopwordPreset) { + this.stopwordPreset = stopwordPreset; + return this; + } + + @Override + public TextAnalyzer build() { + return new TextAnalyzer(this); + } + } +} diff --git a/src/main/java/io/weaviate/client6/v1/api/tokenize/TokenizeRequest.java b/src/main/java/io/weaviate/client6/v1/api/tokenize/TokenizeRequest.java new file mode 100644 index 000000000..85c19ee98 --- /dev/null +++ b/src/main/java/io/weaviate/client6/v1/api/tokenize/TokenizeRequest.java @@ -0,0 +1,132 @@ +package io.weaviate.client6.v1.api.tokenize; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + +import com.google.gson.annotations.SerializedName; + +import io.weaviate.client6.v1.api.collections.InvertedIndex.Stopwords; +import io.weaviate.client6.v1.api.collections.TextAnalyzer; +import io.weaviate.client6.v1.api.collections.Tokenization; +import io.weaviate.client6.v1.internal.ObjectBuilder; +import io.weaviate.client6.v1.internal.json.JSON; +import io.weaviate.client6.v1.internal.rest.Endpoint; +import io.weaviate.client6.v1.internal.rest.SimpleEndpoint; + +public class TokenizeRequest { + @SerializedName("text") + private final String text; + + // These two fields are passed as path parameters. + private final transient String collection; + private final transient String property; + + @SerializedName("tokenization") + private final Tokenization tokenization; + @SerializedName("analyzerConfig") + private final TextAnalyzer textAnalyzer; + @SerializedName("stopwords") + private final Stopwords stopwords; + @SerializedName("stopwordPresets") + private final Map> stopwordPresets; + + public TokenizeRequest(String text, String collection, String property) { + this.text = text; + this.collection = collection; + this.property = property; + this.tokenization = null; + this.textAnalyzer = null; + this.stopwords = null; + this.stopwordPresets = null; + } + + public TokenizeRequest( + String text, + Tokenization tokenization, + TextAnalyzer textAnalyzer, + Stopwords stopwords, + Map> stopwordPresets) { + this.text = text; + this.collection = null; + this.property = null; + this.tokenization = tokenization; + this.textAnalyzer = textAnalyzer; + this.stopwords = stopwords; + this.stopwordPresets = stopwordPresets; + } + + public final static Endpoint _ENDPOINT = new SimpleEndpoint<>( + __ -> "POST", + request -> request.collection != null + ? "/schema/" + request.collection + "/properties/" + request.property + "/tokenize" + : "/tokenize", + __ -> Collections.emptyMap(), + request -> JSON.serialize( + request.collection != null + ? Map.of("text", request.text) + : request), + (statusCode, response) -> JSON.deserialize(response, TokenizeResponse.class)); + + public static final TokenizeRequest of(String text, Function> fn) { + return fn.apply(new Builder(text)).build(); + } + + public TokenizeRequest(Builder builder) { + this( + builder.text, + builder.tokenization, + builder.textAnalyzer, + builder.stopwords, + builder.stopwordPresets); + } + + public static class Builder implements ObjectBuilder { + private final String text; + private Tokenization tokenization; + private TextAnalyzer textAnalyzer; + private Stopwords stopwords; + private Map> stopwordPresets = new HashMap<>(); + + /** Set tokenization strategy. */ + public Builder tokenization(Tokenization tokenization) { + this.tokenization = tokenization; + return this; + } + + /** Configure ASCII character folding. */ + public Builder textAnalyzer(TextAnalyzer textAnalyzer) { + this.textAnalyzer = textAnalyzer; + return this; + } + + /** + * Select a stopwords preset. Mutually exclusive with {@link #stopwordPresets}. + */ + public Builder stopwords(Stopwords stopwords) { + this.stopwords = stopwords; + this.stopwordPresets.clear(); + return this; + } + + /** + * Select multiple stopword presets. Mutually exclusive with {@link #stopwords}. + */ + public Builder stopwordPresets(Map> stopwordPresets) { + this.stopwords = null; + this.stopwordPresets = stopwordPresets; + return this; + } + + public Builder(String text) { + this.text = text; + } + + @Override + public TokenizeRequest build() { + return new TokenizeRequest(this); + } + } +} diff --git a/src/main/java/io/weaviate/client6/v1/api/tokenize/TokenizeResponse.java b/src/main/java/io/weaviate/client6/v1/api/tokenize/TokenizeResponse.java new file mode 100644 index 000000000..10ddf1d47 --- /dev/null +++ b/src/main/java/io/weaviate/client6/v1/api/tokenize/TokenizeResponse.java @@ -0,0 +1,13 @@ +package io.weaviate.client6.v1.api.tokenize; + +import java.util.List; + +import com.google.gson.annotations.SerializedName; + +import io.weaviate.client6.v1.api.collections.Tokenization; + +public record TokenizeResponse( + @SerializedName("tokenization") Tokenization tokenization, + @SerializedName("indexed") List indexed, + @SerializedName("query") List query) { +} diff --git a/src/main/java/io/weaviate/client6/v1/api/tokenize/WeaviateTokenizeClient.java b/src/main/java/io/weaviate/client6/v1/api/tokenize/WeaviateTokenizeClient.java new file mode 100644 index 000000000..e5ccd0384 --- /dev/null +++ b/src/main/java/io/weaviate/client6/v1/api/tokenize/WeaviateTokenizeClient.java @@ -0,0 +1,62 @@ +package io.weaviate.client6.v1.api.tokenize; + +import java.io.IOException; +import java.util.function.Function; + +import io.weaviate.client6.v1.api.WeaviateApiException; +import io.weaviate.client6.v1.internal.ObjectBuilder; +import io.weaviate.client6.v1.internal.rest.RestTransport; + +public class WeaviateTokenizeClient { + private final RestTransport restTransport; + + public WeaviateTokenizeClient(RestTransport restTransport) { + this.restTransport = restTransport; + } + + /** + * Tokenize a text string. + * + * @param text Input text string. + * @param collection Name of the reference collection. + * @param property Name of the property to source tokenization config from. + * @throws WeaviateApiException in case the server returned with an + * error status code. + * @throws IOException in case the request was not sent successfully + * due to a malformed request, a networking error + * or the server being unavailable. + */ + public TokenizeResponse text(String text, String collection, String property) throws IOException { + return text(new TokenizeRequest(text, collection, property)); + } + + /** + * Tokenize a text string. + * + * @param text Input text string. + * @param fn Lambda expression for optional stopwords. + * @throws WeaviateApiException in case the server returned with an + * error status code. + * @throws IOException in case the request was not sent successfully + * due to a malformed request, a networking error + * or the server being unavailable. + */ + public TokenizeResponse text(String text, Function> fn) + throws IOException { + return text(TokenizeRequest.of(text, fn)); + } + + /** + * Tokenize a text string. + * + * @param request Request body. + * @throws WeaviateApiException in case the server returned with an + * error status code. + * @throws IOException in case the request was not sent successfully + * due to a malformed request, a networking error + * or the server being unavailable. + */ + public TokenizeResponse text(TokenizeRequest request) throws IOException { + return this.restTransport.performRequest(request, TokenizeRequest._ENDPOINT); + } +} diff --git a/src/main/java/io/weaviate/client6/v1/api/tokenize/WeaviateTokenizeClientAsync.java b/src/main/java/io/weaviate/client6/v1/api/tokenize/WeaviateTokenizeClientAsync.java new file mode 100644 index 000000000..e69f50376 --- /dev/null +++ b/src/main/java/io/weaviate/client6/v1/api/tokenize/WeaviateTokenizeClientAsync.java @@ -0,0 +1,46 @@ +package io.weaviate.client6.v1.api.tokenize; + +import java.util.concurrent.CompletableFuture; +import java.util.function.Function; + +import io.weaviate.client6.v1.internal.ObjectBuilder; +import io.weaviate.client6.v1.internal.rest.RestTransport; + +public class WeaviateTokenizeClientAsync { + private final RestTransport restTransport; + + public WeaviateTokenizeClientAsync(RestTransport restTransport) { + this.restTransport = restTransport; + } + + /** + * Tokenize a text string. + * + * @param text Input text string. + * @param collection Name of the reference collection. + * @param property Name of the property to source tokenization config from. + */ + public CompletableFuture text(String text, String collection, String property) { + return text(new TokenizeRequest(text, collection, property)); + } + + /** + * Tokenize a text string. + * + * @param text Input text string. + * @param fn Lambda expression for optional stopwords. + */ + public CompletableFuture text(String text, + Function> fn) { + return text(TokenizeRequest.of(text, fn)); + } + + /** + * Tokenize a text string. + * + * @param request Request body. + */ + public CompletableFuture text(TokenizeRequest request) { + return this.restTransport.performRequestAsync(request, TokenizeRequest._ENDPOINT); + } +}