From 3624e8b571d8c388befc61e232e0cecff832b0f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Mon, 20 Apr 2026 15:52:55 +0100 Subject: [PATCH 01/14] refactor: tokenization executor and models to support stopword configurations and improve response handling --- integration/test_tokenize.py | 245 +++++++++++++++++++----------- weaviate/tokenization/executor.py | 49 +++++- weaviate/tokenization/models.py | 46 +----- 3 files changed, 202 insertions(+), 138 deletions(-) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index 97587235b..d692a4808 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -2,10 +2,17 @@ These tests cover the client's responsibilities: - Correct serialization of inputs (enums, _TextAnalyzerConfigCreate, _StopwordsCreate) -- Correct deserialization of responses into typed objects -- Client-side validation (_TextAnalyzerConfigCreate rejects invalid input) +- Correct deserialization of responses into the TokenizeResult object +- Client-side validation (_TextAnalyzerConfigCreate, stopwords/stopword_presets mutex) - Version gate (>= 1.37.0) - Both sync and async client paths + +Server-side behavior this client relies on: +- Word tokenization defaults to preset "en" when no stopword config is sent. +- The generic /v1/tokenize response is minimal: only ``indexed`` and ``query`` + are returned. The property-level endpoint additionally returns ``tokenization``. +- ``stopwords`` and ``stopword_presets`` are mutually exclusive on the generic + endpoint — the server rejects requests that set both. """ from typing import AsyncGenerator, Generator @@ -15,9 +22,7 @@ import weaviate from weaviate.collections.classes.config import ( - StopwordsConfig, StopwordsPreset, - TextAnalyzerConfig, Tokenization, _StopwordsCreate, _TextAnalyzerConfigCreate, @@ -62,13 +67,31 @@ class TestSerialization: """Verify the client correctly serializes different input forms.""" @pytest.mark.parametrize( - "tokenization,text,expected_tokens", + "tokenization,text,expected_indexed,expected_query", [ - (Tokenization.WORD, "The quick brown fox", ["the", "quick", "brown", "fox"]), - (Tokenization.LOWERCASE, "Hello World Test", ["hello", "world", "test"]), - (Tokenization.WHITESPACE, "Hello World Test", ["Hello", "World", "Test"]), - (Tokenization.FIELD, " Hello World ", ["Hello World"]), - (Tokenization.TRIGRAM, "Hello", ["hel", "ell", "llo"]), + # "the" is an English stopword — filtered from the query output + # by the server's default "en" preset for word tokenization. + ( + Tokenization.WORD, + "The quick brown fox", + ["the", "quick", "brown", "fox"], + ["quick", "brown", "fox"], + ), + # Non-word tokenizations do not apply the default "en" preset. + ( + Tokenization.LOWERCASE, + "Hello World Test", + ["hello", "world", "test"], + ["hello", "world", "test"], + ), + ( + Tokenization.WHITESPACE, + "Hello World Test", + ["Hello", "World", "Test"], + ["Hello", "World", "Test"], + ), + (Tokenization.FIELD, " Hello World ", ["Hello World"], ["Hello World"]), + (Tokenization.TRIGRAM, "Hello", ["hel", "ell", "llo"], ["hel", "ell", "llo"]), ], ) def test_tokenization_enum( @@ -76,19 +99,35 @@ def test_tokenization_enum( client: weaviate.WeaviateClient, tokenization: Tokenization, text: str, - expected_tokens: list, + expected_indexed: list, + expected_query: list, ) -> None: result = client.tokenization.text(text=text, tokenization=tokenization) assert isinstance(result, TokenizeResult) - assert result.tokenization == tokenization - assert result.indexed == expected_tokens - assert result.query == expected_tokens + assert result.indexed == expected_indexed + assert result.query == expected_query + # Generic endpoint does not echo tokenization back. + assert result.tokenization is None + + def test_default_en_applied_for_word(self, client: weaviate.WeaviateClient) -> None: + """Word tokenization defaults to the 'en' preset when no stopword + config is supplied.""" + result = client.tokenization.text( + text="The quick brown fox", tokenization=Tokenization.WORD + ) + assert result.indexed == ["the", "quick", "brown", "fox"] + # "the" removed by the server's default en preset. + assert result.query == ["quick", "brown", "fox"] - def test_no_analyzer_config(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenization.text(text="hello world", tokenization=Tokenization.WORD) - assert result.tokenization == Tokenization.WORD - assert result.indexed == ["hello", "world"] - assert result.analyzer_config is None + def test_opt_out_of_default_en(self, client: weaviate.WeaviateClient) -> None: + """analyzerConfig.stopwordPreset='none' disables the default en.""" + cfg = _TextAnalyzerConfigCreate(stopword_preset=StopwordsPreset.NONE) + result = client.tokenization.text( + text="The quick brown fox", + tokenization=Tokenization.WORD, + analyzer_config=cfg, + ) + assert result.query == ["the", "quick", "brown", "fox"] def test_ascii_fold(self, client: weaviate.WeaviateClient) -> None: cfg = _TextAnalyzerConfigCreate(ascii_fold=True) @@ -140,33 +179,74 @@ def test_ascii_fold_combined_with_stopwords(self, client: weaviate.WeaviateClien assert "the" not in result.query assert "école" in result.query - def test_stopword_presets_custom_additions(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate(stopword_preset="custom") + def test_stopwords_fallback(self, client: weaviate.WeaviateClient) -> None: + """Top-level stopwords acts as the fallback detector when no + analyzerConfig.stopwordPreset is set.""" + sw = _StopwordsCreate( + preset=StopwordsPreset.EN, additions=["quick"], removals=None + ) + result = client.tokenization.text( + text="the quick brown fox", + tokenization=Tokenization.WORD, + stopwords=sw, + ) + assert result.indexed == ["the", "quick", "brown", "fox"] + # "the" (en) and "quick" (addition) filtered. + assert result.query == ["brown", "fox"] + + def test_stopwords_additions_default_preset_to_en( + self, client: weaviate.WeaviateClient + ) -> None: + """Caller omits preset, passes only additions. Server defaults preset + to 'en' and builds detector from en + additions.""" + sw = _StopwordsCreate(preset=None, additions=["hello"], removals=None) + result = client.tokenization.text( + text="the quick hello world", + tokenization=Tokenization.WORD, + stopwords=sw, + ) + assert result.query == ["quick", "world"] + + def test_stopwords_removals_default_preset_to_en( + self, client: weaviate.WeaviateClient + ) -> None: + """Caller omits preset, passes only removals. 'the' is removed from + the en list so it passes through.""" + sw = _StopwordsCreate(preset=None, additions=None, removals=["the"]) + result = client.tokenization.text( + text="the quick is fast", + tokenization=Tokenization.WORD, + stopwords=sw, + ) + # "is" still in en, "the" removed. + assert result.query == ["the", "quick", "fast"] + + def test_stopword_presets_named_reference(self, client: weaviate.WeaviateClient) -> None: + """Define a named preset via stopword_presets, select it via + analyzerConfig.stopwordPreset. Word lists use the collection shape.""" result = client.tokenization.text( text="hello world test", tokenization=Tokenization.WORD, - analyzer_config=cfg, - stopword_presets={ - "custom": _StopwordsCreate(preset=None, additions=["test"], removals=None), - }, + analyzer_config=_TextAnalyzerConfigCreate(stopword_preset="custom"), + stopword_presets={"custom": ["test"]}, ) assert result.indexed == ["hello", "world", "test"] assert result.query == ["hello", "world"] - def test_stopword_presets_with_base_and_removals(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate(stopword_preset="en-no-the") + def test_stopword_presets_override_builtin_en( + self, client: weaviate.WeaviateClient + ) -> None: + """A user-defined preset sharing a name with a built-in replaces the + built-in entirely, including on the default-en path for word + tokenization.""" result = client.tokenization.text( - text="the quick", + text="the quick hello world", tokenization=Tokenization.WORD, - analyzer_config=cfg, - stopword_presets={ - "en-no-the": _StopwordsCreate( - preset=StopwordsPreset.EN, additions=None, removals=["the"] - ), - }, + stopword_presets={"en": ["hello"]}, ) - assert result.indexed == ["the", "quick"] - assert result.query == ["the", "quick"] + assert result.indexed == ["the", "quick", "hello", "world"] + # "the" no longer filtered (built-in en replaced), "hello" is. + assert result.query == ["the", "quick", "world"] # --------------------------------------------------------------------------- @@ -176,61 +256,23 @@ def test_stopword_presets_with_base_and_removals(self, client: weaviate.Weaviate @pytest.mark.usefixtures("require_1_37") class TestDeserialization: - """Verify the client correctly deserializes response fields into typed objects.""" + """Verify the client correctly deserializes response fields into + TokenizeResult.""" - def test_result_type(self, client: weaviate.WeaviateClient) -> None: + def test_generic_result_shape(self, client: weaviate.WeaviateClient) -> None: + """Generic endpoint returns only indexed and query; tokenization is + not echoed back.""" result = client.tokenization.text(text="hello", tokenization=Tokenization.WORD) assert isinstance(result, TokenizeResult) assert isinstance(result.indexed, list) assert isinstance(result.query, list) + assert result.tokenization is None - def test_analyzer_config_deserialized(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate( - ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN - ) - result = client.tokenization.text( - text="L'école", - tokenization=Tokenization.WORD, - analyzer_config=cfg, - ) - assert isinstance(result.analyzer_config, TextAnalyzerConfig) - assert result.analyzer_config.ascii_fold is True - assert result.analyzer_config.ascii_fold_ignore == ["é"] - assert result.analyzer_config.stopword_preset == "en" - - def test_no_analyzer_config_returns_none(self, client: weaviate.WeaviateClient) -> None: - result = client.tokenization.text(text="hello", tokenization=Tokenization.WORD) - assert result.analyzer_config is None - - def test_stopword_config_deserialized_on_property( + def test_property_result_populates_tokenization( self, client: weaviate.WeaviateClient ) -> None: - client.collections.delete("TestDeserStopword") - try: - client.collections.create_from_dict( - { - "class": "TestDeserStopword", - "vectorizer": "none", - "properties": [ - { - "name": "title", - "dataType": ["text"], - "tokenization": "word", - "textAnalyzer": {"stopwordPreset": "en"}, - }, - ], - } - ) - col = client.collections.get("TestDeserStopword") - result = col.config.tokenize_property(property_name="title", text="the quick") - assert isinstance(result, TokenizeResult) - assert result.tokenization == Tokenization.WORD - if result.stopword_config is not None: - assert isinstance(result.stopword_config, StopwordsConfig) - finally: - client.collections.delete("TestDeserStopword") - - def test_property_result_types(self, client: weaviate.WeaviateClient) -> None: + """Property endpoint returns tokenization — the server resolved it + from the property's schema rather than the caller sending it.""" client.collections.delete("TestDeserPropTypes") try: client.collections.create_from_dict( @@ -256,12 +298,13 @@ def test_property_result_types(self, client: weaviate.WeaviateClient) -> None: # --------------------------------------------------------------------------- -# Client-side validation (_TextAnalyzerConfigCreate) +# Client-side validation # --------------------------------------------------------------------------- class TestClientSideValidation: - """Verify that _TextAnalyzerConfigCreate rejects invalid input before hitting the server.""" + """Verify that client-side validation rejects invalid input before + hitting the server.""" def test_ascii_fold_ignore_without_fold_raises(self) -> None: with pytest.raises(ValueError, match="asciiFoldIgnore"): @@ -291,6 +334,23 @@ def test_empty_config_is_valid(self) -> None: assert cfg.asciiFoldIgnore is None assert cfg.stopwordPreset is None + def test_stopwords_and_stopword_presets_mutex( + self, client: weaviate.WeaviateClient + ) -> None: + """Client rejects the mutex violation locally with ValueError, before + sending the request (which the server would also reject with 422).""" + if client._connection._weaviate_version.is_lower_than(1, 37, 0): + pytest.skip("Tokenization requires Weaviate >= 1.37.0") + with pytest.raises(ValueError, match="mutually exclusive"): + client.tokenization.text( + text="hello", + tokenization=Tokenization.WORD, + stopwords=_StopwordsCreate( + preset=StopwordsPreset.EN, additions=None, removals=None + ), + stopword_presets={"custom": ["hello"]}, + ) + # --------------------------------------------------------------------------- # Version gate @@ -331,20 +391,21 @@ async def test_text_tokenize(self, async_client: weaviate.WeaviateAsyncClient) - ) assert isinstance(result, TokenizeResult) assert result.indexed == ["the", "quick", "brown", "fox"] + # default "en" applied server-side. + assert result.query == ["quick", "brown", "fox"] @pytest.mark.asyncio - async def test_text_with_analyzer_config( + async def test_text_with_stopwords_fallback( self, async_client: weaviate.WeaviateAsyncClient ) -> None: - cfg = _TextAnalyzerConfigCreate(ascii_fold=True, stopword_preset=StopwordsPreset.EN) + sw = _StopwordsCreate(preset=StopwordsPreset.EN, additions=["quick"], removals=None) result = await async_client.tokenization.text( - text="L'école est fermée", + text="the quick brown fox", tokenization=Tokenization.WORD, - analyzer_config=cfg, + stopwords=sw, ) - assert result.indexed == ["l", "ecole", "est", "fermee"] - assert isinstance(result.analyzer_config, TextAnalyzerConfig) - assert result.analyzer_config.ascii_fold is True + assert result.indexed == ["the", "quick", "brown", "fox"] + assert result.query == ["brown", "fox"] @pytest.mark.asyncio async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClient) -> None: diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 5093c14e9..3a79d6ee1 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -1,6 +1,6 @@ """Tokenize executor.""" -from typing import Any, Dict, Generic, Optional +from typing import Any, Dict, Generic, List, Optional from httpx import Response @@ -33,26 +33,52 @@ def text( tokenization: Tokenization, *, analyzer_config: Optional[_TextAnalyzerConfigCreate] = None, - stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None, + stopwords: Optional[_StopwordsCreate] = None, + stopword_presets: Optional[Dict[str, List[str]]] = None, ) -> executor.Result[TokenizeResult]: """Tokenize text using the generic /v1/tokenize endpoint. + For ``word`` tokenization the server defaults to the built-in ``en`` + stopword preset when no stopword configuration is supplied. Pass + ``analyzer_config=TextAnalyzerConfig(stopword_preset="none")`` or + equivalent to opt out. + Args: text: The text to tokenize. tokenization: The tokenization method to use (e.g. Tokenization.WORD). - analyzer_config: Text analyzer settings (ASCII folding, stopword preset). - stopword_presets: Custom stopword preset definitions, keyed by name. - Each value is a ``_StopwordsCreate`` with optional preset, additions, - and removals fields. + analyzer_config: Text analyzer settings (ASCII folding, stopword + preset name). ``stopword_preset`` may reference a built-in preset + (``en`` / ``none``) or a name defined in ``stopword_presets``. + stopwords: Fallback stopword config applied when + ``analyzer_config.stopword_preset`` is not set. Same shape as a + collection's ``invertedIndexConfig.stopwords`` — a base preset + optionally tweaked with ``additions`` / ``removals``. An empty + ``preset`` defaults to ``en``. + stopword_presets: User-defined named stopword presets, each a plain + list of words. A name matching a built-in (``en`` / ``none``) + replaces the built-in entirely. + + Note: + ``stopwords`` and ``stopword_presets`` are mutually exclusive on the + server — pass one or the other, not both. The server returns HTTP + 422 if both are supplied. Returns: - A TokenizeResult with indexed and query token lists. + A TokenizeResult with indexed and query token lists. The generic + endpoint does not echo request fields (tokenization, analyzer_config, + stopwords, stopword_presets) back in the response. Raises: WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. + ValueError: If both ``stopwords`` and ``stopword_presets`` are passed. """ self._check_version() + if stopwords is not None and stopword_presets is not None: + raise ValueError( + "stopwords and stopword_presets are mutually exclusive; pass only one" + ) + payload: Dict[str, Any] = { "text": text, "tokenization": tokenization.value, @@ -63,9 +89,16 @@ def text( if ac_dict: payload["analyzerConfig"] = ac_dict + if stopwords is not None: + sw_dict = stopwords._to_dict() + if sw_dict: + payload["stopwords"] = sw_dict + if stopword_presets is not None: + # Plain word-list shape matching a collection's + # invertedIndexConfig.stopwordPresets. payload["stopwordPresets"] = { - name: cfg._to_dict() for name, cfg in stopword_presets.items() + name: list(words) for name, words in stopword_presets.items() } def resp(response: Response) -> TokenizeResult: diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py index 8bfa508f8..3bf980597 100644 --- a/weaviate/tokenization/models.py +++ b/weaviate/tokenization/models.py @@ -1,56 +1,26 @@ """Return types for tokenization operations.""" -from typing import Any, Dict, List, Optional +from typing import List, Optional -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict -from weaviate.collections.classes.config import ( - StopwordsConfig, - StopwordsPreset, - TextAnalyzerConfig, - Tokenization, -) +from weaviate.collections.classes.config import Tokenization class TokenizeResult(BaseModel): """Result of a tokenization operation. Attributes: - tokenization: The tokenization method that was applied. indexed: Tokens as they would be stored in the inverted index. query: Tokens as they would be used for querying (after stopword removal). - analyzer_config: The text analyzer configuration that was used, if any. - stopword_config: The stopword configuration that was used, if any. + tokenization: The tokenization method that was applied. Populated only by + the property-level endpoint, where the tokenization is resolved from + the property's schema. The generic ``/v1/tokenize`` endpoint does not + echo it back (the caller passed it). """ model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True) - tokenization: Tokenization indexed: List[str] query: List[str] - analyzer_config: Optional[TextAnalyzerConfig] = Field(default=None, alias="analyzerConfig") - stopword_config: Optional[StopwordsConfig] = Field(default=None, alias="stopwordConfig") - - @field_validator("analyzer_config", mode="before") - @classmethod - def _parse_analyzer_config(cls, v: Optional[Dict[str, Any]]) -> Optional[TextAnalyzerConfig]: - if v is None: - return None - if "asciiFold" not in v and "stopwordPreset" not in v: - return None - return TextAnalyzerConfig( - ascii_fold=v.get("asciiFold", False), - ascii_fold_ignore=v.get("asciiFoldIgnore"), - stopword_preset=v.get("stopwordPreset"), - ) - - @field_validator("stopword_config", mode="before") - @classmethod - def _parse_stopword_config(cls, v: Optional[Dict[str, Any]]) -> Optional[StopwordsConfig]: - if v is None: - return None - return StopwordsConfig( - preset=StopwordsPreset(v["preset"]), - additions=v.get("additions"), - removals=v.get("removals"), - ) + tokenization: Optional[Tokenization] = None From 5a12f134c36a5f6ed6329abaae101abf42efb7cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Mon, 20 Apr 2026 15:55:45 +0100 Subject: [PATCH 02/14] fix: update Weaviate 1.37.1 version to include specific build identifier --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 8dd157443..ee9b69537 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -28,7 +28,7 @@ env: WEAVIATE_134: 1.34.19 WEAVIATE_135: 1.35.16-efdedfa WEAVIATE_136: 1.36.9-d905e6c - WEAVIATE_137: 1.37.1 + WEAVIATE_137: 1.37.1-5f911bc jobs: lint-and-format: From 60887f3ab37171d2ea12aa34de57a927fcca8267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Mon, 20 Apr 2026 18:08:38 +0100 Subject: [PATCH 03/14] fix: update Weaviate 1.37.1 version to include architecture suffix --- .github/workflows/main.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index b2c567cee..94f75b089 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -28,7 +28,7 @@ env: WEAVIATE_134: 1.34.19 WEAVIATE_135: 1.35.18 WEAVIATE_136: 1.36.12 - WEAVIATE_137: 1.37.1-5f911bc + WEAVIATE_137: 1.37.1-5f911bc.amd64 jobs: lint-and-format: From 9fd83b881c953f2ea09c1fb5c89dc38a96173c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Tue, 21 Apr 2026 09:23:37 +0100 Subject: [PATCH 04/14] fix: refactor tokenization tests to use parameterized cases for improved readability and maintainability --- integration/test_tokenize.py | 327 +++++++++++++++++++---------------- 1 file changed, 176 insertions(+), 151 deletions(-) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index 788cefc31..51f154479 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -109,131 +109,146 @@ def test_tokenization_enum( # Generic endpoint does not echo tokenization back. assert result.tokenization is None - def test_default_en_applied_for_word(self, client: weaviate.WeaviateClient) -> None: - """Word tokenization defaults to the 'en' preset when no stopword config is supplied.""" - result = client.tokenization.text( - text="The quick brown fox", tokenization=Tokenization.WORD - ) - assert result.indexed == ["the", "quick", "brown", "fox"] - # "the" removed by the server's default en preset. - assert result.query == ["quick", "brown", "fox"] - - def test_opt_out_of_default_en(self, client: weaviate.WeaviateClient) -> None: - """analyzerConfig.stopwordPreset='none' disables the default en.""" - cfg = _TextAnalyzerConfigCreate(stopword_preset=StopwordsPreset.NONE) - result = client.tokenization.text( - text="The quick brown fox", - tokenization=Tokenization.WORD, - analyzer_config=cfg, - ) - assert result.query == ["the", "quick", "brown", "fox"] - - def test_ascii_fold(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate(ascii_fold=True) - result = client.tokenization.text( - text="L'école est fermée", - tokenization=Tokenization.WORD, - analyzer_config=cfg, - ) - assert result.indexed == ["l", "ecole", "est", "fermee"] - - def test_ascii_fold_with_ignore(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate(ascii_fold=True, ascii_fold_ignore=["é"]) - result = client.tokenization.text( - text="L'école est fermée", - tokenization=Tokenization.WORD, - analyzer_config=cfg, - ) - assert result.indexed == ["l", "école", "est", "fermée"] - - def test_stopword_preset_enum(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate(stopword_preset=StopwordsPreset.EN) - result = client.tokenization.text( - text="The quick brown fox", - tokenization=Tokenization.WORD, - analyzer_config=cfg, - ) - assert "the" not in result.query - assert "quick" in result.query - - def test_stopword_preset_string(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate(stopword_preset="en") - result = client.tokenization.text( - text="The quick brown fox", - tokenization=Tokenization.WORD, - analyzer_config=cfg, - ) - assert "the" not in result.query - - def test_ascii_fold_combined_with_stopwords(self, client: weaviate.WeaviateClient) -> None: - cfg = _TextAnalyzerConfigCreate( - ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN - ) - result = client.tokenization.text( - text="The école est fermée", - tokenization=Tokenization.WORD, - analyzer_config=cfg, - ) - assert result.indexed == ["the", "école", "est", "fermée"] - assert "the" not in result.query - assert "école" in result.query - - def test_stopwords_fallback(self, client: weaviate.WeaviateClient) -> None: - """Top-level stopwords acts as the fallback detector when no analyzerConfig.stopwordPreset is set.""" - sw = _StopwordsCreate(preset=StopwordsPreset.EN, additions=["quick"], removals=None) - result = client.tokenization.text( - text="the quick brown fox", - tokenization=Tokenization.WORD, - stopwords=sw, - ) - assert result.indexed == ["the", "quick", "brown", "fox"] - # "the" (en) and "quick" (addition) filtered. - assert result.query == ["brown", "fox"] - - def test_stopwords_additions_default_preset_to_en( - self, client: weaviate.WeaviateClient + @pytest.mark.parametrize( + "call_kwargs,expected_indexed,expected_query", + [ + ( + {"text": "The quick brown fox"}, + ["the", "quick", "brown", "fox"], + ["quick", "brown", "fox"], + ), + ( + { + "text": "The quick brown fox", + "analyzer_config": _TextAnalyzerConfigCreate( + stopword_preset=StopwordsPreset.NONE + ), + }, + ["the", "quick", "brown", "fox"], + ["the", "quick", "brown", "fox"], + ), + ( + { + "text": "L'école est fermée", + "analyzer_config": _TextAnalyzerConfigCreate(ascii_fold=True), + }, + ["l", "ecole", "est", "fermee"], + ["l", "ecole", "fermee"], + ), + ( + { + "text": "L'école est fermée", + "analyzer_config": _TextAnalyzerConfigCreate( + ascii_fold=True, ascii_fold_ignore=["é"] + ), + }, + ["l", "école", "est", "fermée"], + ["l", "école", "fermée"], + ), + ( + { + "text": "The quick brown fox", + "analyzer_config": _TextAnalyzerConfigCreate( + stopword_preset=StopwordsPreset.EN + ), + }, + ["the", "quick", "brown", "fox"], + ["quick", "brown", "fox"], + ), + ( + { + "text": "The quick brown fox", + "analyzer_config": _TextAnalyzerConfigCreate(stopword_preset="en"), + }, + ["the", "quick", "brown", "fox"], + ["quick", "brown", "fox"], + ), + ( + { + "text": "The école est fermée", + "analyzer_config": _TextAnalyzerConfigCreate( + ascii_fold=True, + ascii_fold_ignore=["é"], + stopword_preset=StopwordsPreset.EN, + ), + }, + ["the", "école", "est", "fermée"], + ["école", "est", "fermée"], + ), + ( + { + "text": "the quick brown fox", + "stopwords": _StopwordsCreate( + preset=StopwordsPreset.EN, additions=["quick"], removals=None + ), + }, + ["the", "quick", "brown", "fox"], + ["brown", "fox"], + ), + ( + { + "text": "the quick hello world", + "stopwords": _StopwordsCreate( + preset=None, additions=["hello"], removals=None + ), + }, + ["the", "quick", "hello", "world"], + ["quick", "world"], + ), + ( + { + "text": "the quick is fast", + "stopwords": _StopwordsCreate( + preset=None, additions=None, removals=["the"] + ), + }, + ["the", "quick", "is", "fast"], + ["the", "quick", "fast"], + ), + ( + { + "text": "hello world test", + "analyzer_config": _TextAnalyzerConfigCreate(stopword_preset="custom"), + "stopword_presets": {"custom": ["test"]}, + }, + ["hello", "world", "test"], + ["hello", "world"], + ), + ( + { + "text": "the quick hello world", + "stopword_presets": {"en": ["hello"]}, + }, + ["the", "quick", "hello", "world"], + ["the", "quick", "world"], + ), + ], + ids=[ + "default_en_applied_for_word", + "opt_out_of_default_en", + "ascii_fold", + "ascii_fold_with_ignore", + "stopword_preset_enum", + "stopword_preset_string", + "ascii_fold_combined_with_stopwords", + "stopwords_fallback", + "stopwords_additions_default_preset_to_en", + "stopwords_removals_default_preset_to_en", + "stopword_presets_named_reference", + "stopword_presets_override_builtin_en", + ], + ) + def test_text_tokenize( + self, + client: weaviate.WeaviateClient, + call_kwargs: dict, + expected_indexed: list, + expected_query: list, ) -> None: - """Caller omits preset, passes only additions. Server defaults preset to 'en' and builds detector from en + additions.""" - sw = _StopwordsCreate(preset=None, additions=["hello"], removals=None) - result = client.tokenization.text( - text="the quick hello world", - tokenization=Tokenization.WORD, - stopwords=sw, - ) - assert result.query == ["quick", "world"] - - def test_stopwords_removals_default_preset_to_en(self, client: weaviate.WeaviateClient) -> None: - """Caller omits preset, passes only removals. 'the' is removed from the en list so it passes through.""" - sw = _StopwordsCreate(preset=None, additions=None, removals=["the"]) - result = client.tokenization.text( - text="the quick is fast", - tokenization=Tokenization.WORD, - stopwords=sw, - ) - # "is" still in en, "the" removed. - assert result.query == ["the", "quick", "fast"] - - def test_stopword_presets_named_reference(self, client: weaviate.WeaviateClient) -> None: - """Define a named preset via stopword_presets, select it via analyzerConfig.stopwordPreset. Word lists use the collection shape.""" - result = client.tokenization.text( - text="hello world test", - tokenization=Tokenization.WORD, - analyzer_config=_TextAnalyzerConfigCreate(stopword_preset="custom"), - stopword_presets={"custom": ["test"]}, - ) - assert result.indexed == ["hello", "world", "test"] - assert result.query == ["hello", "world"] - - def test_stopword_presets_override_builtin_en(self, client: weaviate.WeaviateClient) -> None: - """A user-defined preset sharing a name with a built-in replaces the built-in entirely, including on the default-en path for word tokenization.""" - result = client.tokenization.text( - text="the quick hello world", - tokenization=Tokenization.WORD, - stopword_presets={"en": ["hello"]}, - ) - assert result.indexed == ["the", "quick", "hello", "world"] - # "the" no longer filtered (built-in en replaced), "hello" is. - assert result.query == ["the", "quick", "world"] + result = client.tokenization.text(tokenization=Tokenization.WORD, **call_kwargs) + assert isinstance(result, TokenizeResult) + assert result.indexed == expected_indexed + assert result.query == expected_query # --------------------------------------------------------------------------- @@ -287,33 +302,44 @@ def test_property_result_populates_tokenization(self, client: weaviate.WeaviateC class TestClientSideValidation: """Verify that client-side validation rejects invalid input before hitting the server.""" - def test_ascii_fold_ignore_without_fold_raises(self) -> None: - with pytest.raises(ValueError, match="asciiFoldIgnore"): - _TextAnalyzerConfigCreate(ascii_fold=False, ascii_fold_ignore=["é"]) - - def test_ascii_fold_ignore_without_fold_default_raises(self) -> None: + @pytest.mark.parametrize( + "kwargs", + [ + {"ascii_fold": False, "ascii_fold_ignore": ["é"]}, + {"ascii_fold_ignore": ["é"]}, + ], + ids=["explicit_false", "default"], + ) + def test_ascii_fold_ignore_without_fold_raises(self, kwargs: dict) -> None: with pytest.raises(ValueError, match="asciiFoldIgnore"): - _TextAnalyzerConfigCreate(ascii_fold_ignore=["é"]) + _TextAnalyzerConfigCreate(**kwargs) - def test_valid_config_does_not_raise(self) -> None: - cfg = _TextAnalyzerConfigCreate(ascii_fold=True, ascii_fold_ignore=["é", "ñ"]) - assert cfg.asciiFold is True - assert cfg.asciiFoldIgnore == ["é", "ñ"] - - def test_fold_without_ignore_is_valid(self) -> None: - cfg = _TextAnalyzerConfigCreate(ascii_fold=True) - assert cfg.asciiFold is True - assert cfg.asciiFoldIgnore is None - - def test_stopword_preset_only_is_valid(self) -> None: - cfg = _TextAnalyzerConfigCreate(stopword_preset="en") - assert cfg.stopwordPreset == "en" - - def test_empty_config_is_valid(self) -> None: - cfg = _TextAnalyzerConfigCreate() - assert cfg.asciiFold is None - assert cfg.asciiFoldIgnore is None - assert cfg.stopwordPreset is None + @pytest.mark.parametrize( + "kwargs,expected", + [ + ( + {"ascii_fold": True, "ascii_fold_ignore": ["é", "ñ"]}, + {"asciiFold": True, "asciiFoldIgnore": ["é", "ñ"]}, + ), + ( + {"ascii_fold": True}, + {"asciiFold": True, "asciiFoldIgnore": None}, + ), + ( + {"stopword_preset": "en"}, + {"stopwordPreset": "en"}, + ), + ( + {}, + {"asciiFold": None, "asciiFoldIgnore": None, "stopwordPreset": None}, + ), + ], + ids=["fold_with_ignore", "fold_without_ignore", "stopword_preset_only", "empty"], + ) + def test_valid_config(self, kwargs: dict, expected: dict) -> None: + cfg = _TextAnalyzerConfigCreate(**kwargs) + for attr, value in expected.items(): + assert getattr(cfg, attr) == value def test_stopwords_and_stopword_presets_mutex(self, client: weaviate.WeaviateClient) -> None: """Client rejects the mutex violation locally with ValueError, before sending the request (which the server would also reject with 422).""" @@ -411,7 +437,6 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien assert isinstance(result, TokenizeResult) assert result.tokenization == Tokenization.WORD assert result.indexed == ["the", "quick", "brown", "fox"] - assert "the" not in result.query - assert "quick" in result.query + assert result.query == ["quick", "brown", "fox"] finally: await async_client.collections.delete("TestAsyncPropTokenize") From e9d681226e1917b80fed0312b086a5818cac2e9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Tue, 21 Apr 2026 09:52:24 +0100 Subject: [PATCH 05/14] fix: update Weaviate 1.37.1 version and enhance tokenization tests with new fixtures --- .github/workflows/main.yaml | 2 +- integration/test_tokenize.py | 77 ++++++++++++++++++++++++++++++------ 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 94f75b089..a1ff94f98 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -28,7 +28,7 @@ env: WEAVIATE_134: 1.34.19 WEAVIATE_135: 1.35.18 WEAVIATE_136: 1.36.12 - WEAVIATE_137: 1.37.1-5f911bc.amd64 + WEAVIATE_137: 1.37.1-4e61e26.amd64 jobs: lint-and-format: diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index 51f154479..61d54e095 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -9,8 +9,7 @@ Server-side behavior this client relies on: - Word tokenization defaults to preset "en" when no stopword config is sent. -- The generic /v1/tokenize response is minimal: only ``indexed`` and ``query`` - are returned. The property-level endpoint additionally returns ``tokenization``. +- Both endpoints return only ``indexed`` and ``query``. - ``stopwords`` and ``stopword_presets`` are mutually exclusive on the generic endpoint — the server rejects requests that set both. """ @@ -57,6 +56,29 @@ async def async_client() -> AsyncGenerator[weaviate.WeaviateAsyncClient, None]: await c.close() +@pytest.fixture +def recipe_collection(client: weaviate.WeaviateClient) -> Generator: + """Collection with a `recipe` word-tokenized property and an en + ["quick"] stopwords config.""" + name = "TestTokenizeRecipe" + client.collections.delete(name) + client.collections.create_from_dict( + { + "class": name, + "vectorizer": "none", + "invertedIndexConfig": { + "stopwords": {"preset": "en", "additions": ["quick"]}, + }, + "properties": [ + {"name": "recipe", "dataType": ["text"], "tokenization": "word"}, + ], + } + ) + try: + yield client.collections.get(name) + finally: + client.collections.delete(name) + + # --------------------------------------------------------------------------- # Serialization # --------------------------------------------------------------------------- @@ -106,8 +128,6 @@ def test_tokenization_enum( assert isinstance(result, TokenizeResult) assert result.indexed == expected_indexed assert result.query == expected_query - # Generic endpoint does not echo tokenization back. - assert result.tokenization is None @pytest.mark.parametrize( "call_kwargs,expected_indexed,expected_query", @@ -133,7 +153,7 @@ def test_tokenization_enum( "analyzer_config": _TextAnalyzerConfigCreate(ascii_fold=True), }, ["l", "ecole", "est", "fermee"], - ["l", "ecole", "fermee"], + ["l", "ecole", "est", "fermee"], ), ( { @@ -143,7 +163,7 @@ def test_tokenization_enum( ), }, ["l", "école", "est", "fermée"], - ["l", "école", "fermée"], + ["l", "école", "est", "fermée"], ), ( { @@ -250,6 +270,42 @@ def test_text_tokenize( assert result.indexed == expected_indexed assert result.query == expected_query + def test_text_from_collection_config( + self, client: weaviate.WeaviateClient, recipe_collection + ) -> None: + """Values round-tripped through config.get() feed back into tokenization.text().""" + config = recipe_collection.config.get() + recipe = next(p for p in config.properties if p.name == "recipe") + stopwords = config.inverted_index_config.stopwords + result = client.tokenization.text( + text="the quick brown fox", + tokenization=recipe.tokenization, + stopwords=_StopwordsCreate(**stopwords.__dict__), + ) + assert result.indexed == ["the", "quick", "brown", "fox"] + assert result.query == ["brown", "fox"] + + def test_property_and_generic_endpoints_agree( + self, client: weaviate.WeaviateClient, recipe_collection + ) -> None: + """Property endpoint (server resolves config from schema) produces the same indexed/query as the generic endpoint fed the same config.""" + config = recipe_collection.config.get() + recipe = next(p for p in config.properties if p.name == "recipe") + stopwords = config.inverted_index_config.stopwords + + text = "the quick brown fox" + via_property = recipe_collection.config.tokenize_property( + property_name="recipe", text=text + ) + via_generic = client.tokenization.text( + text=text, + tokenization=recipe.tokenization, + stopwords=_StopwordsCreate(**stopwords.__dict__), + ) + + assert via_property.indexed == via_generic.indexed + assert via_property.query == via_generic.query + # --------------------------------------------------------------------------- # Deserialization @@ -261,15 +317,14 @@ class TestDeserialization: """Verify the client correctly deserializes response fields into TokenizeResult.""" def test_generic_result_shape(self, client: weaviate.WeaviateClient) -> None: - """Generic endpoint returns only indexed and query; tokenization is not echoed back.""" + """Generic endpoint response deserializes into TokenizeResult with indexed and query lists.""" result = client.tokenization.text(text="hello", tokenization=Tokenization.WORD) assert isinstance(result, TokenizeResult) assert isinstance(result.indexed, list) assert isinstance(result.query, list) - assert result.tokenization is None - def test_property_result_populates_tokenization(self, client: weaviate.WeaviateClient) -> None: - """Property endpoint returns tokenization — the server resolved it from the property's schema rather than the caller sending it.""" + def test_property_result_shape(self, client: weaviate.WeaviateClient) -> None: + """Property endpoint response deserializes into TokenizeResult — server resolves tokenization from the property's schema.""" client.collections.delete("TestDeserPropTypes") try: client.collections.create_from_dict( @@ -288,7 +343,6 @@ def test_property_result_populates_tokenization(self, client: weaviate.WeaviateC col = client.collections.get("TestDeserPropTypes") result = col.config.tokenize_property(property_name="tag", text=" Hello World ") assert isinstance(result, TokenizeResult) - assert result.tokenization == Tokenization.FIELD assert result.indexed == ["Hello World"] finally: client.collections.delete("TestDeserPropTypes") @@ -435,7 +489,6 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien text="The quick brown fox", ) assert isinstance(result, TokenizeResult) - assert result.tokenization == Tokenization.WORD assert result.indexed == ["the", "quick", "brown", "fox"] assert result.query == ["quick", "brown", "fox"] finally: From 959f554c7df129a226ddbc5f412e95a3879891d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Tue, 21 Apr 2026 09:55:35 +0100 Subject: [PATCH 06/14] refactor: ruff format --- integration/test_tokenize.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index 61d54e095..d2a8442d8 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -208,9 +208,7 @@ def test_tokenization_enum( ( { "text": "the quick hello world", - "stopwords": _StopwordsCreate( - preset=None, additions=["hello"], removals=None - ), + "stopwords": _StopwordsCreate(preset=None, additions=["hello"], removals=None), }, ["the", "quick", "hello", "world"], ["quick", "world"], @@ -218,9 +216,7 @@ def test_tokenization_enum( ( { "text": "the quick is fast", - "stopwords": _StopwordsCreate( - preset=None, additions=None, removals=["the"] - ), + "stopwords": _StopwordsCreate(preset=None, additions=None, removals=["the"]), }, ["the", "quick", "is", "fast"], ["the", "quick", "fast"], @@ -294,9 +290,7 @@ def test_property_and_generic_endpoints_agree( stopwords = config.inverted_index_config.stopwords text = "the quick brown fox" - via_property = recipe_collection.config.tokenize_property( - property_name="recipe", text=text - ) + via_property = recipe_collection.config.tokenize_property(property_name="recipe", text=text) via_generic = client.tokenization.text( text=text, tokenization=recipe.tokenization, From 0f7fe47cac92107f502fb5a22c925896084535ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Tue, 21 Apr 2026 14:24:33 +0100 Subject: [PATCH 07/14] test: refactor output types and tests to config --- integration/test_tokenize.py | 32 ++++++++++++++++++++++++ weaviate/tokenization/executor.py | 41 ++++++++++++++++++++++--------- weaviate/tokenization/models.py | 5 ---- 3 files changed, 62 insertions(+), 16 deletions(-) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index d2a8442d8..c939e8c5b 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -403,6 +403,38 @@ def test_stopwords_and_stopword_presets_mutex(self, client: weaviate.WeaviateCli stopword_presets={"custom": ["hello"]}, ) + @pytest.mark.parametrize( + "stopword_presets,match", + [ + ({"custom": "hello"}, "must be a list of strings"), + ( + { + "custom": _StopwordsCreate( + preset=StopwordsPreset.EN, additions=None, removals=None + ), + }, + "must be a list of strings", + ), + ({"custom": ["hello", 123]}, "must contain only strings"), + ], + ids=["str_value", "pydantic_model_value", "non_string_element"], + ) + def test_stopword_presets_invalid_shape_raises( + self, + client: weaviate.WeaviateClient, + stopword_presets: dict, + match: str, + ) -> None: + """Client rejects malformed stopword_presets values locally before sending — str would silently split into characters; a pydantic model would serialize to field tuples.""" + if client._connection._weaviate_version.is_lower_than(1, 37, 0): + pytest.skip("Tokenization requires Weaviate >= 1.37.0") + with pytest.raises(ValueError, match=match): + client.tokenization.text( + text="hello", + tokenization=Tokenization.WORD, + stopword_presets=stopword_presets, + ) + # --------------------------------------------------------------------------- # Version gate diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 8cedb6e51..825faee05 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -40,14 +40,15 @@ def text( For ``word`` tokenization the server defaults to the built-in ``en`` stopword preset when no stopword configuration is supplied. Pass - ``analyzer_config=TextAnalyzerConfig(stopword_preset="none")`` or - equivalent to opt out. + ``analyzer_config=Configure.text_analyzer(stopword_preset=StopwordsPreset.NONE)`` + (or equivalent) to opt out. Args: text: The text to tokenize. tokenization: The tokenization method to use (e.g. Tokenization.WORD). analyzer_config: Text analyzer settings (ASCII folding, stopword - preset name). ``stopword_preset`` may reference a built-in preset + preset name), built via ``Configure.text_analyzer(...)``. + ``stopword_preset`` may reference a built-in preset (``en`` / ``none``) or a name defined in ``stopword_presets``. stopwords: Fallback stopword config applied when ``analyzer_config.stopword_preset`` is not set. Same shape as a @@ -64,13 +65,13 @@ def text( 422 if both are supplied. Returns: - A TokenizeResult with indexed and query token lists. The generic - endpoint does not echo request fields (tokenization, analyzer_config, - stopwords, stopword_presets) back in the response. + A TokenizeResult with indexed and query token lists. The response + does not echo request fields back. Raises: WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. - ValueError: If both ``stopwords`` and ``stopword_presets`` are passed. + ValueError: If both ``stopwords`` and ``stopword_presets`` are passed, + or if any ``stopword_presets`` value is not a list/tuple of strings. """ self.__check_version() @@ -94,10 +95,28 @@ def text( if stopword_presets is not None: # Plain word-list shape matching a collection's - # invertedIndexConfig.stopwordPresets. - payload["stopwordPresets"] = { - name: list(words) for name, words in stopword_presets.items() - } + # invertedIndexConfig.stopwordPresets. Reject str (would + # silently split into characters) and pydantic models / + # other non-sequence shapes up-front so callers get a clear + # error instead of a malformed payload. + validated: Dict[str, List[str]] = {} + for name, words in stopword_presets.items(): + if isinstance(words, (str, bytes)): + raise ValueError( + f"stopword_presets[{name!r}] must be a list of strings, " + f"got {type(words).__name__}" + ) + if not isinstance(words, (list, tuple)): + raise ValueError( + f"stopword_presets[{name!r}] must be a list of strings, " + f"got {type(words).__name__}" + ) + if not all(isinstance(w, str) for w in words): + raise ValueError( + f"stopword_presets[{name!r}] must contain only strings" + ) + validated[name] = list(words) + payload["stopwordPresets"] = validated def resp(response: Response) -> TokenizeResult: return TokenizeResult.model_validate(response.json()) diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py index 3bf980597..017abe429 100644 --- a/weaviate/tokenization/models.py +++ b/weaviate/tokenization/models.py @@ -13,14 +13,9 @@ class TokenizeResult(BaseModel): Attributes: indexed: Tokens as they would be stored in the inverted index. query: Tokens as they would be used for querying (after stopword removal). - tokenization: The tokenization method that was applied. Populated only by - the property-level endpoint, where the tokenization is resolved from - the property's schema. The generic ``/v1/tokenize`` endpoint does not - echo it back (the caller passed it). """ model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True) indexed: List[str] query: List[str] - tokenization: Optional[Tokenization] = None From 52c2c8c8133eb1828be86f10ea824d74b822ca8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Mour=C3=A3o?= Date: Tue, 21 Apr 2026 14:31:17 +0100 Subject: [PATCH 08/14] refactor: remove unused imports in tokenization models and format --- weaviate/tokenization/executor.py | 4 +--- weaviate/tokenization/models.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 825faee05..25b36e1d3 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -112,9 +112,7 @@ def text( f"got {type(words).__name__}" ) if not all(isinstance(w, str) for w in words): - raise ValueError( - f"stopword_presets[{name!r}] must contain only strings" - ) + raise ValueError(f"stopword_presets[{name!r}] must contain only strings") validated[name] = list(words) payload["stopwordPresets"] = validated diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py index 017abe429..f8fe7cb67 100644 --- a/weaviate/tokenization/models.py +++ b/weaviate/tokenization/models.py @@ -1,11 +1,9 @@ """Return types for tokenization operations.""" -from typing import List, Optional +from typing import List from pydantic import BaseModel, ConfigDict -from weaviate.collections.classes.config import Tokenization - class TokenizeResult(BaseModel): """Result of a tokenization operation. From 3de0955c0520358b5d12f81b094b98ef3d208559 Mon Sep 17 00:00:00 2001 From: Dirk Kulawiak Date: Tue, 21 Apr 2026 15:07:26 +0200 Subject: [PATCH 09/14] Use public classes for .text endpoint --- integration/test_tokenize.py | 48 ++++++++++++-------------- weaviate/classes/config.py | 4 +++ weaviate/classes/tokenization.py | 15 ++++++++ weaviate/collections/classes/config.py | 4 +++ weaviate/tokenization/async_.pyi | 8 ++--- weaviate/tokenization/executor.py | 8 ++--- weaviate/tokenization/sync.pyi | 8 ++--- 7 files changed, 57 insertions(+), 38 deletions(-) create mode 100644 weaviate/classes/tokenization.py diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index c939e8c5b..a5b16da32 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -1,9 +1,9 @@ """Integration tests for the tokenization module. These tests cover the client's responsibilities: -- Correct serialization of inputs (enums, _TextAnalyzerConfigCreate, _StopwordsCreate) +- Correct serialization of inputs (enums, TextAnalyzerConfigCreate, StopwordsCreate) - Correct deserialization of responses into the TokenizeResult object -- Client-side validation (_TextAnalyzerConfigCreate, stopwords/stopword_presets mutex) +- Client-side validation (TextAnalyzerConfigCreate, stopwords/stopword_presets mutex) - Version gate (>= 1.37.0) - Both sync and async client paths @@ -20,15 +20,15 @@ import pytest_asyncio import weaviate -from weaviate.collections.classes.config import ( +from weaviate.classes.tokenization import ( + StopwordsCreate, StopwordsPreset, + TextAnalyzerConfigCreate, Tokenization, - _StopwordsCreate, - _TextAnalyzerConfigCreate, + TokenizeResult, ) from weaviate.config import AdditionalConfig from weaviate.exceptions import WeaviateUnsupportedFeatureError -from weaviate.tokenization.models import TokenizeResult @pytest.fixture(scope="module") @@ -140,7 +140,7 @@ def test_tokenization_enum( ( { "text": "The quick brown fox", - "analyzer_config": _TextAnalyzerConfigCreate( + "analyzer_config": TextAnalyzerConfigCreate( stopword_preset=StopwordsPreset.NONE ), }, @@ -150,7 +150,7 @@ def test_tokenization_enum( ( { "text": "L'école est fermée", - "analyzer_config": _TextAnalyzerConfigCreate(ascii_fold=True), + "analyzer_config": TextAnalyzerConfigCreate(ascii_fold=True), }, ["l", "ecole", "est", "fermee"], ["l", "ecole", "est", "fermee"], @@ -158,7 +158,7 @@ def test_tokenization_enum( ( { "text": "L'école est fermée", - "analyzer_config": _TextAnalyzerConfigCreate( + "analyzer_config": TextAnalyzerConfigCreate( ascii_fold=True, ascii_fold_ignore=["é"] ), }, @@ -168,9 +168,7 @@ def test_tokenization_enum( ( { "text": "The quick brown fox", - "analyzer_config": _TextAnalyzerConfigCreate( - stopword_preset=StopwordsPreset.EN - ), + "analyzer_config": TextAnalyzerConfigCreate(stopword_preset=StopwordsPreset.EN), }, ["the", "quick", "brown", "fox"], ["quick", "brown", "fox"], @@ -178,7 +176,7 @@ def test_tokenization_enum( ( { "text": "The quick brown fox", - "analyzer_config": _TextAnalyzerConfigCreate(stopword_preset="en"), + "analyzer_config": TextAnalyzerConfigCreate(stopword_preset="en"), }, ["the", "quick", "brown", "fox"], ["quick", "brown", "fox"], @@ -186,7 +184,7 @@ def test_tokenization_enum( ( { "text": "The école est fermée", - "analyzer_config": _TextAnalyzerConfigCreate( + "analyzer_config": TextAnalyzerConfigCreate( ascii_fold=True, ascii_fold_ignore=["é"], stopword_preset=StopwordsPreset.EN, @@ -198,7 +196,7 @@ def test_tokenization_enum( ( { "text": "the quick brown fox", - "stopwords": _StopwordsCreate( + "stopwords": StopwordsCreate( preset=StopwordsPreset.EN, additions=["quick"], removals=None ), }, @@ -208,7 +206,7 @@ def test_tokenization_enum( ( { "text": "the quick hello world", - "stopwords": _StopwordsCreate(preset=None, additions=["hello"], removals=None), + "stopwords": StopwordsCreate(preset=None, additions=["hello"], removals=None), }, ["the", "quick", "hello", "world"], ["quick", "world"], @@ -216,7 +214,7 @@ def test_tokenization_enum( ( { "text": "the quick is fast", - "stopwords": _StopwordsCreate(preset=None, additions=None, removals=["the"]), + "stopwords": StopwordsCreate(preset=None, additions=None, removals=["the"]), }, ["the", "quick", "is", "fast"], ["the", "quick", "fast"], @@ -224,7 +222,7 @@ def test_tokenization_enum( ( { "text": "hello world test", - "analyzer_config": _TextAnalyzerConfigCreate(stopword_preset="custom"), + "analyzer_config": TextAnalyzerConfigCreate(stopword_preset="custom"), "stopword_presets": {"custom": ["test"]}, }, ["hello", "world", "test"], @@ -276,7 +274,7 @@ def test_text_from_collection_config( result = client.tokenization.text( text="the quick brown fox", tokenization=recipe.tokenization, - stopwords=_StopwordsCreate(**stopwords.__dict__), + stopwords=StopwordsCreate(**stopwords.__dict__), ) assert result.indexed == ["the", "quick", "brown", "fox"] assert result.query == ["brown", "fox"] @@ -294,7 +292,7 @@ def test_property_and_generic_endpoints_agree( via_generic = client.tokenization.text( text=text, tokenization=recipe.tokenization, - stopwords=_StopwordsCreate(**stopwords.__dict__), + stopwords=StopwordsCreate(**stopwords.__dict__), ) assert via_property.indexed == via_generic.indexed @@ -360,7 +358,7 @@ class TestClientSideValidation: ) def test_ascii_fold_ignore_without_fold_raises(self, kwargs: dict) -> None: with pytest.raises(ValueError, match="asciiFoldIgnore"): - _TextAnalyzerConfigCreate(**kwargs) + TextAnalyzerConfigCreate(**kwargs) @pytest.mark.parametrize( "kwargs,expected", @@ -385,7 +383,7 @@ def test_ascii_fold_ignore_without_fold_raises(self, kwargs: dict) -> None: ids=["fold_with_ignore", "fold_without_ignore", "stopword_preset_only", "empty"], ) def test_valid_config(self, kwargs: dict, expected: dict) -> None: - cfg = _TextAnalyzerConfigCreate(**kwargs) + cfg = TextAnalyzerConfigCreate(**kwargs) for attr, value in expected.items(): assert getattr(cfg, attr) == value @@ -397,9 +395,7 @@ def test_stopwords_and_stopword_presets_mutex(self, client: weaviate.WeaviateCli client.tokenization.text( text="hello", tokenization=Tokenization.WORD, - stopwords=_StopwordsCreate( - preset=StopwordsPreset.EN, additions=None, removals=None - ), + stopwords=StopwordsCreate(preset=StopwordsPreset.EN, additions=None, removals=None), stopword_presets={"custom": ["hello"]}, ) @@ -482,7 +478,7 @@ async def test_text_tokenize(self, async_client: weaviate.WeaviateAsyncClient) - async def test_text_with_stopwords_fallback( self, async_client: weaviate.WeaviateAsyncClient ) -> None: - sw = _StopwordsCreate(preset=StopwordsPreset.EN, additions=["quick"], removals=None) + sw = StopwordsCreate(preset=StopwordsPreset.EN, additions=["quick"], removals=None) result = await async_client.tokenization.text( text="the quick brown fox", tokenization=Tokenization.WORD, diff --git a/weaviate/classes/config.py b/weaviate/classes/config.py index 868cd1c79..c154062d3 100644 --- a/weaviate/classes/config.py +++ b/weaviate/classes/config.py @@ -11,8 +11,10 @@ ReferenceProperty, ReplicationDeletionStrategy, Rerankers, + StopwordsCreate, StopwordsPreset, TextAnalyzerConfig, + TextAnalyzerConfigCreate, Tokenization, VectorDistances, ) @@ -39,8 +41,10 @@ "PQEncoderType", "ReferenceProperty", "Rerankers", + "StopwordsCreate", "StopwordsPreset", "TextAnalyzerConfig", + "TextAnalyzerConfigCreate", "Tokenization", "Vectorizers", "VectorDistances", diff --git a/weaviate/classes/tokenization.py b/weaviate/classes/tokenization.py new file mode 100644 index 000000000..ffb050614 --- /dev/null +++ b/weaviate/classes/tokenization.py @@ -0,0 +1,15 @@ +from weaviate.collections.classes.config import ( + StopwordsCreate, + StopwordsPreset, + TextAnalyzerConfigCreate, + Tokenization, +) +from weaviate.tokenization.models import TokenizeResult + +__all__ = [ + "StopwordsCreate", + "StopwordsPreset", + "TextAnalyzerConfigCreate", + "Tokenization", + "TokenizeResult", +] diff --git a/weaviate/collections/classes/config.py b/weaviate/collections/classes/config.py index 6d60482a3..068399d70 100644 --- a/weaviate/collections/classes/config.py +++ b/weaviate/collections/classes/config.py @@ -1647,6 +1647,7 @@ class _StopwordsConfig(_ConfigBase): StopwordsConfig = _StopwordsConfig +StopwordsCreate = _StopwordsCreate @dataclass @@ -2224,6 +2225,9 @@ def _validate_ascii_fold_ignore(self) -> "_TextAnalyzerConfigCreate": return self +TextAnalyzerConfigCreate = _TextAnalyzerConfigCreate + + class Property(_ConfigCreateModel): """This class defines the structure of a data property that a collection can have within Weaviate. diff --git a/weaviate/tokenization/async_.pyi b/weaviate/tokenization/async_.pyi index 11f4a13fc..59e815d87 100644 --- a/weaviate/tokenization/async_.pyi +++ b/weaviate/tokenization/async_.pyi @@ -1,9 +1,9 @@ from typing import Dict, List, Optional from weaviate.collections.classes.config import ( + StopwordsCreate, + TextAnalyzerConfigCreate, Tokenization, - _StopwordsCreate, - _TextAnalyzerConfigCreate, ) from weaviate.connect.v4 import ConnectionAsync from weaviate.tokenization.models import TokenizeResult @@ -16,7 +16,7 @@ class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]): text: str, tokenization: Tokenization, *, - analyzer_config: Optional[_TextAnalyzerConfigCreate] = None, - stopwords: Optional[_StopwordsCreate] = None, + analyzer_config: Optional[TextAnalyzerConfigCreate] = None, + stopwords: Optional[StopwordsCreate] = None, stopword_presets: Optional[Dict[str, List[str]]] = None, ) -> TokenizeResult: ... diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 25b36e1d3..a3beffd44 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -5,9 +5,9 @@ from httpx import Response from weaviate.collections.classes.config import ( + StopwordsCreate, + TextAnalyzerConfigCreate, Tokenization, - _StopwordsCreate, - _TextAnalyzerConfigCreate, ) from weaviate.connect import executor from weaviate.connect.v4 import ConnectionType, _ExpectedStatusCodes @@ -32,8 +32,8 @@ def text( text: str, tokenization: Tokenization, *, - analyzer_config: Optional[_TextAnalyzerConfigCreate] = None, - stopwords: Optional[_StopwordsCreate] = None, + analyzer_config: Optional[TextAnalyzerConfigCreate] = None, + stopwords: Optional[StopwordsCreate] = None, stopword_presets: Optional[Dict[str, List[str]]] = None, ) -> executor.Result[TokenizeResult]: """Tokenize text using the generic /v1/tokenize endpoint. diff --git a/weaviate/tokenization/sync.pyi b/weaviate/tokenization/sync.pyi index d931aae51..2c2470f85 100644 --- a/weaviate/tokenization/sync.pyi +++ b/weaviate/tokenization/sync.pyi @@ -1,9 +1,9 @@ from typing import Dict, List, Optional from weaviate.collections.classes.config import ( + StopwordsCreate, + TextAnalyzerConfigCreate, Tokenization, - _StopwordsCreate, - _TextAnalyzerConfigCreate, ) from weaviate.connect.v4 import ConnectionSync from weaviate.tokenization.models import TokenizeResult @@ -16,7 +16,7 @@ class _Tokenization(_TokenizationExecutor[ConnectionSync]): text: str, tokenization: Tokenization, *, - analyzer_config: Optional[_TextAnalyzerConfigCreate] = None, - stopwords: Optional[_StopwordsCreate] = None, + analyzer_config: Optional[TextAnalyzerConfigCreate] = None, + stopwords: Optional[StopwordsCreate] = None, stopword_presets: Optional[Dict[str, List[str]]] = None, ) -> TokenizeResult: ... From 55b136adfd37f8289b1aa9ffd3816335b25fd599 Mon Sep 17 00:00:00 2001 From: Dirk Kulawiak Date: Tue, 21 Apr 2026 15:40:47 +0200 Subject: [PATCH 10/14] Add overloads for exclusivity of stopwrods --- integration/test_tokenize.py | 13 ++--- weaviate/tokenization/async_.pyi | 17 +++++-- weaviate/tokenization/executor.py | 83 ++++++++++++++++++++++++------- weaviate/tokenization/sync.pyi | 17 +++++-- 4 files changed, 93 insertions(+), 37 deletions(-) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index a5b16da32..dc244d2c3 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -274,7 +274,7 @@ def test_text_from_collection_config( result = client.tokenization.text( text="the quick brown fox", tokenization=recipe.tokenization, - stopwords=StopwordsCreate(**stopwords.__dict__), + stopwords=stopwords, ) assert result.indexed == ["the", "quick", "brown", "fox"] assert result.query == ["brown", "fox"] @@ -292,7 +292,7 @@ def test_property_and_generic_endpoints_agree( via_generic = client.tokenization.text( text=text, tokenization=recipe.tokenization, - stopwords=StopwordsCreate(**stopwords.__dict__), + stopwords=stopwords, ) assert via_property.indexed == via_generic.indexed @@ -308,13 +308,6 @@ def test_property_and_generic_endpoints_agree( class TestDeserialization: """Verify the client correctly deserializes response fields into TokenizeResult.""" - def test_generic_result_shape(self, client: weaviate.WeaviateClient) -> None: - """Generic endpoint response deserializes into TokenizeResult with indexed and query lists.""" - result = client.tokenization.text(text="hello", tokenization=Tokenization.WORD) - assert isinstance(result, TokenizeResult) - assert isinstance(result.indexed, list) - assert isinstance(result.query, list) - def test_property_result_shape(self, client: weaviate.WeaviateClient) -> None: """Property endpoint response deserializes into TokenizeResult — server resolves tokenization from the property's schema.""" client.collections.delete("TestDeserPropTypes") @@ -405,7 +398,7 @@ def test_stopwords_and_stopword_presets_mutex(self, client: weaviate.WeaviateCli ({"custom": "hello"}, "must be a list of strings"), ( { - "custom": _StopwordsCreate( + "custom": StopwordsCreate( preset=StopwordsPreset.EN, additions=None, removals=None ), }, diff --git a/weaviate/tokenization/async_.pyi b/weaviate/tokenization/async_.pyi index 59e815d87..6bd2d9e8a 100644 --- a/weaviate/tokenization/async_.pyi +++ b/weaviate/tokenization/async_.pyi @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, overload from weaviate.collections.classes.config import ( StopwordsCreate, @@ -11,12 +11,21 @@ from weaviate.tokenization.models import TokenizeResult from .executor import _TokenizationExecutor class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]): + @overload async def text( self, text: str, tokenization: Tokenization, *, - analyzer_config: Optional[TextAnalyzerConfigCreate] = None, - stopwords: Optional[StopwordsCreate] = None, - stopword_presets: Optional[Dict[str, List[str]]] = None, + analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., + stopwords: Optional[StopwordsCreate] = ..., + ) -> TokenizeResult: ... + @overload + async def text( + self, + text: str, + tokenization: Tokenization, + *, + analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., + stopword_presets: Optional[Dict[str, List[str]]] = ..., ) -> TokenizeResult: ... diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index a3beffd44..150cc6dd9 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -1,6 +1,6 @@ """Tokenize executor.""" -from typing import Any, Dict, Generic, List, Optional +from typing import Any, Dict, Generic, List, Optional, overload from httpx import Response @@ -27,6 +27,29 @@ def __check_version(self) -> None: "1.37.0", ) + # Overloads make ``stopwords`` and ``stopword_presets`` mutually exclusive + # at type-check time. Passing both is additionally rejected at runtime with + # ``ValueError`` in the implementation below. + @overload + def text( + self, + text: str, + tokenization: Tokenization, + *, + analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., + stopwords: Optional[StopwordsCreate] = ..., + ) -> executor.Result[TokenizeResult]: ... + + @overload + def text( + self, + text: str, + tokenization: Tokenization, + *, + analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., + stopword_presets: Optional[Dict[str, List[str]]] = ..., + ) -> executor.Result[TokenizeResult]: ... + def text( self, text: str, @@ -40,33 +63,55 @@ def text( For ``word`` tokenization the server defaults to the built-in ``en`` stopword preset when no stopword configuration is supplied. Pass - ``analyzer_config=Configure.text_analyzer(stopword_preset=StopwordsPreset.NONE)`` - (or equivalent) to opt out. + ``analyzer_config=TextAnalyzerConfigCreate(stopword_preset="none")`` + or equivalent to opt out. + + Call patterns for stopword handling (``stopwords`` and + ``stopword_presets`` are mutually exclusive — pass at most one): + + 1. **No stopword config** — rely on the server default (``en`` for + word tokenization, none otherwise):: + + client.tokenization.text(text=..., tokenization=Tokenization.WORD) + + 2. **Apply a one-off stopwords block** via ``stopwords`` — the block + filters the query tokens directly, same shape as a collection's + ``invertedIndexConfig.stopwords``:: + + client.tokenization.text( + text=..., + tokenization=Tokenization.WORD, + stopwords=StopwordsCreate(preset=StopwordsPreset.EN, additions=["foo"]), + ) + + 3. **Register a named-preset catalog** via ``stopword_presets`` and + reference one by name from ``analyzer_config.stopword_preset``. + The catalog can also override built-in presets such as ``en``:: + + client.tokenization.text( + text=..., + tokenization=Tokenization.WORD, + analyzer_config=TextAnalyzerConfigCreate(stopword_preset="custom"), + stopword_presets={"custom": ["foo", "bar"]}, + ) Args: text: The text to tokenize. - tokenization: The tokenization method to use (e.g. Tokenization.WORD). + tokenization: The tokenization method to use (e.g. ``Tokenization.WORD``). analyzer_config: Text analyzer settings (ASCII folding, stopword preset name), built via ``Configure.text_analyzer(...)``. ``stopword_preset`` may reference a built-in preset (``en`` / ``none``) or a name defined in ``stopword_presets``. - stopwords: Fallback stopword config applied when - ``analyzer_config.stopword_preset`` is not set. Same shape as a - collection's ``invertedIndexConfig.stopwords`` — a base preset - optionally tweaked with ``additions`` / ``removals``. An empty - ``preset`` defaults to ``en``. - stopword_presets: User-defined named stopword presets, each a plain - list of words. A name matching a built-in (``en`` / ``none``) - replaces the built-in entirely. - - Note: - ``stopwords`` and ``stopword_presets`` are mutually exclusive on the - server — pass one or the other, not both. The server returns HTTP - 422 if both are supplied. + stopwords: One-off stopwords block applied directly to this request. + Mutually exclusive with ``stopword_presets``. + stopword_presets: Named-preset catalog (name → word list). Entries + can be referenced from ``analyzer_config.stopword_preset`` or + override built-ins like ``en``. Mutually exclusive with + ``stopwords``. Returns: - A TokenizeResult with indexed and query token lists. The response - does not echo request fields back. + A ``TokenizeResult`` with indexed and query token lists. The generic + endpoint does not echo request fields back in the response. Raises: WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. diff --git a/weaviate/tokenization/sync.pyi b/weaviate/tokenization/sync.pyi index 2c2470f85..7edf8994a 100644 --- a/weaviate/tokenization/sync.pyi +++ b/weaviate/tokenization/sync.pyi @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional +from typing import Dict, List, Optional, overload from weaviate.collections.classes.config import ( StopwordsCreate, @@ -11,12 +11,21 @@ from weaviate.tokenization.models import TokenizeResult from .executor import _TokenizationExecutor class _Tokenization(_TokenizationExecutor[ConnectionSync]): + @overload def text( self, text: str, tokenization: Tokenization, *, - analyzer_config: Optional[TextAnalyzerConfigCreate] = None, - stopwords: Optional[StopwordsCreate] = None, - stopword_presets: Optional[Dict[str, List[str]]] = None, + analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., + stopwords: Optional[StopwordsCreate] = ..., + ) -> TokenizeResult: ... + @overload + def text( + self, + text: str, + tokenization: Tokenization, + *, + analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., + stopword_presets: Optional[Dict[str, List[str]]] = ..., ) -> TokenizeResult: ... From 7924e457cdd3315325db9a9e7ce2cc51bf7d6d04 Mon Sep 17 00:00:00 2001 From: Dirk Kulawiak Date: Tue, 21 Apr 2026 15:58:08 +0200 Subject: [PATCH 11/14] Accept collection config classes as stopwords --- weaviate/classes/tokenization.py | 2 ++ weaviate/collections/classes/config.py | 20 ++++++++++++++++++++ weaviate/tokenization/async_.pyi | 5 +++-- weaviate/tokenization/executor.py | 21 +++++++++++++++++---- weaviate/tokenization/sync.pyi | 5 +++-- 5 files changed, 45 insertions(+), 8 deletions(-) diff --git a/weaviate/classes/tokenization.py b/weaviate/classes/tokenization.py index ffb050614..0e89fc64b 100644 --- a/weaviate/classes/tokenization.py +++ b/weaviate/classes/tokenization.py @@ -1,4 +1,5 @@ from weaviate.collections.classes.config import ( + StopwordsConfig, StopwordsCreate, StopwordsPreset, TextAnalyzerConfigCreate, @@ -7,6 +8,7 @@ from weaviate.tokenization.models import TokenizeResult __all__ = [ + "StopwordsConfig", "StopwordsCreate", "StopwordsPreset", "TextAnalyzerConfigCreate", diff --git a/weaviate/collections/classes/config.py b/weaviate/collections/classes/config.py index 068399d70..43d86375d 100644 --- a/weaviate/collections/classes/config.py +++ b/weaviate/collections/classes/config.py @@ -1,5 +1,6 @@ import datetime from dataclasses import dataclass +from dataclasses import fields as _dataclass_fields from typing import ( Any, ClassVar, @@ -1649,6 +1650,25 @@ class _StopwordsConfig(_ConfigBase): StopwordsConfig = _StopwordsConfig StopwordsCreate = _StopwordsCreate +# Invariant: the read-side dataclass (_StopwordsConfig) and the write-side +# pydantic model (_StopwordsCreate) must carry the same set of field names so +# that values round-tripped from ``collection.config.get()`` can flow back into +# ``tokenization.text()`` without silent data loss. If a field is added to one +# but not the other, importing this module fails loudly; the read→write +# conversion in ``weaviate/tokenization/executor.py::_TokenizationExecutor.text`` +# depends on this parity. +_read_fields = {f.name for f in _dataclass_fields(_StopwordsConfig)} +_write_fields = set(_StopwordsCreate.model_fields.keys()) +if _read_fields != _write_fields: + raise RuntimeError( + "_StopwordsConfig / _StopwordsCreate field drift detected — " + f"read-only={_read_fields - _write_fields}, " + f"write-only={_write_fields - _read_fields}. " + "Update both classes together, or adapt the read→write conversion in " + "weaviate/tokenization/executor.py::_TokenizationExecutor.text." + ) +del _read_fields, _write_fields + @dataclass class _InvertedIndexConfig(_ConfigBase): diff --git a/weaviate/tokenization/async_.pyi b/weaviate/tokenization/async_.pyi index 6bd2d9e8a..156e25c90 100644 --- a/weaviate/tokenization/async_.pyi +++ b/weaviate/tokenization/async_.pyi @@ -1,6 +1,7 @@ -from typing import Dict, List, Optional, overload +from typing import Dict, List, Optional, Union, overload from weaviate.collections.classes.config import ( + StopwordsConfig, StopwordsCreate, TextAnalyzerConfigCreate, Tokenization, @@ -18,7 +19,7 @@ class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]): tokenization: Tokenization, *, analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., - stopwords: Optional[StopwordsCreate] = ..., + stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ..., ) -> TokenizeResult: ... @overload async def text( diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 150cc6dd9..ea36e1cda 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -1,10 +1,11 @@ """Tokenize executor.""" -from typing import Any, Dict, Generic, List, Optional, overload +from typing import Any, Dict, Generic, List, Optional, Union, overload from httpx import Response from weaviate.collections.classes.config import ( + StopwordsConfig, StopwordsCreate, TextAnalyzerConfigCreate, Tokenization, @@ -29,7 +30,10 @@ def __check_version(self) -> None: # Overloads make ``stopwords`` and ``stopword_presets`` mutually exclusive # at type-check time. Passing both is additionally rejected at runtime with - # ``ValueError`` in the implementation below. + # ``ValueError`` in the implementation below. ``stopwords`` accepts either a + # ``StopwordsCreate`` (the write-side shape) or a ``StopwordsConfig`` (the + # read-side shape returned by ``collection.config.get()``), so values round- + # tripped through config reads can be passed back in directly. @overload def text( self, @@ -37,7 +41,7 @@ def text( tokenization: Tokenization, *, analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., - stopwords: Optional[StopwordsCreate] = ..., + stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ..., ) -> executor.Result[TokenizeResult]: ... @overload @@ -56,7 +60,7 @@ def text( tokenization: Tokenization, *, analyzer_config: Optional[TextAnalyzerConfigCreate] = None, - stopwords: Optional[StopwordsCreate] = None, + stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = None, stopword_presets: Optional[Dict[str, List[str]]] = None, ) -> executor.Result[TokenizeResult]: """Tokenize text using the generic /v1/tokenize endpoint. @@ -134,6 +138,15 @@ def text( payload["analyzerConfig"] = ac_dict if stopwords is not None: + if isinstance(stopwords, StopwordsConfig): + # Widen from the read-side shape returned by config.get() to the + # write-side shape the server expects. Field parity between the + # two classes is enforced at import time in + # ``weaviate/collections/classes/config.py``, so iterating + # ``StopwordsCreate.model_fields`` copies every field. + stopwords = StopwordsCreate( + **{name: getattr(stopwords, name) for name in StopwordsCreate.model_fields} + ) sw_dict = stopwords._to_dict() if sw_dict: payload["stopwords"] = sw_dict diff --git a/weaviate/tokenization/sync.pyi b/weaviate/tokenization/sync.pyi index 7edf8994a..389edd485 100644 --- a/weaviate/tokenization/sync.pyi +++ b/weaviate/tokenization/sync.pyi @@ -1,6 +1,7 @@ -from typing import Dict, List, Optional, overload +from typing import Dict, List, Optional, Union, overload from weaviate.collections.classes.config import ( + StopwordsConfig, StopwordsCreate, TextAnalyzerConfigCreate, Tokenization, @@ -18,7 +19,7 @@ class _Tokenization(_TokenizationExecutor[ConnectionSync]): tokenization: Tokenization, *, analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., - stopwords: Optional[StopwordsCreate] = ..., + stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ..., ) -> TokenizeResult: ... @overload def text( From 64bed62ea2dc6f3a05984d1ae4ce0700600027c7 Mon Sep 17 00:00:00 2001 From: Dirk Kulawiak Date: Tue, 21 Apr 2026 16:24:06 +0200 Subject: [PATCH 12/14] Improve docstring --- weaviate/tokenization/executor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index ea36e1cda..0d287ba0e 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -107,11 +107,15 @@ def text( ``stopword_preset`` may reference a built-in preset (``en`` / ``none``) or a name defined in ``stopword_presets``. stopwords: One-off stopwords block applied directly to this request. + Mirrors the collection-level ``invertedIndexConfig.stopwords`` + shape — hence the rich model with preset / additions / removals. Mutually exclusive with ``stopword_presets``. - stopword_presets: Named-preset catalog (name → word list). Entries - can be referenced from ``analyzer_config.stopword_preset`` or - override built-ins like ``en``. Mutually exclusive with - ``stopwords``. + stopword_presets: Named-preset catalog (name → word list). Mirrors + the property-level preset catalog — a plain mapping, since a + property only references a preset by name (via + ``analyzer_config.stopword_preset``) rather than carrying the + full stopwords block. Entries can override built-ins like + ``en``. Mutually exclusive with ``stopwords``. Returns: A ``TokenizeResult`` with indexed and query token lists. The generic From 220e839360848a8c67b7eab322a97b232d12d5c9 Mon Sep 17 00:00:00 2001 From: Dirk Kulawiak Date: Tue, 21 Apr 2026 16:26:17 +0200 Subject: [PATCH 13/14] Hook up tokenization and clean up model --- weaviate/classes/__init__.py | 2 ++ weaviate/tokenization/models.py | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/weaviate/classes/__init__.py b/weaviate/classes/__init__.py index d495744ac..69af5d920 100644 --- a/weaviate/classes/__init__.py +++ b/weaviate/classes/__init__.py @@ -13,6 +13,7 @@ rbac, replication, tenants, + tokenization, ) # noqa: F401 from .config import ConsistencyLevel @@ -29,6 +30,7 @@ "init", "query", "tenants", + "tokenization", "rbac", "replication", ] diff --git a/weaviate/tokenization/models.py b/weaviate/tokenization/models.py index f8fe7cb67..baeac140c 100644 --- a/weaviate/tokenization/models.py +++ b/weaviate/tokenization/models.py @@ -2,7 +2,7 @@ from typing import List -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel class TokenizeResult(BaseModel): @@ -13,7 +13,5 @@ class TokenizeResult(BaseModel): query: Tokens as they would be used for querying (after stopword removal). """ - model_config = ConfigDict(populate_by_name=True, arbitrary_types_allowed=True) - indexed: List[str] query: List[str] From 081aaef36f83890eeb839e44363851477c44d1ca Mon Sep 17 00:00:00 2001 From: Dirk Kulawiak Date: Wed, 22 Apr 2026 16:54:36 +0200 Subject: [PATCH 14/14] Move property back to tokenization --- integration/test_tokenize.py | 18 ++++++----- weaviate/collections/config/async_.pyi | 2 -- weaviate/collections/config/executor.py | 40 ------------------------- weaviate/collections/config/sync.pyi | 2 -- weaviate/tokenization/async_.pyi | 3 ++ weaviate/tokenization/executor.py | 40 +++++++++++++++++++++++++ weaviate/tokenization/sync.pyi | 1 + 7 files changed, 54 insertions(+), 52 deletions(-) diff --git a/integration/test_tokenize.py b/integration/test_tokenize.py index dc244d2c3..d2d46916d 100644 --- a/integration/test_tokenize.py +++ b/integration/test_tokenize.py @@ -288,7 +288,9 @@ def test_property_and_generic_endpoints_agree( stopwords = config.inverted_index_config.stopwords text = "the quick brown fox" - via_property = recipe_collection.config.tokenize_property(property_name="recipe", text=text) + via_property = client.tokenization.for_property( + collection=recipe_collection.name, property_name="recipe", text=text + ) via_generic = client.tokenization.text( text=text, tokenization=recipe.tokenization, @@ -325,8 +327,9 @@ def test_property_result_shape(self, client: weaviate.WeaviateClient) -> None: ], } ) - col = client.collections.get("TestDeserPropTypes") - result = col.config.tokenize_property(property_name="tag", text=" Hello World ") + result = client.tokenization.for_property( + collection="TestDeserPropTypes", property_name="tag", text=" Hello World " + ) assert isinstance(result, TokenizeResult) assert result.indexed == ["Hello World"] finally: @@ -442,9 +445,8 @@ def test_text_raises_on_old_server(self, client: weaviate.WeaviateClient) -> Non def test_tokenize_property_raises_on_old_server(self, client: weaviate.WeaviateClient) -> None: if client._connection._weaviate_version.is_at_least(1, 37, 0): pytest.skip("Version gate only applies to Weaviate < 1.37.0") - col = client.collections.get("Any") with pytest.raises(WeaviateUnsupportedFeatureError): - col.config.tokenize_property(property_name="title", text="hello") + client.tokenization.for_property(collection="Any", property_name="title", text="hello") # --------------------------------------------------------------------------- @@ -454,7 +456,7 @@ def test_tokenize_property_raises_on_old_server(self, client: weaviate.WeaviateC @pytest.mark.usefixtures("require_1_37") class TestAsyncClient: - """Verify text() and tokenize_property() work through the async client.""" + """Verify tokenization.text() and tokenization.for_property() work through the async client.""" @pytest.mark.asyncio async def test_text_tokenize(self, async_client: weaviate.WeaviateAsyncClient) -> None: @@ -498,8 +500,8 @@ async def test_property_tokenize(self, async_client: weaviate.WeaviateAsyncClien ], } ) - col = async_client.collections.get("TestAsyncPropTokenize") - result = await col.config.tokenize_property( + result = await async_client.tokenization.for_property( + collection="TestAsyncPropTokenize", property_name="title", text="The quick brown fox", ) diff --git a/weaviate/collections/config/async_.pyi b/weaviate/collections/config/async_.pyi index a1f740ded..015b70dab 100644 --- a/weaviate/collections/config/async_.pyi +++ b/weaviate/collections/config/async_.pyi @@ -27,7 +27,6 @@ from weaviate.collections.classes.config import ( from weaviate.collections.classes.config_object_ttl import _ObjectTTLConfigUpdate from weaviate.collections.classes.config_vector_index import _VectorIndexConfigDynamicUpdate from weaviate.connect.v4 import ConnectionAsync -from weaviate.tokenization.models import TokenizeResult from .executor import _ConfigCollectionExecutor @@ -91,4 +90,3 @@ class _ConfigCollectionAsync(_ConfigCollectionExecutor[ConnectionAsync]): self, *, vector_config: Union[_VectorConfigCreate, List[_VectorConfigCreate]] ) -> None: ... async def delete_property_index(self, property_name: str, index_name: IndexName) -> bool: ... - async def tokenize_property(self, property_name: str, text: str) -> TokenizeResult: ... diff --git a/weaviate/collections/config/executor.py b/weaviate/collections/config/executor.py index fe9f5ec0d..103ab70ac 100644 --- a/weaviate/collections/config/executor.py +++ b/weaviate/collections/config/executor.py @@ -56,7 +56,6 @@ WeaviateInvalidInputError, WeaviateUnsupportedFeatureError, ) -from weaviate.tokenization.models import TokenizeResult from weaviate.util import ( _capitalize_first_letter, _decode_json_response_dict, @@ -667,42 +666,3 @@ def resp(res: Response) -> bool: error_msg="Property may not exist", status_codes=_ExpectedStatusCodes(ok_in=[200], error="property exists"), ) - - def tokenize_property( - self, - property_name: str, - text: str, - ) -> executor.Result[TokenizeResult]: - """Tokenize text using a property's configured tokenization settings. - - Args: - property_name: The property name whose tokenization config to use. - text: The text to tokenize. - - Returns: - A TokenizeResult with indexed and query token lists. - - Raises: - WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. - """ - if self._connection._weaviate_version.is_lower_than(1, 37, 0): - raise WeaviateUnsupportedFeatureError( - "Tokenization", - str(self._connection._weaviate_version), - "1.37.0", - ) - - path = f"/schema/{self._name}/properties/{property_name}/tokenize" - payload: Dict[str, Any] = {"text": text} - - def resp(response: Response) -> TokenizeResult: - return TokenizeResult.model_validate(response.json()) - - return executor.execute( - response_callback=resp, - method=self._connection.post, - path=path, - weaviate_object=payload, - error_msg="Property tokenization failed", - status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize property text"), - ) diff --git a/weaviate/collections/config/sync.pyi b/weaviate/collections/config/sync.pyi index 3664a0e1b..e54d8c8fc 100644 --- a/weaviate/collections/config/sync.pyi +++ b/weaviate/collections/config/sync.pyi @@ -27,7 +27,6 @@ from weaviate.collections.classes.config import ( from weaviate.collections.classes.config_object_ttl import _ObjectTTLConfigUpdate from weaviate.collections.classes.config_vector_index import _VectorIndexConfigDynamicUpdate from weaviate.connect.v4 import ConnectionSync -from weaviate.tokenization.models import TokenizeResult from .executor import _ConfigCollectionExecutor @@ -89,4 +88,3 @@ class _ConfigCollection(_ConfigCollectionExecutor[ConnectionSync]): self, *, vector_config: Union[_VectorConfigCreate, List[_VectorConfigCreate]] ) -> None: ... def delete_property_index(self, property_name: str, index_name: IndexName) -> bool: ... - def tokenize_property(self, property_name: str, text: str) -> TokenizeResult: ... diff --git a/weaviate/tokenization/async_.pyi b/weaviate/tokenization/async_.pyi index 156e25c90..ba12abc2a 100644 --- a/weaviate/tokenization/async_.pyi +++ b/weaviate/tokenization/async_.pyi @@ -30,3 +30,6 @@ class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]): analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., stopword_presets: Optional[Dict[str, List[str]]] = ..., ) -> TokenizeResult: ... + async def for_property( + self, collection: str, property_name: str, text: str + ) -> TokenizeResult: ... diff --git a/weaviate/tokenization/executor.py b/weaviate/tokenization/executor.py index 0d287ba0e..33f1c05f9 100644 --- a/weaviate/tokenization/executor.py +++ b/weaviate/tokenization/executor.py @@ -14,6 +14,7 @@ from weaviate.connect.v4 import ConnectionType, _ExpectedStatusCodes from weaviate.exceptions import WeaviateUnsupportedFeatureError from weaviate.tokenization.models import TokenizeResult +from weaviate.util import _capitalize_first_letter class _TokenizationExecutor(Generic[ConnectionType]): @@ -189,3 +190,42 @@ def resp(response: Response) -> TokenizeResult: error_msg="Tokenization failed", status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize text"), ) + + def for_property( + self, + collection: str, + property_name: str, + text: str, + ) -> executor.Result[TokenizeResult]: + """Tokenize text using a property's configured tokenization settings. + + The server resolves the tokenization and analyzer configuration from + the property's schema, so callers only supply the text. + + Args: + collection: The collection that owns the property. + property_name: The property name whose tokenization config to use. + text: The text to tokenize. + + Returns: + A TokenizeResult with indexed and query token lists. + + Raises: + WeaviateUnsupportedFeatureError: If the server version is below 1.37.0. + """ + self.__check_version() + + path = f"/schema/{_capitalize_first_letter(collection)}/properties/{property_name}/tokenize" + payload: Dict[str, Any] = {"text": text} + + def resp(response: Response) -> TokenizeResult: + return TokenizeResult.model_validate(response.json()) + + return executor.execute( + response_callback=resp, + method=self._connection.post, + path=path, + weaviate_object=payload, + error_msg="Property tokenization failed", + status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize property text"), + ) diff --git a/weaviate/tokenization/sync.pyi b/weaviate/tokenization/sync.pyi index 389edd485..71aaaea5c 100644 --- a/weaviate/tokenization/sync.pyi +++ b/weaviate/tokenization/sync.pyi @@ -30,3 +30,4 @@ class _Tokenization(_TokenizationExecutor[ConnectionSync]): analyzer_config: Optional[TextAnalyzerConfigCreate] = ..., stopword_presets: Optional[Dict[str, List[str]]] = ..., ) -> TokenizeResult: ... + def for_property(self, collection: str, property_name: str, text: str) -> TokenizeResult: ...