Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ env:
WEAVIATE_132: 1.32.27
WEAVIATE_133: 1.33.18
WEAVIATE_134: 1.34.19
WEAVIATE_135: 1.35.17
WEAVIATE_136: 1.36.10
WEAVIATE_137: 1.37.1
WEAVIATE_135: 1.35.18
WEAVIATE_136: 1.36.12
WEAVIATE_137: 1.37.1-4e61e26.amd64

jobs:
lint-and-format:
Expand Down
520 changes: 327 additions & 193 deletions integration/test_tokenize.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions weaviate/classes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
rbac,
replication,
tenants,
tokenization,
) # noqa: F401
from .config import ConsistencyLevel

Expand All @@ -29,6 +30,7 @@
"init",
"query",
"tenants",
"tokenization",
"rbac",
"replication",
]
4 changes: 4 additions & 0 deletions weaviate/classes/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
ReferenceProperty,
ReplicationDeletionStrategy,
Rerankers,
StopwordsCreate,
StopwordsPreset,
TextAnalyzerConfig,
TextAnalyzerConfigCreate,
Tokenization,
VectorDistances,
)
Expand All @@ -39,8 +41,10 @@
"PQEncoderType",
"ReferenceProperty",
"Rerankers",
"StopwordsCreate",
"StopwordsPreset",
"TextAnalyzerConfig",
"TextAnalyzerConfigCreate",
"Tokenization",
"Vectorizers",
"VectorDistances",
Expand Down
17 changes: 17 additions & 0 deletions weaviate/classes/tokenization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Public re-exports for tokenization configuration and results.

Exposes the stopwords/tokenization configuration classes and the
``TokenizeResult`` model under ``weaviate.classes.tokenization`` so users can
import them from one stable, user-facing location.
"""

from weaviate.collections.classes.config import (
    StopwordsConfig,
    StopwordsCreate,
    StopwordsPreset,
    TextAnalyzerConfigCreate,
    Tokenization,
)
from weaviate.tokenization.models import TokenizeResult

# Names that form the public API of this module.
__all__ = [
    "StopwordsConfig",
    "StopwordsCreate",
    "StopwordsPreset",
    "TextAnalyzerConfigCreate",
    "Tokenization",
    "TokenizeResult",
]
24 changes: 24 additions & 0 deletions weaviate/collections/classes/config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
from dataclasses import dataclass
from dataclasses import fields as _dataclass_fields
from typing import (
Any,
ClassVar,
Expand Down Expand Up @@ -1647,6 +1648,26 @@ class _StopwordsConfig(_ConfigBase):


StopwordsConfig = _StopwordsConfig
StopwordsCreate = _StopwordsCreate


# Invariant: the read-side dataclass (_StopwordsConfig) and the write-side
# pydantic model (_StopwordsCreate) must expose identical field names, so that
# values obtained from ``collection.config.get()`` can be fed back into
# ``tokenization.text()`` without silent data loss. The read→write conversion
# in ``weaviate/tokenization/executor.py::_TokenizationExecutor.text`` relies
# on this parity; importing this module fails loudly if the classes drift.
def _assert_stopwords_field_parity() -> None:
    # Compare the dataclass field names against the pydantic model's fields.
    reads = {field.name for field in _dataclass_fields(_StopwordsConfig)}
    writes = set(_StopwordsCreate.model_fields)
    if reads == writes:
        return
    raise RuntimeError(
        "_StopwordsConfig / _StopwordsCreate field drift detected — "
        f"read-only={reads - writes}, "
        f"write-only={writes - reads}. "
        "Update both classes together, or adapt the read→write conversion in "
        "weaviate/tokenization/executor.py::_TokenizationExecutor.text."
    )


_assert_stopwords_field_parity()
# Avoid leaking a private helper into the module namespace.
del _assert_stopwords_field_parity
Comment thread
dirkkul marked this conversation as resolved.


@dataclass
Expand Down Expand Up @@ -2224,6 +2245,9 @@ def _validate_ascii_fold_ignore(self) -> "_TextAnalyzerConfigCreate":
return self


TextAnalyzerConfigCreate = _TextAnalyzerConfigCreate


class Property(_ConfigCreateModel):
"""This class defines the structure of a data property that a collection can have within Weaviate.

Expand Down
2 changes: 0 additions & 2 deletions weaviate/collections/config/async_.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ from weaviate.collections.classes.config import (
from weaviate.collections.classes.config_object_ttl import _ObjectTTLConfigUpdate
from weaviate.collections.classes.config_vector_index import _VectorIndexConfigDynamicUpdate
from weaviate.connect.v4 import ConnectionAsync
from weaviate.tokenization.models import TokenizeResult

from .executor import _ConfigCollectionExecutor

Expand Down Expand Up @@ -91,4 +90,3 @@ class _ConfigCollectionAsync(_ConfigCollectionExecutor[ConnectionAsync]):
self, *, vector_config: Union[_VectorConfigCreate, List[_VectorConfigCreate]]
) -> None: ...
async def delete_property_index(self, property_name: str, index_name: IndexName) -> bool: ...
async def tokenize_property(self, property_name: str, text: str) -> TokenizeResult: ...
40 changes: 0 additions & 40 deletions weaviate/collections/config/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@
WeaviateInvalidInputError,
WeaviateUnsupportedFeatureError,
)
from weaviate.tokenization.models import TokenizeResult
from weaviate.util import (
_capitalize_first_letter,
_decode_json_response_dict,
Expand Down Expand Up @@ -667,42 +666,3 @@ def resp(res: Response) -> bool:
error_msg="Property may not exist",
status_codes=_ExpectedStatusCodes(ok_in=[200], error="property exists"),
)

def tokenize_property(
    self,
    property_name: str,
    text: str,
) -> executor.Result[TokenizeResult]:
    """Tokenize text using a property's configured tokenization settings.

    Args:
        property_name: The property name whose tokenization config to use.
        text: The text to tokenize.

    Returns:
        A TokenizeResult with indexed and query token lists.

    Raises:
        WeaviateUnsupportedFeatureError: If the server version is below 1.37.0.
    """
    server_version = self._connection._weaviate_version
    # The per-property tokenize endpoint only exists from server 1.37.0 on.
    if server_version.is_lower_than(1, 37, 0):
        raise WeaviateUnsupportedFeatureError(
            "Tokenization",
            str(server_version),
            "1.37.0",
        )

    def parse(response: Response) -> TokenizeResult:
        # Deserialize the JSON body straight into the pydantic result model.
        return TokenizeResult.model_validate(response.json())

    body: Dict[str, Any] = {"text": text}
    return executor.execute(
        response_callback=parse,
        method=self._connection.post,
        path=f"/schema/{self._name}/properties/{property_name}/tokenize",
        weaviate_object=body,
        error_msg="Property tokenization failed",
        status_codes=_ExpectedStatusCodes(ok_in=[200], error="tokenize property text"),
    )
2 changes: 0 additions & 2 deletions weaviate/collections/config/sync.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ from weaviate.collections.classes.config import (
from weaviate.collections.classes.config_object_ttl import _ObjectTTLConfigUpdate
from weaviate.collections.classes.config_vector_index import _VectorIndexConfigDynamicUpdate
from weaviate.connect.v4 import ConnectionSync
from weaviate.tokenization.models import TokenizeResult

from .executor import _ConfigCollectionExecutor

Expand Down Expand Up @@ -89,4 +88,3 @@ class _ConfigCollection(_ConfigCollectionExecutor[ConnectionSync]):
self, *, vector_config: Union[_VectorConfigCreate, List[_VectorConfigCreate]]
) -> None: ...
def delete_property_index(self, property_name: str, index_name: IndexName) -> bool: ...
def tokenize_property(self, property_name: str, text: str) -> TokenizeResult: ...
24 changes: 19 additions & 5 deletions weaviate/tokenization/async_.pyi
Original file line number Diff line number Diff line change
@@ -1,21 +1,35 @@
from typing import Dict, Optional
from typing import Dict, List, Optional, Union, overload

from weaviate.collections.classes.config import (
StopwordsConfig,
StopwordsCreate,
TextAnalyzerConfigCreate,
Tokenization,
_StopwordsCreate,
_TextAnalyzerConfigCreate,
)
from weaviate.connect.v4 import ConnectionAsync
from weaviate.tokenization.models import TokenizeResult

from .executor import _TokenizationExecutor

class _TokenizationAsync(_TokenizationExecutor[ConnectionAsync]):
@overload
async def text(
self,
text: str,
tokenization: Tokenization,
*,
analyzer_config: Optional[_TextAnalyzerConfigCreate] = None,
stopword_presets: Optional[Dict[str, _StopwordsCreate]] = None,
analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
stopwords: Optional[Union[StopwordsCreate, StopwordsConfig]] = ...,
) -> TokenizeResult: ...
@overload
async def text(
self,
text: str,
tokenization: Tokenization,
*,
analyzer_config: Optional[TextAnalyzerConfigCreate] = ...,
stopword_presets: Optional[Dict[str, List[str]]] = ...,
) -> TokenizeResult: ...
async def for_property(
self, collection: str, property_name: str, text: str
) -> TokenizeResult: ...
Loading
Loading