From 612b5ac20be4646973f1aab0043a59a99bd94205 Mon Sep 17 00:00:00 2001 From: Jamie Milsom Date: Wed, 24 Jun 2026 11:14:33 +0100 Subject: [PATCH 1/4] docs: fix typos, spelling and docstring formatting in vectorisers --- src/classifai/vectorisers/__init__.py | 28 +++++++++++------------- src/classifai/vectorisers/base.py | 12 +++++----- src/classifai/vectorisers/gcp.py | 8 +++---- src/classifai/vectorisers/huggingface.py | 10 ++++----- src/classifai/vectorisers/ollama.py | 10 ++++----- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/src/classifai/vectorisers/__init__.py b/src/classifai/vectorisers/__init__.py index 8d6cb21..0c588c1 100644 --- a/src/classifai/vectorisers/__init__.py +++ b/src/classifai/vectorisers/__init__.py @@ -1,5 +1,5 @@ # pylint: disable=C0301 -"""This module provides classes for creating and utilizing embedding models. +"""This module provides classes for creating and utilising embedding models from different services. The Vectoriser module offers a unified interface to interact with various other ClassifAI Package Modules. Generally Vectorisers are used to convert text data into numerical embeddings that can be used for machine learning tasks. @@ -8,11 +8,10 @@ ########################### # Vectoriser Overview -In our Package, Vectoriser have a simple role: - - * Take in text data (as a string or list of strings) - * Output numerical embeddings (as a numpy array) - * Each Vectortiser should provide a `transform` method to perform this conversion. +In our Package, Vectorisers have a simple role: + - Take in text data (as a string or list of strings) + - Output numerical embeddings (as a numpy array) + - Each Vectoriser should provide a `transform` method to perform this conversion. It is possible for users to implement their own Vectoriser classes by inheriting from the `VectoriserBase` abstract base class and implementing the `transform` method. 
@@ -22,15 +21,14 @@ ########################### # Implemented Vectorisers -We provide several robust implementations of Vectorisers that interface with popular services and libraries. - -The module contains the following 'ready-made' classes: +We provide several quick implementations of Vectorisers that interface with popular services and libraries. - * `GcpVectoriser`: A class for embedding text using Google Cloud Platform's GenAI API. - * `HuggingFaceVectoriser`: A general wrapper class for Huggingface Transformers - models to generate text embeddings. - * `OllamaVectoriser`: A general wrapper class for using a locally running ollama - server to generate text embeddings. +This module contains the following 'ready-made' classes: +- `GcpVectoriser`: A class for embedding text using Google Cloud Platform's GenAI API. +- `HuggingFaceVectoriser`: A general wrapper class for Huggingface Transformers +models to generate text embeddings. +- `OllamaVectoriser`: A general wrapper class for using a locally running Ollama +server to generate text embeddings. Each class is designed to interface with a specific service that provides embedding model functionality. @@ -39,7 +37,7 @@ The `HuggingFaceVectoriser` class utilizes models from the Huggingface Transformers library. -The `OllamaVectoriser` class can use any local/downloaded model which can be served by ollama. +The `OllamaVectoriser` class can use any local/downloaded model which can be served by Ollama. These classes abstract the underlying implementation details, providing a simple and consistent interface for embedding text using different services. diff --git a/src/classifai/vectorisers/base.py b/src/classifai/vectorisers/base.py index f721901..b001134 100644 --- a/src/classifai/vectorisers/base.py +++ b/src/classifai/vectorisers/base.py @@ -1,4 +1,4 @@ -"""This module provides classes for creating and utilizing embedding models from different services. 
+"""This module provides classes for creating and utilising embedding models from different services. The Vectoriser module offers a unified interface to interact with various other ClassifAI Package Modules. Generally Vectorisers are used to convert text data into numerical embeddings that can be used for machine learning tasks. @@ -7,10 +7,10 @@ ########################### # Vectoriser Overview -In our Package, Vectoriser have a simple role: +In our Package, Vectorisers have a simple role: - Take in text data (as a string or list of strings) - Output numerical embeddings (as a numpy array) - - Each Vectortiser should provide a `transform` method to perform this conversion. + - Each Vectoriser should provide a `transform` method to perform this conversion. It is possible for users to implement their own Vectoriser classes by inheriting from the `VectoriserBase` abstract base class and implementing the `transform` method. @@ -22,11 +22,11 @@ We provide several quick implementations of Vectorisers that interface with popular services and libraries. -The module contains the following 'ready-made' classes: +This module contains the following 'ready-made' classes: - `GcpVectoriser`: A class for embedding text using Google Cloud Platform's GenAI API. - `HuggingFaceVectoriser`: A general wrapper class for Huggingface Transformers models to generate text embeddings. -- `OllamaVectoriser`: A general wrapper class for using a locally running ollama +- `OllamaVectoriser`: A general wrapper class for using a locally running Ollama server to generate text embeddings. Each class is designed to interface with a specific service that provides embedding model @@ -36,7 +36,7 @@ The `HuggingFaceVectoriser` class utilizes models from the Huggingface Transformers library. -The `OllamaVectoriser` class can use any local/downloaded model which can be served by ollama. +The `OllamaVectoriser` class can use any local/downloaded model which can be served by Ollama. 
These classes abstract the underlying implementation details, providing a simple and consistent interface for embedding text using different services. diff --git a/src/classifai/vectorisers/gcp.py b/src/classifai/vectorisers/gcp.py index 10f55d3..f086f08 100644 --- a/src/classifai/vectorisers/gcp.py +++ b/src/classifai/vectorisers/gcp.py @@ -48,12 +48,12 @@ def __init__( task_type="CLASSIFICATION", **client_kwargs, ): - """Initializes the GcpVectoriser with the specified project ID, location, and model name. + """Initialises the GcpVectoriser with the specified project ID, location, and model name. Args: project_id (str): [optional] The Google Cloud project ID. Defaults to None. api_key (str): [optional] The API key for authenticating with the GenAI API. Defaults to None. - location (str): [optional] The location of the GenAI API. Defaults to None. + location (str): [optional] The location of the GenAI API. Defaults to "europe-west2". model_name (str): [optional] The name of the embedding model. Defaults to "text-embedding-004". task_type (str): [optional] The embedding task. Defaults to "CLASSIFICATION". See https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types @@ -61,7 +61,7 @@ def __init__( **client_kwargs: [optional] Additional keyword arguments to pass to the GenAI client. Raises: - `ConfigurationError`: If the GenAI client fails to initialize. + `ConfigurationError`: If the authentication arguments are invalid, or if the GenAI client fails to initialise. 
""" check_deps(["google-genai"], extra="gcp") from google import genai # type: ignore @@ -86,7 +86,7 @@ def __init__( ) except Exception as e: raise ConfigurationError( - "Failed to initialize GCP GenAI client.", + "Failed to initialise GCP GenAI client.", context={"vectoriser": "gcp", "cause": str(e), "cause_type": type(e).__name__}, ) from e diff --git a/src/classifai/vectorisers/huggingface.py b/src/classifai/vectorisers/huggingface.py index 5ac4d66..fb4fa84 100644 --- a/src/classifai/vectorisers/huggingface.py +++ b/src/classifai/vectorisers/huggingface.py @@ -13,7 +13,7 @@ class HuggingFaceVectoriser(VectoriserBase): The `HuggingFaceVectoriser` accepts most encoder-based models from the Huggingface Transformers library, and provides a simple interface to generate embeddings from text data. Additional configuration options, - such as `trust_remote` or a HuggingFaceAPI token can be passed via the `tokenizer_kwargs` and `model_kwargs` + such as `trust_remote_code` or a Hugging Face API token, can be passed via the `tokenizer_kwargs` and `model_kwargs` parameters. Attributes: @@ -21,8 +21,6 @@ class HuggingFaceVectoriser(VectoriserBase): tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the specified model. model (transformers.PreTrainedModel): The Huggingface model instance. device (torch.device): The device (CPU or GPU) on which the model is loaded. - tokenizer_kwargs (dict): Additional keyword arguments passed to the tokenizer. - model_kwargs (dict): Additional keyword arguments passed to the model. """ def __init__( @@ -33,7 +31,7 @@ def __init__( tokenizer_kwargs: dict | None = None, model_kwargs: dict | None = None, ): - """Initializes the HuggingfaceVectoriser with the specified model name and device. + """Initialises the HuggingFaceVectoriser with the specified model name and device. Args: model_name (str): The name of the Huggingface model to use. 
@@ -44,7 +42,7 @@ def __init__( Raises: `ExternalServiceError`: If the model or tokenizer cannot be loaded. - `ConfigurationError`: If the model cannot be initialized on the specified device. + `ConfigurationError`: If the model cannot be initialised on the specified device. """ check_deps(["transformers", "torch"], extra="huggingface") import torch # type: ignore @@ -85,7 +83,7 @@ def __init__( self.model.eval() except Exception as e: raise ConfigurationError( - "Failed to initialize model on device.", + "Failed to initialise model on device.", context={ "vectoriser": "huggingface", "model": model_name, diff --git a/src/classifai/vectorisers/ollama.py b/src/classifai/vectorisers/ollama.py index 4202c4a..09ff36a 100644 --- a/src/classifai/vectorisers/ollama.py +++ b/src/classifai/vectorisers/ollama.py @@ -9,7 +9,7 @@ class OllamaVectoriser(VectoriserBase): - """A wrapper class allowing a locally-running ollama server to generate text embeddings. + """A wrapper class allowing a locally-running Ollama server to generate text embeddings. The `OllamaVectoriser` interacts with a locally-running Ollama server, which must be set up by the user separately. @@ -22,23 +22,23 @@ class OllamaVectoriser(VectoriserBase): """ def __init__(self, model_name: str): - """Initializes the OllamaVectoriser with the specified model name and device. + """Initialises the OllamaVectoriser with the specified model name. Args: model_name (str): The name of the local model to use. Notes: - requires an ollama server to be running locally (`ollama serve`) + Requires an Ollama server to be running locally (`ollama serve`). """ check_deps(["ollama"], extra="ollama") self.model_name = model_name def transform(self, texts: str | list[str]) -> np.ndarray: - """Transforms input text(s) into embeddings using the Huggingface model. + """Transforms input text(s) into embeddings using the Ollama model. Args: - texts (str ,list [str]): The input text(s) to embed. 
Can be a single string or a list of strings. + texts (str | list[str]): The input text(s) to embed. Can be a single string or a list of strings. Returns: numpy.ndarray: A 2D array of embeddings, where each row corresponds to an input text. From e82cc8a96275f8c065a639baa01014c7e6c4377f Mon Sep 17 00:00:00 2001 From: Jamie Milsom Date: Wed, 24 Jun 2026 15:29:47 +0100 Subject: [PATCH 2/4] add newlines after docstring summaries --- src/classifai/vectorisers/__init__.py | 1 + src/classifai/vectorisers/base.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/classifai/vectorisers/__init__.py b/src/classifai/vectorisers/__init__.py index 0c588c1..a361e5d 100644 --- a/src/classifai/vectorisers/__init__.py +++ b/src/classifai/vectorisers/__init__.py @@ -1,5 +1,6 @@ # pylint: disable=C0301 """This module provides classes for creating and utilising embedding models from different services. + The Vectoriser module offers a unified interface to interact with various other ClassifAI Package Modules. Generally Vectorisers are used to convert text data into numerical embeddings that can be used for machine learning tasks. diff --git a/src/classifai/vectorisers/base.py b/src/classifai/vectorisers/base.py index b001134..ace54d7 100644 --- a/src/classifai/vectorisers/base.py +++ b/src/classifai/vectorisers/base.py @@ -1,4 +1,5 @@ """This module provides classes for creating and utilising embedding models from different services. + The Vectoriser module offers a unified interface to interact with various other ClassifAI Package Modules. Generally Vectorisers are used to convert text data into numerical embeddings that can be used for machine learning tasks. 
From 3a5020132b0527eab1be0c03ecc1e504778d6a9a Mon Sep 17 00:00:00 2001 From: Jamie Milsom Date: Thu, 25 Jun 2026 11:55:32 +0100 Subject: [PATCH 3/4] fix bullet point formatting for quarto --- src/classifai/vectorisers/__init__.py | 7 ++++--- src/classifai/vectorisers/base.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/classifai/vectorisers/__init__.py b/src/classifai/vectorisers/__init__.py index a361e5d..53f9eb3 100644 --- a/src/classifai/vectorisers/__init__.py +++ b/src/classifai/vectorisers/__init__.py @@ -10,9 +10,10 @@ # Vectoriser Overview In our Package, Vectorisers have a simple role: - - Take in text data (as a string or list of strings) - - Output numerical embeddings (as a numpy array) - - Each Vectoriser should provide a `transform` method to perform this conversion. + + - Take in text data (as a string or list of strings) + - Output numerical embeddings (as a numpy array) + - Each Vectoriser should provide a `transform` method to perform this conversion. It is possible for users to implement their own Vectoriser classes by inheriting from the `VectoriserBase` abstract base class and implementing the `transform` method. diff --git a/src/classifai/vectorisers/base.py b/src/classifai/vectorisers/base.py index ace54d7..db71ae1 100644 --- a/src/classifai/vectorisers/base.py +++ b/src/classifai/vectorisers/base.py @@ -9,10 +9,11 @@ # Vectoriser Overview In our Package, Vectorisers have a simple role: - - Take in text data (as a string or list of strings) - - Output numerical embeddings (as a numpy array) - - Each Vectoriser should provide a `transform` method to perform this conversion. + - Take in text data (as a string or list of strings) + - Output numerical embeddings (as a numpy array) + - Each Vectoriser should provide a `transform` method to perform this conversion. 
+ It is possible for users to implement their own Vectoriser classes by inheriting from the `VectoriserBase` abstract base class and implementing the `transform` method. From 4fda6b564b4403d00cb4d3562ec9a748fe31dc03 Mon Sep 17 00:00:00 2001 From: Jamie Milsom Date: Thu, 25 Jun 2026 17:16:36 +0100 Subject: [PATCH 4/4] fix ruff formatting mistake --- src/classifai/vectorisers/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/classifai/vectorisers/base.py b/src/classifai/vectorisers/base.py index db71ae1..085b966 100644 --- a/src/classifai/vectorisers/base.py +++ b/src/classifai/vectorisers/base.py @@ -13,7 +13,6 @@ - Take in text data (as a string or list of strings) - Output numerical embeddings (as a numpy array) - Each Vectoriser should provide a `transform` method to perform this conversion. - It is possible for users to implement their own Vectoriser classes by inheriting from the `VectoriserBase` abstract base class and implementing the `transform` method.