diff --git a/src/classifai/vectorisers/__init__.py b/src/classifai/vectorisers/__init__.py index 8d6cb21..53f9eb3 100644 --- a/src/classifai/vectorisers/__init__.py +++ b/src/classifai/vectorisers/__init__.py @@ -1,5 +1,6 @@ # pylint: disable=C0301 -"""This module provides classes for creating and utilizing embedding models. +"""This module provides classes for creating and utilising embedding models from different services. + The Vectoriser module offers a unified interface to interact with various other ClassifAI Package Modules. Generally Vectorisers are used to convert text data into numerical embeddings that can be used for machine learning tasks. @@ -8,11 +9,11 @@ ########################### # Vectoriser Overview -In our Package, Vectoriser have a simple role: +In our Package, Vectorisers have a simple role: - * Take in text data (as a string or list of strings) - * Output numerical embeddings (as a numpy array) - * Each Vectortiser should provide a `transform` method to perform this conversion. + - Take in text data (as a string or list of strings) + - Output numerical embeddings (as a numpy array) + - Each Vectoriser should provide a `transform` method to perform this conversion. It is possible for users to implement their own Vectoriser classes by inheriting from the `VectoriserBase` abstract base class and implementing the `transform` method. @@ -22,15 +23,14 @@ ########################### # Implemented Vectorisers -We provide several robust implementations of Vectorisers that interface with popular services and libraries. - -The module contains the following 'ready-made' classes: +We provide several quick implementations of Vectorisers that interface with popular services and libraries. - * `GcpVectoriser`: A class for embedding text using Google Cloud Platform's GenAI API. - * `HuggingFaceVectoriser`: A general wrapper class for Huggingface Transformers - models to generate text embeddings. 
- * `OllamaVectoriser`: A general wrapper class for using a locally running ollama - server to generate text embeddings. +This module contains the following 'ready-made' classes: +- `GcpVectoriser`: A class for embedding text using Google Cloud Platform's GenAI API. +- `HuggingFaceVectoriser`: A general wrapper class for Huggingface Transformers +models to generate text embeddings. +- `OllamaVectoriser`: A general wrapper class for using a locally running Ollama +server to generate text embeddings. Each class is designed to interface with a specific service that provides embedding model functionality. @@ -39,7 +39,7 @@ The `HuggingFaceVectoriser` class utilizes models from the Huggingface Transformers library. -The `OllamaVectoriser` class can use any local/downloaded model which can be served by ollama. +The `OllamaVectoriser` class can use any local/downloaded model which can be served by Ollama. These classes abstract the underlying implementation details, providing a simple and consistent interface for embedding text using different services. diff --git a/src/classifai/vectorisers/base.py b/src/classifai/vectorisers/base.py index f721901..085b966 100644 --- a/src/classifai/vectorisers/base.py +++ b/src/classifai/vectorisers/base.py @@ -1,4 +1,5 @@ -"""This module provides classes for creating and utilizing embedding models from different services. +"""This module provides classes for creating and utilising embedding models from different services. + The Vectoriser module offers a unified interface to interact with various other ClassifAI Package Modules. Generally Vectorisers are used to convert text data into numerical embeddings that can be used for machine learning tasks. 
@@ -7,11 +8,11 @@ ########################### # Vectoriser Overview -In our Package, Vectoriser have a simple role: - - Take in text data (as a string or list of strings) - - Output numerical embeddings (as a numpy array) - - Each Vectortiser should provide a `transform` method to perform this conversion. +In our Package, Vectorisers have a simple role: + - Take in text data (as a string or list of strings) + - Output numerical embeddings (as a numpy array) + - Each Vectoriser should provide a `transform` method to perform this conversion. It is possible for users to implement their own Vectoriser classes by inheriting from the `VectoriserBase` abstract base class and implementing the `transform` method. @@ -22,11 +23,11 @@ We provide several quick implementations of Vectorisers that interface with popular services and libraries. -The module contains the following 'ready-made' classes: +This module contains the following 'ready-made' classes: - `GcpVectoriser`: A class for embedding text using Google Cloud Platform's GenAI API. - `HuggingFaceVectoriser`: A general wrapper class for Huggingface Transformers models to generate text embeddings. -- `OllamaVectoriser`: A general wrapper class for using a locally running ollama +- `OllamaVectoriser`: A general wrapper class for using a locally running Ollama server to generate text embeddings. Each class is designed to interface with a specific service that provides embedding model @@ -36,7 +37,7 @@ The `HuggingFaceVectoriser` class utilizes models from the Huggingface Transformers library. -The `OllamaVectoriser` class can use any local/downloaded model which can be served by ollama. +The `OllamaVectoriser` class can use any local/downloaded model which can be served by Ollama. These classes abstract the underlying implementation details, providing a simple and consistent interface for embedding text using different services. 
diff --git a/src/classifai/vectorisers/gcp.py b/src/classifai/vectorisers/gcp.py index 10f55d3..f086f08 100644 --- a/src/classifai/vectorisers/gcp.py +++ b/src/classifai/vectorisers/gcp.py @@ -48,12 +48,12 @@ def __init__( task_type="CLASSIFICATION", **client_kwargs, ): - """Initializes the GcpVectoriser with the specified project ID, location, and model name. + """Initialises the GcpVectoriser with the specified project ID, location, and model name. Args: project_id (str): [optional] The Google Cloud project ID. Defaults to None. api_key (str): [optional] The API key for authenticating with the GenAI API. Defaults to None. - location (str): [optional] The location of the GenAI API. Defaults to None. + location (str): [optional] The location of the GenAI API. Defaults to "europe-west2". model_name (str): [optional] The name of the embedding model. Defaults to "text-embedding-004". task_type (str): [optional] The embedding task. Defaults to "CLASSIFICATION". See https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/task-types @@ -61,7 +61,7 @@ def __init__( **client_kwargs: [optional] Additional keyword arguments to pass to the GenAI client. Raises: - `ConfigurationError`: If the GenAI client fails to initialize. + `ConfigurationError`: If the authentication arguments are invalid, or if the GenAI client fails to initialise. 
""" check_deps(["google-genai"], extra="gcp") from google import genai # type: ignore @@ -86,7 +86,7 @@ def __init__( ) except Exception as e: raise ConfigurationError( - "Failed to initialize GCP GenAI client.", + "Failed to initialise GCP GenAI client.", context={"vectoriser": "gcp", "cause": str(e), "cause_type": type(e).__name__}, ) from e diff --git a/src/classifai/vectorisers/huggingface.py b/src/classifai/vectorisers/huggingface.py index 5ac4d66..fb4fa84 100644 --- a/src/classifai/vectorisers/huggingface.py +++ b/src/classifai/vectorisers/huggingface.py @@ -13,7 +13,7 @@ class HuggingFaceVectoriser(VectoriserBase): The `HuggingFaceVectoriser` accepts most encoder-based models from the Huggingface Transformers library, and provides a simple interface to generate embeddings from text data. Additional configuration options, - such as `trust_remote` or a HuggingFaceAPI token can be passed via the `tokenizer_kwargs` and `model_kwargs` + such as `trust_remote_code` or a HuggingFaceAPI token can be passed via the `tokenizer_kwargs` and `model_kwargs` parameters. Attributes: @@ -21,8 +21,6 @@ class HuggingFaceVectoriser(VectoriserBase): tokenizer (transformers.PreTrainedTokenizer): The tokenizer for the specified model. model (transformers.PreTrainedModel): The Huggingface model instance. device (torch.device): The device (CPU or GPU) on which the model is loaded. - tokenizer_kwargs (dict): Additional keyword arguments passed to the tokenizer. - model_kwargs (dict): Additional keyword arguments passed to the model. """ def __init__( @@ -33,7 +31,7 @@ def __init__( tokenizer_kwargs: dict | None = None, model_kwargs: dict | None = None, ): - """Initializes the HuggingfaceVectoriser with the specified model name and device. + """Initialises the HuggingfaceVectoriser with the specified model name and device. Args: model_name (str): The name of the Huggingface model to use. 
@@ -44,7 +42,7 @@ def __init__( Raises: `ExternalServiceError`: If the model or tokenizer cannot be loaded. - `ConfigurationError`: If the model cannot be initialized on the specified device. + `ConfigurationError`: If the model cannot be initialised on the specified device. """ check_deps(["transformers", "torch"], extra="huggingface") import torch # type: ignore @@ -85,7 +83,7 @@ def __init__( self.model.eval() except Exception as e: raise ConfigurationError( - "Failed to initialize model on device.", + "Failed to initialise model on device.", context={ "vectoriser": "huggingface", "model": model_name, diff --git a/src/classifai/vectorisers/ollama.py b/src/classifai/vectorisers/ollama.py index 4202c4a..09ff36a 100644 --- a/src/classifai/vectorisers/ollama.py +++ b/src/classifai/vectorisers/ollama.py @@ -9,7 +9,7 @@ class OllamaVectoriser(VectoriserBase): - """A wrapper class allowing a locally-running ollama server to generate text embeddings. + """A wrapper class allowing a locally-running Ollama server to generate text embeddings. The `OllamaVectoriser` interacts with a locally-running Ollama server, which must be set up by the user separately. @@ -22,23 +22,23 @@ class OllamaVectoriser(VectoriserBase): """ def __init__(self, model_name: str): - """Initializes the OllamaVectoriser with the specified model name and device. + """Initialises the OllamaVectoriser with the specified model name. Args: model_name (str): The name of the local model to use. Notes: - requires an ollama server to be running locally (`ollama serve`) + Requires an Ollama server to be running locally (`ollama serve`) """ check_deps(["ollama"], extra="ollama") self.model_name = model_name def transform(self, texts: str | list[str]) -> np.ndarray: - """Transforms input text(s) into embeddings using the Huggingface model. + """Transforms input text(s) into embeddings using the Ollama model. Args: - texts (str ,list [str]): The input text(s) to embed. 
Can be a single string or a list of strings. + texts (str | list[str]): The input text(s) to embed. Can be a single string or a list of strings. Returns: numpy.ndarray: A 2D array of embeddings, where each row corresponds to an input text.