From d690497f3d56aaf00b4e79027c4523e11ca74acd Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Fri, 6 Mar 2026 16:16:47 +0000 Subject: [PATCH 1/4] chore(indexers): updated vectorstore types --- .pre-commit-config.yaml | 2 +- src/classifai/indexers/main.py | 59 +++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a1af8ed..ff354c6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,7 +48,7 @@ repos: name: deptry (uv) language: system pass_filenames: false # deptry expects a project path, not filenames - entry: uv run deptry . + entry: uv run deptry --per-rule-ignores "DEP003=plum,DEP004=quartodoc|numpydoc" . - id: forbid-new-init name: Check if __init__.py is added to the src folder diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 76e2b91..1ecf724 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -32,6 +32,7 @@ import shutil import time import uuid +from typing import Literal import numpy as np import polars as pl @@ -67,12 +68,12 @@ class VectorStore: """A class to model and create 'VectorStore' objects for building and searching vector databases from CSV text files. 
Attributes: - file_name (str): the original file with the knowledgebase to build the vector store - data_type (str): the data type of the original file (curently only csv supported) - vectoriser (object): A Vectoriser object from the corresponding ClassifAI Pacakge module + file_name (str | os.PathLike[str]): the original file with the knowledgebase to build the vector store + data_type (Literal["csv"]): the data type of the original file (currently only csv supported) + vectoriser (VectoriserBase): A Vectoriser object from the corresponding ClassifAI Package module batch_size (int): the batch size to pass to the vectoriser when embedding meta_data (dict): key-value pairs of metadata to extract from the input file and their correpsonding types - output_dir (str): the path to the output directory where the VectorStore will be saved + output_dir (str | os.PathLike[str]): the path to the output directory where the VectorStore will be saved vectors (np.array): a numpy array of vectors for the vector DB vector_shape (int): the dimension of the vectors num_vectors (int): how many vectors are in the vector store @@ -82,12 +83,12 @@ class VectorStore: def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self, - file_name, - data_type, - vectoriser, - batch_size=8, - meta_data=None, - output_dir=None, + file_name: str | os.PathLike[str], + data_type: Literal["csv"], + vectoriser: VectoriserBase, + batch_size: int = 8, + meta_data: dict | None = None, + output_dir: str | os.PathLike[str] | None = None, overwrite=False, hooks=None, ): @@ -95,9 +96,9 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 vector embeddings. Args: - file_name (str): The name of the input CSV file. + file_name (str | os.PathLike): The name of the input CSV file. data_type (str): The type of input data (currently supports only "csv").
- vectoriser (object): The vectoriser object used to transform text into + vectoriser (vectoriserBase): The vectoriser object used to transform text into vector embeddings. batch_size (int): [optional] The batch size for processing the input file and batching to vectoriser. Defaults to 8. @@ -116,8 +117,10 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 IndexBuildError: If there are failures during index building or saving outputs. """ # ---- Input validation (caller mistakes) -> DataValidationError / ConfigurationError - if not isinstance(file_name, str) or not file_name.strip(): - raise DataValidationError("file_name must be a non-empty string.", context={"file_name": file_name}) + if not isinstance(file_name, (str, os.PathLike)) or not os.fspath(file_name).strip(): + raise DataValidationError( + "file_name must be a non-empty string or os.PathLike.", context={"file_name": file_name} + ) if not os.path.exists(file_name): raise DataValidationError("Input file does not exist.", context={"file_name": file_name}) @@ -146,17 +149,18 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 raise DataValidationError("hooks must be a dict or None.", context={"hooks_type": type(hooks).__name__}) # ---- Assign fields + ## these fields are all initialised from inputs self.file_name = file_name self.data_type = data_type self.vectoriser = vectoriser self.batch_size = batch_size self.meta_data = meta_data if meta_data is not None else {} self.output_dir = output_dir - self.vectors = None - self.vector_shape = None - self.num_vectors = None - self.vectoriser_class = vectoriser.__class__.__name__ self.hooks = {} if hooks is None else hooks + self.vectoriser_class = vectoriser.__class__.__name__ + ## these are all to be filled in from vectors creation + self.vector_shape: int | None = None + self.num_vectors: int | None = None # ---- Output directory handling (filesystem problems) -> ConfigurationError try: @@ -182,7 +186,7 @@ def __init__( # noqa: C901, PLR0912,
PLR0913, PLR0915 # ---- Build index (wrap every unexpected failure) -> IndexBuildError try: - self._create_vector_store_index() + self._create_vector_store_index(os.fspath(self.file_name)) except ClassifaiError: # preserve already-classified errors (e.g. vectoriser raised DataValidationError) raise @@ -257,7 +261,7 @@ def _save_metadata(self, path: str): context={"path": path, "metadata": metadata, "cause_type": type(e).__name__, "cause_message": str(e)}, ) from e - def _create_vector_store_index(self): # noqa: C901 + def _create_vector_store_index(self, file_name: str): # noqa: C901 """Processes text strings in batches, generates vector embeddings, and creates the vector store. Called from the constructor once other metadata has been set. @@ -265,6 +269,9 @@ def _create_vector_store_index(self): # noqa: C901 Creates a Polars DataFrame with the captured data and embeddings, and saves it as a Parquet file in the output_dir attribute, and stores in the vectors attribute. + Args: + file_name (str): The filename of csv to read in + Raises: DataValidationError: If there are issues reading or validating the input file. IndexBuildError: If there are failures during embedding or building the vectors table. 
@@ -273,9 +280,9 @@ def _create_vector_store_index(self): # noqa: C901 try: if self.data_type == "csv": self.vectors = pl.read_csv( - self.file_name, + file_name, columns=["id", "text", *self.meta_data.keys()], - dtypes=self.meta_data | {"id": str, "text": str}, + schema_overrides=self.meta_data | {"id": str, "text": str}, ) self.vectors = self.vectors.with_columns( pl.Series("uuid", [str(uuid.uuid4()) for _ in range(self.vectors.height)]) @@ -705,7 +712,7 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V return result_df @classmethod - def from_filespace(cls, folder_path, vectoriser, hooks: dict | None = None): # noqa: C901, PLR0912, PLR0915 + def from_filespace(cls, folder_path: str | os.PathLike[str], vectoriser: VectoriserBase, hooks: dict | None = None): # noqa: C901, PLR0912, PLR0915 """Creates a `VectorStore` instance from stored metadata and Parquet files. This method reads the metadata and vectors from the specified folder, validates the contents, and initializes a `VectorStore` object with the @@ -717,8 +724,8 @@ def from_filespace(cls, folder_path, vectoriser, hooks: dict | None = None): # needing to reprocess the original text data. Args: - folder_path (str): The folder path containing the metadata and Parquet files. - vectoriser (object): The vectoriser object used to transform text into vector embeddings. + folder_path (str | os.PathLike): The folder path containing the metadata and Parquet files. + vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings. hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to None. Returns: @@ -730,7 +737,7 @@ def from_filespace(cls, folder_path, vectoriser, hooks: dict | None = None): # IndexBuildError: If there are failures during loading or parsing the files. 
""" # ---- Validate arguments (caller mistakes) -> DataValidationError / ConfigurationError - if not isinstance(folder_path, str) or not folder_path.strip(): + if not isinstance(folder_path, (str, os.PathLike)) or not os.fspath(folder_path).strip(): raise DataValidationError("folder_path must be a non-empty string.", context={"folder_path": folder_path}) if not os.path.isdir(folder_path): From 439fb4626d36a8243b414a5291566febd460f66f Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Fri, 6 Mar 2026 16:28:30 +0000 Subject: [PATCH 2/4] chore(servers): updated pydantic to use modern extras definition --- src/classifai/servers/pydantic_models.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/classifai/servers/pydantic_models.py b/src/classifai/servers/pydantic_models.py index 3bb7b75..3f9523a 100644 --- a/src/classifai/servers/pydantic_models.py +++ b/src/classifai/servers/pydantic_models.py @@ -2,7 +2,7 @@ """Pydantic Classes to model request and response data for ClassifAI FastAPI RESTful API.""" import pandas as pd -from pydantic import BaseModel, Extra, Field +from pydantic import BaseModel, ConfigDict, Field class ClassifaiEntry(BaseModel): @@ -36,12 +36,7 @@ class ResultEntry(BaseModel): score: float rank: int - class Config: # pylint: disable=R0903 - """Sub-class to permit additional extra metadata (e.g., metadata columns from vectorstore - construction). 
- """ - - extra = Extra.allow + model_config = ConfigDict(extra="allow") class ResultsList(BaseModel): @@ -88,8 +83,7 @@ class RevResultEntry(BaseModel): label: str description: str - class Config: - extra = Extra.allow # Allow extra keys (e.g., metadata columns) + model_config = ConfigDict(extra="allow") class RevResultsList(BaseModel): @@ -177,7 +171,7 @@ def convert_dataframe_to_reverse_search_pydantic_response(df: pd.DataFrame, meta # Create a RevResultsList object for the current `id` results_list.append( RevResultsList( - input_id=input_id, + input_id=str(input_id), response=response_entries, ) ) From 2d654c42cadb88c71511f5fd76181173b56ed5ad Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Mon, 9 Mar 2026 11:40:37 +0000 Subject: [PATCH 3/4] chore: removed unneeded none init --- src/classifai/indexers/main.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 1ecf724..1c4ab3a 100644 --- a/src/classifai/indexers/main.py +++ b/src/classifai/indexers/main.py @@ -158,9 +158,6 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 self.output_dir = output_dir self.hooks = {} if hooks is None else hooks self.vectoriser_class = vectoriser.__class__.__name__ - ## these are all to be filled in from vectors creation - self.vector_shape: int | None = None - self.num_vectors: int | None = None # ---- Output directory handling (filesystem problems) -> ConfigurationError try: From 0da106897294e5d52011da49623cf1dca51bdfec Mon Sep 17 00:00:00 2001 From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:57:57 +0000 Subject: [PATCH 4/4] chore: updated misspelling --- src/classifai/indexers/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py index 1c4ab3a..2e91dc5 100644 --- a/src/classifai/indexers/main.py +++ 
b/src/classifai/indexers/main.py @@ -98,7 +98,7 @@ def __init__( # noqa: C901, PLR0912, PLR0913, PLR0915 Args: file_name (str | os.PathLike): The name of the input CSV file. data_type (str): The type of input data (currently supports only "csv"). - vectoriser (vectoriserBase): The vectoriser object used to transform text into + vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings. batch_size (int): [optional] The batch size for processing the input file and batching to vectoriser. Defaults to 8.