From d690497f3d56aaf00b4e79027c4523e11ca74acd Mon Sep 17 00:00:00 2001
From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com>
Date: Fri, 6 Mar 2026 16:16:47 +0000
Subject: [PATCH 1/4] chore(indexers): updated vectorstore types

---
 .pre-commit-config.yaml        |  2 +-
 src/classifai/indexers/main.py | 59 +++++++++++++++++++---------------
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a1af8ed..ff354c6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,7 +48,7 @@ repos:
         name: deptry (uv)
         language: system
         pass_filenames: false # deptry expects a project path, not filenames
-        entry: uv run deptry .
+        entry: uv run deptry  --per-rule-ignores "DEP003=plum,DEP004=quartodoc|numpydoc" .
 
       - id: forbid-new-init
         name: Check if __init__.py is added to the src folder
diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py
index 76e2b91..1ecf724 100644
--- a/src/classifai/indexers/main.py
+++ b/src/classifai/indexers/main.py
@@ -32,6 +32,7 @@
 import shutil
 import time
 import uuid
+from typing import Literal
 
 import numpy as np
 import polars as pl
@@ -67,12 +68,12 @@ class VectorStore:
     """A class to model and create 'VectorStore' objects for building and searching vector databases from CSV text files.
 
     Attributes:
-        file_name (str): the original file with the knowledgebase to build the vector store
-        data_type (str): the data type of the original file (curently only csv supported)
-        vectoriser (object): A Vectoriser object from the corresponding ClassifAI Pacakge module
+        file_name (str | os.PathLike[str]): the original file with the knowledgebase to build the vector store
+        data_type (Literal["csv"]): the data type of the original file (currently only csv supported)
+        vectoriser (VectoriserBase): A Vectoriser object from the corresponding ClassifAI Package module
         batch_size (int): the batch size to pass to the vectoriser when embedding
         meta_data (dict): key-value pairs of metadata to extract from the input file and their correpsonding types
-        output_dir (str): the path to the output directory where the VectorStore will be saved
+        output_dir (str | os.PathLike[str]): the path to the output directory where the VectorStore will be saved
         vectors (np.array): a numpy array of vectors for the vector DB
         vector_shape (int): the dimension of the vectors
         num_vectors (int): how many vectors are in the vector store
@@ -82,12 +83,12 @@ class VectorStore:
 
     def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         self,
-        file_name,
-        data_type,
-        vectoriser,
-        batch_size=8,
-        meta_data=None,
-        output_dir=None,
+        file_name: str | os.PathLike[str],
+        data_type: Literal["csv"],
+        vectoriser: VectoriserBase,
+        batch_size: int = 8,
+        meta_data: dict | None = None,
+        output_dir: str | os.PathLike[str] | None = None,
         overwrite=False,
         hooks=None,
     ):
@@ -95,9 +96,9 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         vector embeddings.
 
         Args:
-            file_name (str): The name of the input CSV file.
+            file_name (str | os.PathLike): The name of the input CSV file.
             data_type (str): The type of input data (currently supports only "csv").
-            vectoriser (object): The vectoriser object used to transform text into
+            vectoriser (vectoriserBase): The vectoriser object used to transform text into
                                 vector embeddings.
             batch_size (int): [optional] The batch size for processing the input file and batching to
             vectoriser. Defaults to 8.
@@ -116,8 +117,10 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
             IndexBuildError: If there are failures during index building or saving outputs.
         """
         # ---- Input validation (caller mistakes) -> DataValidationError / ConfigurationError
-        if not isinstance(file_name, str) or not file_name.strip():
-            raise DataValidationError("file_name must be a non-empty string.", context={"file_name": file_name})
+        if not isinstance(file_name, (str, os.PathLike)) or not os.fspath(file_name).strip():
+            raise DataValidationError(
+                "file_name must be a non-empty string or os.PathLike.", context={"file_name": file_name}
+            )
 
         if not os.path.exists(file_name):
             raise DataValidationError("Input file does not exist.", context={"file_name": file_name})
@@ -146,17 +149,18 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
             raise DataValidationError("hooks must be a dict or None.", context={"hooks_type": type(hooks).__name__})
 
         # ---- Assign fields
+        ## these fields are all initialised from inputs
         self.file_name = file_name
         self.data_type = data_type
         self.vectoriser = vectoriser
         self.batch_size = batch_size
         self.meta_data = meta_data if meta_data is not None else {}
         self.output_dir = output_dir
-        self.vectors = None
-        self.vector_shape = None
-        self.num_vectors = None
-        self.vectoriser_class = vectoriser.__class__.__name__
         self.hooks = {} if hooks is None else hooks
+        self.vectoriser_class = vectoriser.__class__.__name__
+        ## these are all to be filled in from vectors creation
+        self.vector_shape: int | None = None
+        self.num_vectors: int | None = None
 
         # ---- Output directory handling (filesystem problems) -> ConfigurationError
         try:
@@ -182,7 +186,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
 
         # ---- Build index (wrap every unexpected failure) -> IndexBuildError
         try:
-            self._create_vector_store_index()
+            self._create_vector_store_index(os.fspath(self.file_name))
         except ClassifaiError:
             # preserve already-classified errors (e.g. vectoriser raised DataValidationError)
             raise
@@ -257,7 +261,7 @@ def _save_metadata(self, path: str):
                 context={"path": path, "metadata": metadata, "cause_type": type(e).__name__, "cause_message": str(e)},
             ) from e
 
-    def _create_vector_store_index(self):  # noqa: C901
+    def _create_vector_store_index(self, file_name: str):  # noqa: C901
         """Processes text strings in batches, generates vector embeddings, and creates the
         vector store.
         Called from the constructor once other metadata has been set.
@@ -265,6 +269,9 @@ def _create_vector_store_index(self):  # noqa: C901
         Creates a Polars DataFrame with the captured data and embeddings, and saves it as
         a Parquet file in the output_dir attribute, and stores in the vectors attribute.
 
+        Args:
+            file_name (str): The path of the CSV file to read in.
+
         Raises:
             DataValidationError: If there are issues reading or validating the input file.
             IndexBuildError: If there are failures during embedding or building the vectors table.
@@ -273,9 +280,9 @@ def _create_vector_store_index(self):  # noqa: C901
         try:
             if self.data_type == "csv":
                 self.vectors = pl.read_csv(
-                    self.file_name,
+                    file_name,
                     columns=["id", "text", *self.meta_data.keys()],
-                    dtypes=self.meta_data | {"id": str, "text": str},
+                    schema_overrides=self.meta_data | {"id": str, "text": str},
                 )
                 self.vectors = self.vectors.with_columns(
                     pl.Series("uuid", [str(uuid.uuid4()) for _ in range(self.vectors.height)])
@@ -705,7 +712,7 @@ def search(self, query: VectorStoreSearchInput, n_results=10, batch_size=8) -> V
         return result_df
 
     @classmethod
-    def from_filespace(cls, folder_path, vectoriser, hooks: dict | None = None):  # noqa: C901, PLR0912, PLR0915
+    def from_filespace(cls, folder_path: str | os.PathLike[str], vectoriser: VectoriserBase, hooks: dict | None = None):  # noqa: C901, PLR0912, PLR0915
         """Creates a `VectorStore` instance from stored metadata and Parquet files.
         This method reads the metadata and vectors from the specified folder,
         validates the contents, and initializes a `VectorStore` object with the
@@ -717,8 +724,8 @@ def from_filespace(cls, folder_path, vectoriser, hooks: dict | None = None):  #
         needing to reprocess the original text data.
 
         Args:
-            folder_path (str): The folder path containing the metadata and Parquet files.
-            vectoriser (object): The vectoriser object used to transform text into vector embeddings.
+            folder_path (str | os.PathLike): The folder path containing the metadata and Parquet files.
+            vectoriser (VectoriserBase): The vectoriser object used to transform text into vector embeddings.
             hooks (dict): [optional] A dictionary of user-defined hooks for preprocessing and postprocessing. Defaults to None.
 
         Returns:
@@ -730,7 +737,7 @@ def from_filespace(cls, folder_path, vectoriser, hooks: dict | None = None):  #
             IndexBuildError: If there are failures during loading or parsing the files.
         """
         # ---- Validate arguments (caller mistakes) -> DataValidationError / ConfigurationError
-        if not isinstance(folder_path, str) or not folder_path.strip():
+        if not isinstance(folder_path, (str, os.PathLike)) or not os.fspath(folder_path).strip():
             raise DataValidationError("folder_path must be a non-empty string.", context={"folder_path": folder_path})
 
         if not os.path.isdir(folder_path):

From 439fb4626d36a8243b414a5291566febd460f66f Mon Sep 17 00:00:00 2001
From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com>
Date: Fri, 6 Mar 2026 16:28:30 +0000
Subject: [PATCH 2/4] chore(servers): updated pydantic to use modern extras
 definition

---
 src/classifai/servers/pydantic_models.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/classifai/servers/pydantic_models.py b/src/classifai/servers/pydantic_models.py
index 3bb7b75..3f9523a 100644
--- a/src/classifai/servers/pydantic_models.py
+++ b/src/classifai/servers/pydantic_models.py
@@ -2,7 +2,7 @@
 """Pydantic Classes to model request and response data for ClassifAI FastAPI RESTful API."""
 
 import pandas as pd
-from pydantic import BaseModel, Extra, Field
+from pydantic import BaseModel, ConfigDict, Field
 
 
 class ClassifaiEntry(BaseModel):
@@ -36,12 +36,7 @@ class ResultEntry(BaseModel):
     score: float
     rank: int
 
-    class Config:  # pylint: disable=R0903
-        """Sub-class to permit additional extra metadata (e.g., metadata columns from vectorstore
-        construction).
-        """
-
-        extra = Extra.allow
+    model_config = ConfigDict(extra="allow")
 
 
 class ResultsList(BaseModel):
@@ -88,8 +83,7 @@ class RevResultEntry(BaseModel):
     label: str
     description: str
 
-    class Config:
-        extra = Extra.allow  # Allow extra keys (e.g., metadata columns)
+    model_config = ConfigDict(extra="allow")
 
 
 class RevResultsList(BaseModel):
@@ -177,7 +171,7 @@ def convert_dataframe_to_reverse_search_pydantic_response(df: pd.DataFrame, meta
         # Create a RevResultsList object for the current `id`
         results_list.append(
             RevResultsList(
-                input_id=input_id,
+                input_id=str(input_id),
                 response=response_entries,
             )
         )

From 2d654c42cadb88c71511f5fd76181173b56ed5ad Mon Sep 17 00:00:00 2001
From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com>
Date: Mon, 9 Mar 2026 11:40:37 +0000
Subject: [PATCH 3/4] chore: removed unneeded none init

---
 src/classifai/indexers/main.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py
index 1ecf724..1c4ab3a 100644
--- a/src/classifai/indexers/main.py
+++ b/src/classifai/indexers/main.py
@@ -158,9 +158,6 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         self.output_dir = output_dir
         self.hooks = {} if hooks is None else hooks
         self.vectoriser_class = vectoriser.__class__.__name__
-        ## these are all to be filled in from vectors creation
-        self.vector_shape: int | None = None
-        self.num_vectors: int | None = None
 
         # ---- Output directory handling (filesystem problems) -> ConfigurationError
         try:

From 0da106897294e5d52011da49623cf1dca51bdfec Mon Sep 17 00:00:00 2001
From: rileyok-ons <205916635+rileyok-ons@users.noreply.github.com>
Date: Mon, 9 Mar 2026 13:57:57 +0000
Subject: [PATCH 4/4] chore: updated misspelling

---
 src/classifai/indexers/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/classifai/indexers/main.py b/src/classifai/indexers/main.py
index 1c4ab3a..2e91dc5 100644
--- a/src/classifai/indexers/main.py
+++ b/src/classifai/indexers/main.py
@@ -98,7 +98,7 @@ def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
         Args:
             file_name (str | os.PathLike): The name of the input CSV file.
             data_type (str): The type of input data (currently supports only "csv").
-            vectoriser (vectoriserBase): The vectoriser object used to transform text into
+            vectoriser (VectoriserBase): The vectoriser object used to transform text into
                                 vector embeddings.
             batch_size (int): [optional] The batch size for processing the input file and batching to
             vectoriser. Defaults to 8.