v0.2.35

bartzbeielstein · bartzbeielstein · commit 03f996e857f4 · 2023-06-17T21:24:41.000+02:00
vbdp
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "spotPython"
-version = "0.2.34"
+version = "0.2.35"
 authors = [
   { name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
 ]
diff --git a/src/spotPython/data/vbdp.py b/src/spotPython/data/vbdp.py
@@ -2,13 +2,13 @@
 import pandas as pd
 
 
-def cluster_features(df_vbdp):
+def cluster_features(X):
     """Clusters the features of a dataframe based on similarity
 
     Args:
-        df_vbdp (pd.DataFrame): dataframe with features
+        X (pd.DataFrame): dataframe with features
     Returns:
-        df_vbdp (pd.DataFrame): dataframe with new features
+        X (pd.DataFrame): dataframe with new features
     Examples:
         >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
         >>> df
@@ -22,24 +22,24 @@ def cluster_features(df_vbdp):
         1 False   True  False       1
         2  True  False   True        2
     """
-    c_0 = df_vbdp.columns[df_vbdp.columns.str.contains("pain")]
-    c_1 = df_vbdp.columns[df_vbdp.columns.str.contains("inflammation")]
-    c_2 = df_vbdp.columns[df_vbdp.columns.str.contains("bleed")]
-    c_3 = df_vbdp.columns[df_vbdp.columns.str.contains("skin")]
-    df_vbdp["c_0"] = df_vbdp[c_0].sum(axis=1)
-    df_vbdp["c_1"] = df_vbdp[c_1].sum(axis=1)
-    df_vbdp["c_2"] = df_vbdp[c_2].sum(axis=1)
-    df_vbdp["c_3"] = df_vbdp[c_3].sum(axis=1)
-    return df_vbdp
+    c_0 = X.columns[X.columns.str.contains("pain")]
+    c_1 = X.columns[X.columns.str.contains("inflammation")]
+    c_2 = X.columns[X.columns.str.contains("bleed")]
+    c_3 = X.columns[X.columns.str.contains("skin")]
+    X["c_0"] = X[c_0].sum(axis=1)
+    X["c_1"] = X[c_1].sum(axis=1)
+    X["c_2"] = X[c_2].sum(axis=1)
+    X["c_3"] = X[c_3].sum(axis=1)
+    return X
 
 
-def affinity_propagation_features(df_vbdp):
+def affinity_propagation_features(X):
     """Clusters the features of a dataframe using Affinity Propagation
 
     Args:
-        df_vbdp (pd.DataFrame): dataframe with features
+        X (pd.DataFrame): dataframe with features
     Returns:
-        df_vbdp (pd.DataFrame): dataframe with new features
+        X (pd.DataFrame): dataframe with new features
     Examples:
         >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
         >>> df
@@ -56,22 +56,22 @@ def affinity_propagation_features(df_vbdp):
     from sklearn.cluster import AffinityPropagation
     from sklearn.metrics.pairwise import manhattan_distances
 
-    X = manhattan_distances(df_vbdp)
-    af = AffinityPropagation(random_state=0, affinity="precomputed").fit(X)
+    D = manhattan_distances(X)
+    af = AffinityPropagation(random_state=0, affinity="precomputed").fit(D)
     cluster_centers_indices = af.cluster_centers_indices_
     n_clusters_ = len(cluster_centers_indices)
     print("Estimated number of clusters: %d" % n_clusters_)
-    df_vbdp["cluster"] = af.labels_
-    return df_vbdp
+    X["cluster"] = af.labels_
+    return X
 
 
-def combine_features(df_vbdp):
+def combine_features(X):
     """Combines all features in a dataframe with each other using bitwise operations
 
     Args:
-        df_vbdp (pd.DataFrame): dataframe with features
+        X (pd.DataFrame): dataframe with features
     Returns:
-        df_vbdp (pd.DataFrame): dataframe with new features
+        X (pd.DataFrame): dataframe with new features
         Examples:
             >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
             >>> df
@@ -87,12 +87,78 @@ def combine_features(df_vbdp):
     """
     new_cols = []
     # Iterate over all pairs of columns
-    for col1, col2 in itertools.combinations(df_vbdp.columns, 2):
+    for col1, col2 in itertools.combinations(X.columns, 2):
         # Create new columns for the bitwise AND, OR and XOR operations
-        and_col = df_vbdp[[col1, col2]].apply(lambda x: x[col1] & x[col2], axis=1)
-        or_col = df_vbdp[[col1, col2]].apply(lambda x: x[col1] | x[col2], axis=1)
-        xor_col = df_vbdp[[col1, col2]].apply(lambda x: x[col1] ^ x[col2], axis=1)
+        and_col = X[[col1, col2]].apply(lambda x: x[col1] & x[col2], axis=1)
+        or_col = X[[col1, col2]].apply(lambda x: x[col1] | x[col2], axis=1)
+        xor_col = X[[col1, col2]].apply(lambda x: x[col1] ^ x[col2], axis=1)
         new_cols.extend([and_col, or_col, xor_col])
     # Join all the new columns at once
-    df_vbdp = pd.concat([df_vbdp] + new_cols, axis=1)
-    return df_vbdp
+    X = pd.concat([X] + new_cols, axis=1)
+    return X
+
+
+def symptom_features(X, y):
+    """Generate new features based on the joint symptoms of a disease.
+
+    For every target value in ``y`` the mean of each feature is computed.
+    The two features with the highest mean and the two with the lowest mean
+    are selected per target value, and each selected pair is combined with
+    bitwise AND, OR and XOR. The extended feature frame is finally passed
+    through ``combine_features``.
+
+    Args:
+        X (pd.DataFrame): dataframe with features. The columns are renamed
+            internally to ``x1, x2, ..., xn``; the values must be convertible
+            to int and at least two columns are required.
+        y (pd.Series): series with target values
+    Returns:
+        tuple: ``(X_new, top_2_symptoms, bot_2_symptoms)``. ``X_new`` is the
+            dataframe returned by ``combine_features``. The two dictionaries
+            map each target value to ``{feature name: mean}`` for its two
+            highest-mean and its two lowest-mean features, respectively.
+    """
+    # Combine X and y into one dataframe with columns x1, x2, ..., xn, y
+    feature_names = ["x" + str(i) for i in range(1, X.shape[1] + 1)]
+    Xy = pd.concat([X, y], axis=1)
+    Xy.columns = feature_names + ["y"]
+    # Mean of every feature per target value
+    marginals = Xy.groupby("y").mean()
+    top_2_symptoms = {}
+    bot_2_symptoms = {}
+    # Pairs of feature names used for feature generation
+    symptom_pairs = []
+    for name, symptoms in marginals.iterrows():
+        # "ranked" instead of "sorted" so the builtin is not shadowed
+        ranked = symptoms.sort_values(ascending=False)
+        top_1, top_2 = ranked.index[0], ranked.index[1]
+        bot_1, bot_2 = ranked.index[-1], ranked.index[-2]
+        top_2_symptoms[name] = {top_1: ranked.values[0], top_2: ranked.values[1]}
+        bot_2_symptoms[name] = {bot_1: ranked.values[-1], bot_2: ranked.values[-2]}
+        symptom_pairs.append(((top_1, top_2), (bot_1, bot_2)))
+    # Integer copy of the features so that the bitwise operators are defined
+    Xy_mod = Xy.astype({col: "int" for col in feature_names})
+    for group in symptom_pairs:
+        for col1, col2 in group:
+            new_columns = pd.DataFrame(
+                {
+                    f"{col1}_and_{col2}": Xy_mod[col1] & Xy_mod[col2],
+                    f"{col1}_or_{col2}": Xy_mod[col1] | Xy_mod[col2],
+                    f"{col1}_xor_{col2}": Xy_mod[col1] ^ Xy_mod[col2],
+                }
+            )
+            Xy_mod = pd.concat([Xy_mod, new_columns], axis=1)
+    # The same pair can be selected for several target values: keep only the
+    # first occurrence of every column name
+    Xy_mod = Xy_mod.loc[:, ~Xy_mod.columns.duplicated()].copy()
+    print(f"Number of features: {Xy_mod.shape[1]}")
+    print(f"Number of samples: {Xy_mod.shape[0]}")
+    # remove the column y from the Xy_mod data frame
+    X_mod = Xy_mod.drop(columns=["y"])
+    # The printed names are taken from Xy_mod and therefore still include "y"
+    print(f"Column names: {Xy_mod.columns.values}")
+    X_new = combine_features(X_mod)
+    print(f"Number of features: {X_new.shape[1]}")
+    print(f"Number of samples: {X_new.shape[0]}")
+    return X_new, top_2_symptoms, bot_2_symptoms
diff --git a/src/spotPython/utils/convert.py b/src/spotPython/utils/convert.py
@@ -1,5 +1,7 @@
 import importlib
 import numpy as np
+import pandas as pd
+from itertools import combinations
 
 
 def class_for_name(module_name, class_name) -> object:
@@ -67,3 +69,49 @@ def series_to_array(series):
         return series
     else:
         return series.to_numpy()
+
+
+def add_logical_columns(df, arity):
+    """Adds logical columns to a DataFrame.
+
+    For every combination of ``arity`` columns, the columns are reduced with
+    the bitwise operators ``&``, ``|`` and ``^``. The new columns are appended
+    grouped by operator: first all ``and_*`` columns, then all ``or_*``
+    columns, then all ``xor_*`` columns.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame. It is not modified.
+        arity (int): The arity of the logical columns, i.e. the number of
+            input columns combined into each new column. Must be at least 1.
+    Returns:
+        pandas.DataFrame: A copy of the input DataFrame with the additional
+            columns named ``and_<cols>``, ``or_<cols>`` and ``xor_<cols>``,
+            where ``<cols>`` are the combined column labels joined by ``_``.
+    Raises:
+        ValueError: If ``arity`` is smaller than 1.
+    Example:
+        >>> from spotPython.utils.convert import add_logical_columns
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({'A': [True, False, True], 'B': [False, True, True], 'C': [True, True, False]})
+        >>> result = add_logical_columns(df, 2)
+        >>> list(result.columns)
+        ['A', 'B', 'C', 'and_A_B', 'and_A_C', 'and_B_C', 'or_A_B', 'or_A_C', 'or_B_C', 'xor_A_B', 'xor_A_C', 'xor_B_C']
+        >>> result["and_B_C"].tolist()
+        [False, True, False]
+        >>> result["xor_B_C"].tolist()
+        [True, False, True]
+    """
+    if arity < 1:
+        raise ValueError("arity must be at least 1")
+
+    # Create a copy of the input DataFrame to avoid modifying it
+    result = df.copy()
+
+    # Collect the new columns per operator and build the frames once at the
+    # end instead of inserting into a DataFrame column by column
+    and_cols = {}
+    or_cols = {}
+    xor_cols = {}
+
+    # Reduce every combination of columns with the specified arity
+    for cols in combinations(df.columns, arity):
+        # str() so that non-string column labels (e.g. integers) can be joined
+        col_name = "_".join(str(col) for col in cols)
+        and_acc = or_acc = xor_acc = result[cols[0]]
+        for col in cols[1:]:
+            and_acc = and_acc & result[col]
+            or_acc = or_acc | result[col]
+            xor_acc = xor_acc ^ result[col]
+        and_cols[f"and_{col_name}"] = and_acc
+        or_cols[f"or_{col_name}"] = or_acc
+        xor_cols[f"xor_{col_name}"] = xor_acc
+
+    # Concatenate the input DataFrame with the additional columns
+    extra = [pd.DataFrame(new_cols, index=df.index) for new_cols in (and_cols, or_cols, xor_cols)]
+    return pd.concat([result, *extra], axis=1)

Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"`
`7`	`7`
`8`	`8`	`[project]`
`9`	`9`	`name = "spotPython"`
`10`		`-version = "0.2.34"`
	`10`	`+version = "0.2.35"`
`11`	`11`	`authors = [`
`12`	`12`	`{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }`
`13`	`13`	`]`