Skip to content

Commit 03f996e

Browse files
v0.2.35
vbdp
1 parent f87b5a3 commit 03f996e

3 files changed

Lines changed: 143 additions & 29 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
77

88
[project]
99
name = "spotPython"
10-
version = "0.2.34"
10+
version = "0.2.35"
1111
authors = [
1212
{ name="T. Bartz-Beielstein", email="tbb@bartzundbartz.de" }
1313
]

src/spotPython/data/vbdp.py

Lines changed: 94 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
import pandas as pd
33

44

5-
def cluster_features(df_vbdp):
5+
def cluster_features(X):
66
"""Clusters the features of a dataframe based on similarity
77
88
Args:
9-
df_vbdp (pd.DataFrame): dataframe with features
9+
X (pd.DataFrame): dataframe with features
1010
Returns:
11-
df_vbdp (pd.DataFrame): dataframe with new features
11+
X (pd.DataFrame): dataframe with new features
1212
Examples:
1313
>>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
1414
>>> df
@@ -22,24 +22,24 @@ def cluster_features(df_vbdp):
2222
1 False True False 1
2323
2 True False True 2
2424
"""
25-
c_0 = df_vbdp.columns[df_vbdp.columns.str.contains("pain")]
26-
c_1 = df_vbdp.columns[df_vbdp.columns.str.contains("inflammation")]
27-
c_2 = df_vbdp.columns[df_vbdp.columns.str.contains("bleed")]
28-
c_3 = df_vbdp.columns[df_vbdp.columns.str.contains("skin")]
29-
df_vbdp["c_0"] = df_vbdp[c_0].sum(axis=1)
30-
df_vbdp["c_1"] = df_vbdp[c_1].sum(axis=1)
31-
df_vbdp["c_2"] = df_vbdp[c_2].sum(axis=1)
32-
df_vbdp["c_3"] = df_vbdp[c_3].sum(axis=1)
33-
return df_vbdp
25+
c_0 = X.columns[X.columns.str.contains("pain")]
26+
c_1 = X.columns[X.columns.str.contains("inflammation")]
27+
c_2 = X.columns[X.columns.str.contains("bleed")]
28+
c_3 = X.columns[X.columns.str.contains("skin")]
29+
X["c_0"] = X[c_0].sum(axis=1)
30+
X["c_1"] = X[c_1].sum(axis=1)
31+
X["c_2"] = X[c_2].sum(axis=1)
32+
X["c_3"] = X[c_3].sum(axis=1)
33+
return X
3434

3535

36-
def affinity_propagation_features(df_vbdp):
36+
def affinity_propagation_features(X):
3737
"""Clusters the features of a dataframe using Affinity Propagation
3838
3939
Args:
40-
df_vbdp (pd.DataFrame): dataframe with features
40+
X (pd.DataFrame): dataframe with features
4141
Returns:
42-
df_vbdp (pd.DataFrame): dataframe with new features
42+
X (pd.DataFrame): dataframe with new features
4343
Examples:
4444
>>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
4545
>>> df
@@ -56,22 +56,22 @@ def affinity_propagation_features(df_vbdp):
5656
from sklearn.cluster import AffinityPropagation
5757
from sklearn.metrics.pairwise import manhattan_distances
5858

59-
X = manhattan_distances(df_vbdp)
60-
af = AffinityPropagation(random_state=0, affinity="precomputed").fit(X)
59+
D = manhattan_distances(X)
60+
af = AffinityPropagation(random_state=0, affinity="precomputed").fit(D)
6161
cluster_centers_indices = af.cluster_centers_indices_
6262
n_clusters_ = len(cluster_centers_indices)
6363
print("Estimated number of clusters: %d" % n_clusters_)
64-
df_vbdp["cluster"] = af.labels_
65-
return df_vbdp
64+
X["cluster"] = af.labels_
65+
return X
6666

6767

68-
def combine_features(df_vbdp):
68+
def combine_features(X):
6969
"""Combines all features in a dataframe with each other using bitwise operations
7070
7171
Args:
72-
df_vbdp (pd.DataFrame): dataframe with features
72+
X (pd.DataFrame): dataframe with features
7373
Returns:
74-
df_vbdp (pd.DataFrame): dataframe with new features
74+
X (pd.DataFrame): dataframe with new features
7575
Examples:
7676
>>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
7777
>>> df
@@ -87,12 +87,78 @@ def combine_features(df_vbdp):
8787
"""
8888
new_cols = []
8989
# Iterate over all pairs of columns
90-
for col1, col2 in itertools.combinations(df_vbdp.columns, 2):
90+
for col1, col2 in itertools.combinations(X.columns, 2):
9191
# Create new columns for the bitwise AND, OR and XOR operations
92-
and_col = df_vbdp[[col1, col2]].apply(lambda x: x[col1] & x[col2], axis=1)
93-
or_col = df_vbdp[[col1, col2]].apply(lambda x: x[col1] | x[col2], axis=1)
94-
xor_col = df_vbdp[[col1, col2]].apply(lambda x: x[col1] ^ x[col2], axis=1)
92+
and_col = X[[col1, col2]].apply(lambda x: x[col1] & x[col2], axis=1)
93+
or_col = X[[col1, col2]].apply(lambda x: x[col1] | x[col2], axis=1)
94+
xor_col = X[[col1, col2]].apply(lambda x: x[col1] ^ x[col2], axis=1)
9595
new_cols.extend([and_col, or_col, xor_col])
9696
# Join all the new columns at once
97-
df_vbdp = pd.concat([df_vbdp] + new_cols, axis=1)
98-
return df_vbdp
97+
X = pd.concat([X] + new_cols, axis=1)
98+
return X
99+
100+
101+
def symptom_features(X, y):
    """Generate new features based on the joint symptoms of a disease.

    The feature columns are renamed to ``x1 .. xn`` and the target column to
    ``y``. For every distinct target value the per-feature means are
    computed; the two features with the highest mean and the two with the
    lowest mean are selected. For each selected pair, bitwise AND, OR and XOR
    columns are appended to the integer-cast features (duplicated column
    names are kept only once). The extended feature frame is finally passed
    through ``combine_features``.

    Progress information (numbers of features/samples, column names) is
    printed to stdout.

    Args:
        X (pd.DataFrame): dataframe with features. The values must be
            castable to ``int``, and at least two columns are needed to pick
            the top and bottom pairs.
        y (pd.Series): series with target values

    Returns:
        (tuple): tuple containing:
            X_new (pd.DataFrame): result of ``combine_features`` applied to
                the extended feature frame (without the ``y`` column).
            top_2_symptoms (dict): maps each target value to
                ``{feature name: mean}`` for its two highest-mean features.
            bot_2_symptoms (dict): maps each target value to
                ``{feature name: mean}`` for its two lowest-mean features.
    """
    # Combine X and y into one dataframe with the columns x1, x2, ..., xn, y
    Xy = pd.concat([X, y], axis=1)
    Xy.columns = [f"x{i}" for i in range(1, X.shape[1] + 1)] + ["y"]

    # Mean of every feature per target value
    marginals = Xy.groupby("y").mean()

    top_2_symptoms = {}
    bot_2_symptoms = {}
    # Column pairs used for feature generation: for each target value first
    # the top pair, then the bottom pair
    pairs = []
    for name, symptoms in marginals.iterrows():
        ranked = symptoms.sort_values(ascending=False)
        labels = ranked.keys()
        means = ranked.values
        top_1, top_2 = labels[0], labels[1]
        bot_1, bot_2 = labels[-1], labels[-2]
        top_2_symptoms[name] = {top_1: means[0], top_2: means[1]}
        bot_2_symptoms[name] = {bot_1: means[-1], bot_2: means[-2]}
        pairs.extend([(top_1, top_2), (bot_1, bot_2)])

    # Integer features are required for the bitwise operators below;
    # astype returns a new frame, so Xy itself is left untouched
    feature_cols = Xy.columns.drop("y")
    Xy_mod = Xy.astype({col: "int" for col in feature_cols})

    # The new column names never collide with the base names x1..xn, so all
    # derived frames can be computed first and joined in a single concat
    derived = [
        pd.DataFrame(
            {
                f"{col1}_and_{col2}": Xy_mod[col1] & Xy_mod[col2],
                f"{col1}_or_{col2}": Xy_mod[col1] | Xy_mod[col2],
                f"{col1}_xor_{col2}": Xy_mod[col1] ^ Xy_mod[col2],
            }
        )
        for col1, col2 in pairs
    ]
    Xy_mod = pd.concat([Xy_mod, *derived], axis=1)

    # The same pair can be selected for several target values:
    # keep only the first occurrence of every column name
    Xy_mod = Xy_mod.loc[:, ~Xy_mod.columns.duplicated()].copy()
    print(f"Number of features: {Xy_mod.shape[1]}")
    print(f"Number of samples: {Xy_mod.shape[0]}")

    # Remove the column y from the Xy_mod data frame
    X_mod = Xy_mod.drop(columns=["y"])
    print(f"Column names: {Xy_mod.columns.values}")

    X_new = combine_features(X_mod)
    print(f"Number of features: {X_new.shape[1]}")
    print(f"Number of samples: {X_new.shape[0]}")
    return X_new, top_2_symptoms, bot_2_symptoms

src/spotPython/utils/convert.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import importlib
22
import numpy as np
3+
import pandas as pd
4+
from itertools import combinations
35

46

57
def class_for_name(module_name, class_name) -> object:
@@ -67,3 +69,49 @@ def series_to_array(series):
6769
return series
6870
else:
6971
return series.to_numpy()
72+
73+
74+
def add_logical_columns(df, arity):
    """Adds logical columns to a DataFrame.

    For every combination of ``arity`` columns of ``df`` the element-wise
    AND, OR and XOR of these columns are computed. The new columns are named
    ``and_<cols>``, ``or_<cols>`` and ``xor_<cols>``, where ``<cols>`` are the
    column labels joined by ``_``. They are appended after the original
    columns: first all ``and_`` columns, then all ``or_`` columns, then all
    ``xor_`` columns. The input DataFrame is not modified.

    Args:
        df (pandas.DataFrame): The input DataFrame.
        arity (int): The arity of the logical columns, i.e., the number of
            columns combined in each new column. Must be at least 1. If it
            exceeds the number of columns, no columns are added.

    Returns:
        pandas.DataFrame: The output DataFrame.

    Raises:
        ValueError: If ``arity`` is smaller than 1.

    Example:
        >>> from spotPython.utils.convert import add_logical_columns
        >>> import pandas as pd
        >>> df = pd.DataFrame({'A': [True, False, True], 'B': [False, True, True], 'C': [True, True, False]})
        >>> result = add_logical_columns(df, 2)
        >>> list(result.columns)
        ['A', 'B', 'C', 'and_A_B', 'and_A_C', 'and_B_C', 'or_A_B', 'or_A_C', 'or_B_C', 'xor_A_B', 'xor_A_C', 'xor_B_C']
        >>> result["and_A_B"].tolist()
        [False, False, True]
        >>> result["xor_A_C"].tolist()
        [False, True, True]
    """
    if arity < 1:
        # combinations(..., 0) yields one empty tuple, which has no first column
        raise ValueError(f"arity must be at least 1, got {arity!r}")

    # Create a copy of the input DataFrame to avoid modifying it
    result = df.copy()

    # New columns keyed by name; a repeated name overwrites the earlier entry
    # but keeps its position, like repeated assignment to a DataFrame column
    and_cols = {}
    or_cols = {}
    xor_cols = {}

    # Apply AND, OR and XOR to all combinations of columns with the given arity
    for cols in combinations(df.columns, arity):
        # str() allows non-string column labels (e.g. integers)
        col_name = "_".join(map(str, cols))
        and_acc = or_acc = xor_acc = result[cols[0]]
        for col in cols[1:]:
            operand = result[col]
            and_acc = and_acc & operand
            or_acc = or_acc | operand
            xor_acc = xor_acc ^ operand
        and_cols[f"and_{col_name}"] = and_acc.rename(f"and_{col_name}")
        or_cols[f"or_{col_name}"] = or_acc.rename(f"or_{col_name}")
        xor_cols[f"xor_{col_name}"] = xor_acc.rename(f"xor_{col_name}")

    # Concatenate the input DataFrame with the additional columns in one step
    # instead of inserting the columns one by one
    return pd.concat(
        [result, *and_cols.values(), *or_cols.values(), *xor_cols.values()],
        axis=1,
    )

0 commit comments

Comments
 (0)