22import pandas as pd
33
44
5- def cluster_features (df_vbdp ):
5+ def cluster_features (X ):
66 """Clusters the features of a dataframe based on similarity
77
88 Args:
9- df_vbdp (pd.DataFrame): dataframe with features
9+ X (pd.DataFrame): dataframe with features
1010 Returns:
11- df_vbdp (pd.DataFrame): dataframe with new features
11+ X (pd.DataFrame): dataframe with new features
1212 Examples:
1313 >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
1414 >>> df
@@ -22,24 +22,24 @@ def cluster_features(df_vbdp):
2222 1 False True False 1
2323 2 True False True 2
2424 """
25- c_0 = df_vbdp .columns [df_vbdp .columns .str .contains ("pain" )]
26- c_1 = df_vbdp .columns [df_vbdp .columns .str .contains ("inflammation" )]
27- c_2 = df_vbdp .columns [df_vbdp .columns .str .contains ("bleed" )]
28- c_3 = df_vbdp .columns [df_vbdp .columns .str .contains ("skin" )]
29- df_vbdp ["c_0" ] = df_vbdp [c_0 ].sum (axis = 1 )
30- df_vbdp ["c_1" ] = df_vbdp [c_1 ].sum (axis = 1 )
31- df_vbdp ["c_2" ] = df_vbdp [c_2 ].sum (axis = 1 )
32- df_vbdp ["c_3" ] = df_vbdp [c_3 ].sum (axis = 1 )
33- return df_vbdp
25+ c_0 = X .columns [X .columns .str .contains ("pain" )]
26+ c_1 = X .columns [X .columns .str .contains ("inflammation" )]
27+ c_2 = X .columns [X .columns .str .contains ("bleed" )]
28+ c_3 = X .columns [X .columns .str .contains ("skin" )]
29+ X ["c_0" ] = X [c_0 ].sum (axis = 1 )
30+ X ["c_1" ] = X [c_1 ].sum (axis = 1 )
31+ X ["c_2" ] = X [c_2 ].sum (axis = 1 )
32+ X ["c_3" ] = X [c_3 ].sum (axis = 1 )
33+ return X
3434
3535
36- def affinity_propagation_features (df_vbdp ):
36+ def affinity_propagation_features (X ):
3737 """Clusters the features of a dataframe using Affinity Propagation
3838
3939 Args:
40- df_vbdp (pd.DataFrame): dataframe with features
40+ X (pd.DataFrame): dataframe with features
4141 Returns:
42- df_vbdp (pd.DataFrame): dataframe with new features
42+ X (pd.DataFrame): dataframe with new features
4343 Examples:
4444 >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
4545 >>> df
@@ -56,22 +56,22 @@ def affinity_propagation_features(df_vbdp):
5656 from sklearn .cluster import AffinityPropagation
5757 from sklearn .metrics .pairwise import manhattan_distances
5858
59- X = manhattan_distances (df_vbdp )
60- af = AffinityPropagation (random_state = 0 , affinity = "precomputed" ).fit (X )
59+ D = manhattan_distances (X )
60+ af = AffinityPropagation (random_state = 0 , affinity = "precomputed" ).fit (D )
6161 cluster_centers_indices = af .cluster_centers_indices_
6262 n_clusters_ = len (cluster_centers_indices )
6363 print ("Estimated number of clusters: %d" % n_clusters_ )
64- df_vbdp ["cluster" ] = af .labels_
65- return df_vbdp
64+ X ["cluster" ] = af .labels_
65+ return X
6666
6767
68- def combine_features (df_vbdp ):
68+ def combine_features (X ):
6969 """Combines all features in a dataframe with each other using bitwise operations
7070
7171 Args:
72- df_vbdp (pd.DataFrame): dataframe with features
72+ X (pd.DataFrame): dataframe with features
7373 Returns:
74- df_vbdp (pd.DataFrame): dataframe with new features
74+ X (pd.DataFrame): dataframe with new features
7575 Examples:
7676 >>> df = pd.DataFrame({"a": [True, False, True], "b": [True, True, False], "c": [False, False, True]})
7777 >>> df
@@ -87,12 +87,78 @@ def combine_features(df_vbdp):
8787 """
8888 new_cols = []
8989 # Iterate over all pairs of columns
90- for col1 , col2 in itertools .combinations (df_vbdp .columns , 2 ):
90+ for col1 , col2 in itertools .combinations (X .columns , 2 ):
9191 # Create new columns for the bitwise AND, OR and XOR operations
92- and_col = df_vbdp [[col1 , col2 ]].apply (lambda x : x [col1 ] & x [col2 ], axis = 1 )
93- or_col = df_vbdp [[col1 , col2 ]].apply (lambda x : x [col1 ] | x [col2 ], axis = 1 )
94- xor_col = df_vbdp [[col1 , col2 ]].apply (lambda x : x [col1 ] ^ x [col2 ], axis = 1 )
92+ and_col = X [[col1 , col2 ]].apply (lambda x : x [col1 ] & x [col2 ], axis = 1 )
93+ or_col = X [[col1 , col2 ]].apply (lambda x : x [col1 ] | x [col2 ], axis = 1 )
94+ xor_col = X [[col1 , col2 ]].apply (lambda x : x [col1 ] ^ x [col2 ], axis = 1 )
9595 new_cols .extend ([and_col , or_col , xor_col ])
9696 # Join all the new columns at once
97- df_vbdp = pd .concat ([df_vbdp ] + new_cols , axis = 1 )
98- return df_vbdp
97+ X = pd .concat ([X ] + new_cols , axis = 1 )
98+ return X
99+
100+
def symptom_features(X, y):
    """Generate new features based on the joint symptoms of a disease.

    For every class in ``y`` the per-feature mean is computed; the two
    features with the highest mean and the two with the lowest mean are
    selected. Each selected pair is combined with bitwise AND, OR and XOR
    into new columns, and the enlarged feature frame is finally passed
    through ``combine_features``.

    Args:
        X (pd.DataFrame): dataframe with features. Columns are renamed by
            position to ``x1 .. xn`` and cast to ``int`` before the bitwise
            operations (presumably boolean / 0-1 symptom flags -- confirm
            against the caller).
        y (pd.Series): series with target values. It is joined to ``X``
            column-wise with ``pd.concat`` and used as the grouping key, so
            it must contribute exactly one column.
    Returns:
        tuple: ``(X_new, top_2_symptoms, bot_2_symptoms)``. ``X_new`` is the
            output of ``combine_features``; each dictionary maps a class
            label to ``{feature name: class mean}`` for that class's two
            highest-mean (top) or two lowest-mean (bot) features.
    Raises:
        IndexError: if fewer than two feature columns are available to rank.
    """
    # Combine X and y into one dataframe with columns x1, x2, ..., xn, y
    Xy = pd.concat([X, y], axis=1)
    Xy.columns = [f"x{i}" for i in range(1, X.shape[1] + 1)] + ["y"]
    # Mean of every feature within each class of y
    marginals = Xy.groupby("y").mean()

    top_2_symptoms = {}
    bot_2_symptoms = {}
    # Column pairs used for feature generation: top pair, then bottom pair,
    # for each class in the order the classes appear in ``marginals``
    pairs = []
    for position, name in enumerate(marginals.index):
        ranked = marginals.iloc[position].sort_values(ascending=False)
        names = ranked.index
        rates = ranked.to_numpy()
        top_1, top_2 = names[0], names[1]
        bot_1, bot_2 = names[-1], names[-2]
        top_2_symptoms[name] = {top_1: rates[0], top_2: rates[1]}
        bot_2_symptoms[name] = {bot_1: rates[-1], bot_2: rates[-2]}
        pairs.append((top_1, top_2))
        pairs.append((bot_1, bot_2))

    # Cast every feature column (everything except y) to int in one pass;
    # astype returns a new frame, so Xy itself is left untouched
    feature_names = Xy.columns.drop("y")
    Xy_mod = Xy.astype({col: "int" for col in feature_names})

    # Build each derived column once. Several classes can select the same
    # pair; keeping only the first occurrence of a name matches dropping
    # duplicated columns after the fact, without repeated concatenation.
    derived = {}
    for col1, col2 in pairs:
        left, right = Xy_mod[col1], Xy_mod[col2]
        candidates = (
            (f"{col1} _and_{col2} ", left & right),
            (f"{col1} _or_{col2} ", left | right),
            (f"{col1} _xor_{col2} ", left ^ right),
        )
        for label, values in candidates:
            derived.setdefault(label, values)
    if derived:
        Xy_mod = pd.concat([Xy_mod, pd.DataFrame(derived)], axis=1)

    print(f"Number of features: {Xy_mod.shape[1]} ")
    print(f"Number of samples: {Xy_mod.shape[0]} ")
    # remove the column y from the Xy_mod data frame
    X_mod = Xy_mod.drop(columns=["y"])
    # the printed column names still include y (taken from Xy_mod)
    print(f"Column names: {Xy_mod.columns.values} ")
    X_new = combine_features(X_mod)
    print(f"Number of features: {X_new.shape[1]} ")
    print(f"Number of samples: {X_new.shape[0]} ")
    return X_new, top_2_symptoms, bot_2_symptoms
0 commit comments