Skip to content

Commit 3532b3c

Browse files
stats
1 parent 3af7394 commit 3532b3c

3 files changed

Lines changed: 294 additions & 0 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ classifiers = [
2626
dependencies = [
2727
"build",
2828
"captum",
29+
"factor_analyzer",
2930
"forestci",
3031
"graphviz",
3132
"lightning>=2.0.0rc0",

src/spotpython/utils/misc.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from sklearn.ensemble import RandomForestRegressor
2+
from sklearn.linear_model import LinearRegression
3+
from sklearn.pipeline import Pipeline
4+
from sklearn.preprocessing import PolynomialFeatures
5+
from spotpython.surrogate.kriging import Kriging
6+
7+
8+
def get_regressor(name) -> object:
9+
"""
10+
Returns a scikit-learn regressor based on the given name.
11+
12+
Args:
13+
name (str): The name of the regressor.
14+
Supported names are: "linear", "polynomial", "random_forest", and "kriging".
15+
16+
Returns:
17+
object: A scikit-learn regressor object.
18+
19+
Raises:
20+
ValueError: If an unknown regressor name is provided.
21+
22+
Example:
23+
>>> from spotpython.utils.misc import get_regressor
24+
>>> regressor = get_regressor("linear")
25+
>>> print(type(regressor))
26+
<class 'sklearn.linear_model._base.LinearRegression'>
27+
"""
28+
if name == "linear":
29+
mdl = LinearRegression()
30+
elif name == "polynomial":
31+
degree_polyn = 2
32+
mdl = Pipeline([("poly", PolynomialFeatures(degree=degree_polyn)), ("linear", LinearRegression())])
33+
elif name == "random_forest":
34+
mdl = RandomForestRegressor()
35+
# elif name == "xgboost":
36+
# mdl = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
37+
elif name == "kriging":
38+
mdl = Kriging()
39+
else:
40+
raise ValueError(f"Unknown regressor {name}")
41+
return mdl

src/spotpython/utils/stats.py

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
import matplotlib.pyplot as plt
88
import seaborn as sns
99
from statsmodels.formula.api import ols
10+
from statsmodels.stats.outliers_influence import variance_inflation_factor
11+
import statsmodels.api as sm
1012

1113

1214
def cov_to_cor(covariance_matrix) -> np.ndarray:
@@ -486,3 +488,253 @@ def plot_coeff_vs_pvals_by_included(data, xlabels=None, xlim=(0, 1), xlab="P val
486488
g.figure.suptitle(title)
487489
if show:
488490
plt.show()
491+
492+
493+
def vif(X, sorted=True) -> pd.DataFrame:
494+
"""
495+
Calculates the Variance Inflation Factor (VIF) for each feature in a DataFrame.
496+
497+
VIF measures the multicollinearity among independent variables within a regression model.
498+
High VIF values indicate high multicollinearity, which can cause issues with model
499+
interpretation and stability.
500+
501+
Args:
502+
X (pandas.DataFrame): A DataFrame containing the independent variables.
503+
sorted (bool): Whether to sort the output DataFrame by VIF values.
504+
505+
Returns:
506+
pandas.DataFrame: A DataFrame with two columns:
507+
- "feature": The name of the feature.
508+
- "VIF": The Variance Inflation Factor for the feature.
509+
510+
Examples:
511+
>>> from spotpython.utils.stats import vif
512+
>>> import pandas as pd
513+
>>> data = pd.DataFrame({
514+
... 'x1': [1, 2, 3, 4, 5],
515+
... 'x2': [2, 4, 6, 8, 10],
516+
... 'x3': [1, 3, 5, 7, 9]
517+
... })
518+
>>> vif(data)
519+
feature VIF
520+
0 x1 1260.000000
521+
1 x2 0.000000
522+
2 x3 630.000000
523+
"""
524+
vif_data = pd.DataFrame()
525+
vif_data["feature"] = X.columns
526+
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
527+
if sorted:
528+
vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True)
529+
return vif_data
530+
531+
532+
def condition_index(df) -> pd.DataFrame:
533+
"""
534+
Calculates the Condition Index for each feature in a DataFrame to assess multicollinearity.
535+
536+
The Condition Index is computed based on the eigenvalues of the covariance matrix
537+
of the standardized data. High condition indices suggest potential multicollinearity issues.
538+
539+
Args:
540+
df (pandas.DataFrame): A DataFrame containing the independent variables.
541+
542+
Returns:
543+
pandas.DataFrame: A DataFrame with the following columns:
544+
- 'Feature': The name of the feature.
545+
- 'Eigenvalue': The eigenvalue of the covariance matrix.
546+
- 'Condition Index': The Condition Index for the feature.
547+
548+
Examples:
549+
>>> from spotpython.utils.stats import condition_index
550+
>>> import pandas as pd
551+
>>> data = pd.DataFrame({
552+
... 'x1': [1, 2, 3, 4, 5],
553+
... 'x2': [2, 4, 6, 8, 10],
554+
... 'x3': [1, 3, 5, 7, 9]
555+
... })
556+
>>> condition_index(data)
557+
Feature Eigenvalue Condition Index
558+
0 x1 1.140000 1.000000
559+
1 x2 0.000000 inf
560+
2 x3 0.002857 20.000000
561+
"""
562+
# Standardisieren der Daten
563+
X = df.values
564+
X_centered = X - np.mean(X, axis=0)
565+
566+
# Berechnung der Kovarianzmatrix
567+
covariance_matrix = np.cov(X_centered, rowvar=False)
568+
569+
# Berechnung der Eigenwerte der Kovarianzmatrix
570+
eigenvalues, _ = np.linalg.eigh(covariance_matrix)
571+
572+
# Berechnung des Condition Index
573+
# Condition Index ist die Wurzel des Verhältnisses des größten Eigenwertes zum jeweiligen Eigenwert
574+
max_eigenvalue = max(eigenvalues)
575+
condition_indices = np.sqrt(max_eigenvalue / eigenvalues)
576+
577+
# Erstellen eines DataFrames zur Anzeige der Ergebnisse
578+
condition_index_df = pd.DataFrame({"Feature": df.columns, "Eigenvalue": eigenvalues, "Condition Index": condition_indices})
579+
580+
return condition_index_df
581+
582+
583+
def compute_standardized_betas(model, X_encoded, y) -> pd.DataFrame:
584+
"""
585+
Computes standardized (beta) coefficients for a fitted statsmodels OLS model.
586+
587+
Args:
588+
model (statsmodels.regression.linear_model.RegressionResultsWrapper): The fitted OLS model.
589+
X_encoded (pandas.DataFrame): The design matrix of independent variables.
590+
y (pandas.Series): The dependent variable.
591+
592+
Returns:
593+
pandas.DataFrame: A DataFrame containing the standardized beta coefficients.
594+
595+
Examples:
596+
>>> from spotpython.utils.stats import compute_standardized_betas
597+
>>> import pandas as pd
598+
>>> import statsmodels.api as sm
599+
>>> data = pd.DataFrame({
600+
... 'x1': [1, 2, 3, 4, 5],
601+
... 'x2': [2, 4, 6, 8, 10],
602+
... 'x3': [1, 3, 5, 7, 9]
603+
... })
604+
>>> y = pd.Series([1, 2, 3, 4, 5])
605+
>>> X = sm.add_constant(data)
606+
>>> model = sm.OLS(y, X).fit()
607+
>>> compute_standardized_betas(model, data, y)
608+
Variable Standardized Beta
609+
0 const 0.000000
610+
1 x1 0.000000
611+
2 x2 0.000000
612+
3 x3 0.000000
613+
614+
"""
615+
coeffs_unstd = model.params
616+
std_X = X_encoded.drop(columns=["const"], errors="ignore").std()
617+
std_y = y.std()
618+
beta_std = coeffs_unstd.drop("const", errors="ignore") * (std_X / std_y)
619+
beta_std_df = pd.DataFrame({"Variable": beta_std.index, "Standardized Beta": beta_std.values})
620+
return beta_std_df
621+
622+
623+
def compute_coefficients_table(model, X_encoded, y, vif_table=None) -> pd.DataFrame:
624+
"""
625+
Compute a coefficients table containing:
626+
1. Variable name
627+
2. Zero-order correlation
628+
3. Partial correlation
629+
4. Semipartial (part) correlation
630+
5. Tolerance (1 / VIF)
631+
6. VIF
632+
633+
Args:
634+
model (statsmodels.regression.linear_model.RegressionResultsWrapper):
635+
A fitted OLS model from statsmodels.
636+
X_encoded (pd.DataFrame):
637+
The DataFrame used to fit the model, including 'const'.
638+
y (pd.Series):
639+
Dependent variable used in fitting the model.
640+
vif_table (pd.DataFrame):
641+
A DataFrame with columns ["feature", "VIF"] for each column in X_encoded
642+
(typ. from statsmodels.stats.outliers_influence.variance_inflation_factor).
643+
Default is None.
644+
645+
Returns:
646+
pd.DataFrame with columns:
647+
- "Variable"
648+
- "Zero-Order r"
649+
- "Partial r"
650+
- "Semipartial r"
651+
- "Tolerance"
652+
- "VIF"
653+
654+
Examples:
655+
>>> from spotpython.utils.stats import compute_coefficients_table
656+
>>> import pandas as pd
657+
>>> import statsmodels.api as sm
658+
>>> data = pd.DataFrame({
659+
... 'x1': [1, 2, 3, 4, 5],
660+
... 'x2': [2, 4, 6, 8, 10],
661+
... 'x3': [1, 3, 5, 7, 9]
662+
... })
663+
>>> y = pd.Series([1, 2, 3, 4, 5])
664+
>>> X = sm.add_constant(data)
665+
>>> model = sm.OLS(y, X).fit()
666+
>>> vif_table = pd.DataFrame({
667+
... 'feature': ['x1', 'x2', 'x3'],
668+
... 'VIF': [1, 2, 3]
669+
... })
670+
>>> compute_coefficients_table(model, data, y, vif_table)
671+
Variable Zero-Order r Partial r Semipartial r Tolerance VIF
672+
0 x1 0.0 0.0 0.0 1.0 1.0
673+
1 x2 0.0 0.0 0.0 0.5 2.0
674+
2 x3 0.0 0.0 0.0 0.333333 3.0
675+
676+
"""
677+
678+
# Full-model R^2 and residual df
679+
r2_full = model.rsquared
680+
681+
# We want to iterate over each predictor except the intercept
682+
predictors = [col for col in X_encoded.columns if col != "const"]
683+
684+
results = []
685+
686+
for var in predictors:
687+
# -------------------------------------------------------------------
688+
# 1) Zero-order correlation: Pearson correlation of var with y
689+
# -------------------------------------------------------------------
690+
zero_order_r = X_encoded[var].corr(y)
691+
692+
# -------------------------------------------------------------------
693+
# 2) Partial Correlation & 3) Semipartial Correlation
694+
# We compare a 'full' model vs. a 'reduced' model (without var)
695+
# -------------------------------------------------------------------
696+
X_reduced = X_encoded.drop(columns=[var])
697+
reduced_model = sm.OLS(y, X_reduced).fit()
698+
r2_reduced = reduced_model.rsquared
699+
700+
# The difference in R^2 contributed by this predictor
701+
delta_r2 = r2_full - r2_reduced
702+
703+
# Determine sign from the unstandardized coefficient in the full model
704+
coeff_sign = np.sign(model.params.get(var, 0.0))
705+
706+
# If numeric issues occur (e.g., delta_r2 < 0), set correlations to NaN
707+
if delta_r2 <= 0.0 or (1 - r2_reduced) <= 0.0:
708+
partial_r = np.nan
709+
semipartial_r = np.nan
710+
else:
711+
# partial correlation
712+
# partial_r² = (R²_full - R²_reduced) / (1 - R²_reduced)
713+
partial_r = coeff_sign * np.sqrt(delta_r2 / (1 - r2_reduced))
714+
715+
# semipartial correlation (also called part correlation)
716+
# semipartial_r² = (R²_full - R²_reduced)
717+
# By definition, semipartial_r = sqrt( delta_r2 ), but we treat R² as a fraction
718+
# Because the base R² is SSR / TSS, so:
719+
semipartial_r = coeff_sign * np.sqrt(delta_r2)
720+
721+
# -------------------------------------------------------------------
722+
# 4) Tolerance & 5) VIF
723+
# -------------------------------------------------------------------
724+
if vif_table is None:
725+
results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r})
726+
else:
727+
# Get the VIF for this predictor
728+
vif_row = vif_table.loc[vif_table["feature"] == var, "VIF"]
729+
if len(vif_row) == 0:
730+
var_vif = np.nan
731+
else:
732+
var_vif = vif_row.iloc[0]
733+
if var_vif <= 0 or np.isnan(var_vif):
734+
tolerance = np.nan
735+
else:
736+
tolerance = 1.0 / var_vif
737+
# Collect results
738+
results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r, "Tolerance": tolerance, "VIF": var_vif})
739+
740+
return pd.DataFrame(results)

0 commit comments

Comments
 (0)