|
7 | 7 | import matplotlib.pyplot as plt |
8 | 8 | import seaborn as sns |
9 | 9 | from statsmodels.formula.api import ols |
| 10 | +from statsmodels.stats.outliers_influence import variance_inflation_factor |
| 11 | +import statsmodels.api as sm |
10 | 12 |
|
11 | 13 |
|
12 | 14 | def cov_to_cor(covariance_matrix) -> np.ndarray: |
@@ -486,3 +488,253 @@ def plot_coeff_vs_pvals_by_included(data, xlabels=None, xlim=(0, 1), xlab="P val |
486 | 488 | g.figure.suptitle(title) |
487 | 489 | if show: |
488 | 490 | plt.show() |
| 491 | + |
| 492 | + |
| 493 | +def vif(X, sorted=True) -> pd.DataFrame: |
| 494 | + """ |
| 495 | + Calculates the Variance Inflation Factor (VIF) for each feature in a DataFrame. |
| 496 | +
|
| 497 | + VIF measures the multicollinearity among independent variables within a regression model. |
| 498 | + High VIF values indicate high multicollinearity, which can cause issues with model |
| 499 | + interpretation and stability. |
| 500 | +
|
| 501 | + Args: |
| 502 | + X (pandas.DataFrame): A DataFrame containing the independent variables. |
| 503 | + sorted (bool): Whether to sort the output DataFrame by VIF values. |
| 504 | +
|
| 505 | + Returns: |
| 506 | + pandas.DataFrame: A DataFrame with two columns: |
| 507 | + - "feature": The name of the feature. |
| 508 | + - "VIF": The Variance Inflation Factor for the feature. |
| 509 | +
|
| 510 | + Examples: |
| 511 | + >>> from spotpython.utils.stats import vif |
| 512 | + >>> import pandas as pd |
| 513 | + >>> data = pd.DataFrame({ |
| 514 | + ... 'x1': [1, 2, 3, 4, 5], |
| 515 | + ... 'x2': [2, 4, 6, 8, 10], |
| 516 | + ... 'x3': [1, 3, 5, 7, 9] |
| 517 | + ... }) |
| 518 | + >>> vif(data) |
| 519 | + feature VIF |
| 520 | + 0 x1 1260.000000 |
| 521 | + 1 x2 0.000000 |
| 522 | + 2 x3 630.000000 |
| 523 | + """ |
| 524 | + vif_data = pd.DataFrame() |
| 525 | + vif_data["feature"] = X.columns |
| 526 | + vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])] |
| 527 | + if sorted: |
| 528 | + vif_data = vif_data.sort_values(by="VIF", ascending=False).reset_index(drop=True) |
| 529 | + return vif_data |
| 530 | + |
| 531 | + |
| 532 | +def condition_index(df) -> pd.DataFrame: |
| 533 | + """ |
| 534 | + Calculates the Condition Index for each feature in a DataFrame to assess multicollinearity. |
| 535 | +
|
| 536 | + The Condition Index is computed based on the eigenvalues of the covariance matrix |
| 537 | + of the standardized data. High condition indices suggest potential multicollinearity issues. |
| 538 | +
|
| 539 | + Args: |
| 540 | + df (pandas.DataFrame): A DataFrame containing the independent variables. |
| 541 | +
|
| 542 | + Returns: |
| 543 | + pandas.DataFrame: A DataFrame with the following columns: |
| 544 | + - 'Feature': The name of the feature. |
| 545 | + - 'Eigenvalue': The eigenvalue of the covariance matrix. |
| 546 | + - 'Condition Index': The Condition Index for the feature. |
| 547 | +
|
| 548 | + Examples: |
| 549 | + >>> from spotpython.utils.stats import condition_index |
| 550 | + >>> import pandas as pd |
| 551 | + >>> data = pd.DataFrame({ |
| 552 | + ... 'x1': [1, 2, 3, 4, 5], |
| 553 | + ... 'x2': [2, 4, 6, 8, 10], |
| 554 | + ... 'x3': [1, 3, 5, 7, 9] |
| 555 | + ... }) |
| 556 | + >>> condition_index(data) |
| 557 | + Feature Eigenvalue Condition Index |
| 558 | + 0 x1 1.140000 1.000000 |
| 559 | + 1 x2 0.000000 inf |
| 560 | + 2 x3 0.002857 20.000000 |
| 561 | + """ |
| 562 | + # Standardisieren der Daten |
| 563 | + X = df.values |
| 564 | + X_centered = X - np.mean(X, axis=0) |
| 565 | + |
| 566 | + # Berechnung der Kovarianzmatrix |
| 567 | + covariance_matrix = np.cov(X_centered, rowvar=False) |
| 568 | + |
| 569 | + # Berechnung der Eigenwerte der Kovarianzmatrix |
| 570 | + eigenvalues, _ = np.linalg.eigh(covariance_matrix) |
| 571 | + |
| 572 | + # Berechnung des Condition Index |
| 573 | + # Condition Index ist die Wurzel des Verhältnisses des größten Eigenwertes zum jeweiligen Eigenwert |
| 574 | + max_eigenvalue = max(eigenvalues) |
| 575 | + condition_indices = np.sqrt(max_eigenvalue / eigenvalues) |
| 576 | + |
| 577 | + # Erstellen eines DataFrames zur Anzeige der Ergebnisse |
| 578 | + condition_index_df = pd.DataFrame({"Feature": df.columns, "Eigenvalue": eigenvalues, "Condition Index": condition_indices}) |
| 579 | + |
| 580 | + return condition_index_df |
| 581 | + |
| 582 | + |
| 583 | +def compute_standardized_betas(model, X_encoded, y) -> pd.DataFrame: |
| 584 | + """ |
| 585 | + Computes standardized (beta) coefficients for a fitted statsmodels OLS model. |
| 586 | +
|
| 587 | + Args: |
| 588 | + model (statsmodels.regression.linear_model.RegressionResultsWrapper): The fitted OLS model. |
| 589 | + X_encoded (pandas.DataFrame): The design matrix of independent variables. |
| 590 | + y (pandas.Series): The dependent variable. |
| 591 | +
|
| 592 | + Returns: |
| 593 | + pandas.DataFrame: A DataFrame containing the standardized beta coefficients. |
| 594 | +
|
| 595 | + Examples: |
| 596 | + >>> from spotpython.utils.stats import compute_standardized_betas |
| 597 | + >>> import pandas as pd |
| 598 | + >>> import statsmodels.api as sm |
| 599 | + >>> data = pd.DataFrame({ |
| 600 | + ... 'x1': [1, 2, 3, 4, 5], |
| 601 | + ... 'x2': [2, 4, 6, 8, 10], |
| 602 | + ... 'x3': [1, 3, 5, 7, 9] |
| 603 | + ... }) |
| 604 | + >>> y = pd.Series([1, 2, 3, 4, 5]) |
| 605 | + >>> X = sm.add_constant(data) |
| 606 | + >>> model = sm.OLS(y, X).fit() |
| 607 | + >>> compute_standardized_betas(model, data, y) |
| 608 | + Variable Standardized Beta |
| 609 | + 0 const 0.000000 |
| 610 | + 1 x1 0.000000 |
| 611 | + 2 x2 0.000000 |
| 612 | + 3 x3 0.000000 |
| 613 | +
|
| 614 | + """ |
| 615 | + coeffs_unstd = model.params |
| 616 | + std_X = X_encoded.drop(columns=["const"], errors="ignore").std() |
| 617 | + std_y = y.std() |
| 618 | + beta_std = coeffs_unstd.drop("const", errors="ignore") * (std_X / std_y) |
| 619 | + beta_std_df = pd.DataFrame({"Variable": beta_std.index, "Standardized Beta": beta_std.values}) |
| 620 | + return beta_std_df |
| 621 | + |
| 622 | + |
| 623 | +def compute_coefficients_table(model, X_encoded, y, vif_table=None) -> pd.DataFrame: |
| 624 | + """ |
| 625 | + Compute a coefficients table containing: |
| 626 | + 1. Variable name |
| 627 | + 2. Zero-order correlation |
| 628 | + 3. Partial correlation |
| 629 | + 4. Semipartial (part) correlation |
| 630 | + 5. Tolerance (1 / VIF) |
| 631 | + 6. VIF |
| 632 | +
|
| 633 | + Args: |
| 634 | + model (statsmodels.regression.linear_model.RegressionResultsWrapper): |
| 635 | + A fitted OLS model from statsmodels. |
| 636 | + X_encoded (pd.DataFrame): |
| 637 | + The DataFrame used to fit the model, including 'const'. |
| 638 | + y (pd.Series): |
| 639 | + Dependent variable used in fitting the model. |
| 640 | + vif_table (pd.DataFrame): |
| 641 | + A DataFrame with columns ["feature", "VIF"] for each column in X_encoded |
| 642 | + (typ. from statsmodels.stats.outliers_influence.variance_inflation_factor). |
| 643 | + Default is None. |
| 644 | +
|
| 645 | + Returns: |
| 646 | + pd.DataFrame with columns: |
| 647 | + - "Variable" |
| 648 | + - "Zero-Order r" |
| 649 | + - "Partial r" |
| 650 | + - "Semipartial r" |
| 651 | + - "Tolerance" |
| 652 | + - "VIF" |
| 653 | +
|
| 654 | + Examples: |
| 655 | + >>> from spotpython.utils.stats import compute_coefficients_table |
| 656 | + >>> import pandas as pd |
| 657 | + >>> import statsmodels.api as sm |
| 658 | + >>> data = pd.DataFrame({ |
| 659 | + ... 'x1': [1, 2, 3, 4, 5], |
| 660 | + ... 'x2': [2, 4, 6, 8, 10], |
| 661 | + ... 'x3': [1, 3, 5, 7, 9] |
| 662 | + ... }) |
| 663 | + >>> y = pd.Series([1, 2, 3, 4, 5]) |
| 664 | + >>> X = sm.add_constant(data) |
| 665 | + >>> model = sm.OLS(y, X).fit() |
| 666 | + >>> vif_table = pd.DataFrame({ |
| 667 | + ... 'feature': ['x1', 'x2', 'x3'], |
| 668 | + ... 'VIF': [1, 2, 3] |
| 669 | + ... }) |
| 670 | + >>> compute_coefficients_table(model, data, y, vif_table) |
| 671 | + Variable Zero-Order r Partial r Semipartial r Tolerance VIF |
| 672 | + 0 x1 0.0 0.0 0.0 1.0 1.0 |
| 673 | + 1 x2 0.0 0.0 0.0 0.5 2.0 |
| 674 | + 2 x3 0.0 0.0 0.0 0.333333 3.0 |
| 675 | +
|
| 676 | + """ |
| 677 | + |
| 678 | + # Full-model R^2 and residual df |
| 679 | + r2_full = model.rsquared |
| 680 | + |
| 681 | + # We want to iterate over each predictor except the intercept |
| 682 | + predictors = [col for col in X_encoded.columns if col != "const"] |
| 683 | + |
| 684 | + results = [] |
| 685 | + |
| 686 | + for var in predictors: |
| 687 | + # ------------------------------------------------------------------- |
| 688 | + # 1) Zero-order correlation: Pearson correlation of var with y |
| 689 | + # ------------------------------------------------------------------- |
| 690 | + zero_order_r = X_encoded[var].corr(y) |
| 691 | + |
| 692 | + # ------------------------------------------------------------------- |
| 693 | + # 2) Partial Correlation & 3) Semipartial Correlation |
| 694 | + # We compare a 'full' model vs. a 'reduced' model (without var) |
| 695 | + # ------------------------------------------------------------------- |
| 696 | + X_reduced = X_encoded.drop(columns=[var]) |
| 697 | + reduced_model = sm.OLS(y, X_reduced).fit() |
| 698 | + r2_reduced = reduced_model.rsquared |
| 699 | + |
| 700 | + # The difference in R^2 contributed by this predictor |
| 701 | + delta_r2 = r2_full - r2_reduced |
| 702 | + |
| 703 | + # Determine sign from the unstandardized coefficient in the full model |
| 704 | + coeff_sign = np.sign(model.params.get(var, 0.0)) |
| 705 | + |
| 706 | + # If numeric issues occur (e.g., delta_r2 < 0), set correlations to NaN |
| 707 | + if delta_r2 <= 0.0 or (1 - r2_reduced) <= 0.0: |
| 708 | + partial_r = np.nan |
| 709 | + semipartial_r = np.nan |
| 710 | + else: |
| 711 | + # partial correlation |
| 712 | + # partial_r² = (R²_full - R²_reduced) / (1 - R²_reduced) |
| 713 | + partial_r = coeff_sign * np.sqrt(delta_r2 / (1 - r2_reduced)) |
| 714 | + |
| 715 | + # semipartial correlation (also called part correlation) |
| 716 | + # semipartial_r² = (R²_full - R²_reduced) |
| 717 | + # By definition, semipartial_r = sqrt( delta_r2 ), but we treat R² as a fraction |
| 718 | + # Because the base R² is SSR / TSS, so: |
| 719 | + semipartial_r = coeff_sign * np.sqrt(delta_r2) |
| 720 | + |
| 721 | + # ------------------------------------------------------------------- |
| 722 | + # 4) Tolerance & 5) VIF |
| 723 | + # ------------------------------------------------------------------- |
| 724 | + if vif_table is None: |
| 725 | + results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r}) |
| 726 | + else: |
| 727 | + # Get the VIF for this predictor |
| 728 | + vif_row = vif_table.loc[vif_table["feature"] == var, "VIF"] |
| 729 | + if len(vif_row) == 0: |
| 730 | + var_vif = np.nan |
| 731 | + else: |
| 732 | + var_vif = vif_row.iloc[0] |
| 733 | + if var_vif <= 0 or np.isnan(var_vif): |
| 734 | + tolerance = np.nan |
| 735 | + else: |
| 736 | + tolerance = 1.0 / var_vif |
| 737 | + # Collect results |
| 738 | + results.append({"Variable": var, "Zero-Order r": zero_order_r, "Partial r": partial_r, "Semipartial r": semipartial_r, "Tolerance": tolerance, "VIF": var_vif}) |
| 739 | + |
| 740 | + return pd.DataFrame(results) |
0 commit comments